summary refs log tree commit diff
diff options
context:
space:
mode:
author	Pavan Deolasee	2014-09-01 13:07:30 +0000
committer	Pavan Deolasee	2014-09-01 13:07:30 +0000
commit8642a0b6269c9d7212a968441266ebc64e90ded8 (patch)
tree1fdea16f55446329c613431af6c4e836a9348afb
parentfd159b3983473599768ca36ed8e4b8bfa1ed1969 (diff)
The Postgres-XL functionality includes MPP parallelism with
data-node-to-data-node communication, more stringent security, and other performance enhancements. Please see the release notes. Key contributors are: Andrei Martsinchyk, Nikhil Sontakke, Mason Sharp
-rw-r--r--.gitignore5
-rw-r--r--COPYRIGHT420
-rw-r--r--LICENSE.txt373
-rw-r--r--README22
-rwxr-xr-xconfigure240
-rw-r--r--configure.in2
-rw-r--r--contrib/Makefile3
-rw-r--r--contrib/pgxc_ctl/pgxc_ctl_bash_conf_part0
-rw-r--r--contrib/stormstats/Makefile15
-rw-r--r--contrib/stormstats/stormstats--1.0.sql17
-rw-r--r--contrib/stormstats/stormstats--unpackaged--1.0.sql5
-rw-r--r--contrib/stormstats/stormstats.c898
-rw-r--r--contrib/stormstats/stormstats.control5
-rw-r--r--contrib/stormstats/stormstats.h9
-rw-r--r--src/backend/Makefile13
-rw-r--r--src/backend/access/common/heaptuple.c107
-rw-r--r--src/backend/access/common/printtup.c13
-rw-r--r--src/backend/access/transam/clog.c16
-rw-r--r--src/backend/access/transam/gtm.c163
-rw-r--r--src/backend/access/transam/varsup.c69
-rw-r--r--src/backend/access/transam/xact.c174
-rw-r--r--src/backend/bootstrap/bootstrap.c5
-rw-r--r--src/backend/catalog/Makefile1
-rw-r--r--src/backend/catalog/catalog.c10
-rw-r--r--src/backend/catalog/dependency.c9
-rw-r--r--src/backend/catalog/genbki.pl2
-rw-r--r--src/backend/catalog/heap.c7
-rw-r--r--src/backend/catalog/namespace.c214
-rw-r--r--src/backend/catalog/pg_aggregate.c44
-rw-r--r--src/backend/catalog/pg_proc.c8
-rw-r--r--src/backend/catalog/storage.c10
-rw-r--r--src/backend/catalog/storm_catalog.sql307
-rw-r--r--src/backend/commands/aggregatecmds.c54
-rw-r--r--src/backend/commands/analyze.c468
-rw-r--r--src/backend/commands/copy.c121
-rw-r--r--src/backend/commands/dbcommands.c3
-rw-r--r--src/backend/commands/explain.c124
-rw-r--r--src/backend/commands/indexcmds.c87
-rw-r--r--src/backend/commands/portalcmds.c100
-rw-r--r--src/backend/commands/prepare.c6
-rw-r--r--src/backend/commands/schemacmds.c12
-rw-r--r--src/backend/commands/sequence.c158
-rw-r--r--src/backend/commands/tablecmds.c174
-rw-r--r--src/backend/commands/trigger.c278
-rw-r--r--src/backend/commands/vacuum.c302
-rw-r--r--src/backend/commands/variable.c128
-rw-r--r--src/backend/commands/view.c15
-rw-r--r--src/backend/executor/Makefile2
-rw-r--r--src/backend/executor/execAmi.c11
-rw-r--r--src/backend/executor/execCurrent.c7
-rw-r--r--src/backend/executor/execMain.c67
-rw-r--r--src/backend/executor/execProcnode.c94
-rw-r--r--src/backend/executor/execTuples.c301
-rw-r--r--src/backend/executor/execUtils.c9
-rw-r--r--src/backend/executor/functions.c18
-rw-r--r--src/backend/executor/nodeAgg.c171
-rw-r--r--src/backend/executor/nodeModifyTable.c213
-rw-r--r--src/backend/executor/nodeSort.c46
-rw-r--r--src/backend/executor/nodeSubplan.c10
-rw-r--r--src/backend/executor/nodeWindowAgg.c29
-rw-r--r--src/backend/executor/producerReceiver.c290
-rw-r--r--src/backend/libpq/be-fsstubs.c105
-rw-r--r--src/backend/libpq/hba.c93
-rw-r--r--src/backend/main/main.c1
-rw-r--r--src/backend/nodes/copyfuncs.c135
-rw-r--r--src/backend/nodes/equalfuncs.c46
-rw-r--r--src/backend/nodes/outfuncs.c1052
-rw-r--r--src/backend/nodes/readfuncs.c2113
-rw-r--r--src/backend/optimizer/path/Makefile3
-rw-r--r--src/backend/optimizer/path/allpaths.c85
-rw-r--r--src/backend/optimizer/path/costsize.c64
-rw-r--r--src/backend/optimizer/path/joinpath.c11
-rw-r--r--src/backend/optimizer/plan/Makefile2
-rw-r--r--src/backend/optimizer/plan/createplan.c3646
-rw-r--r--src/backend/optimizer/plan/planagg.c15
-rw-r--r--src/backend/optimizer/plan/planner.c547
-rw-r--r--src/backend/optimizer/plan/setrefs.c468
-rw-r--r--src/backend/optimizer/plan/subselect.c42
-rw-r--r--src/backend/optimizer/prep/preptlist.c126
-rw-r--r--src/backend/optimizer/prep/prepunion.c15
-rw-r--r--src/backend/optimizer/util/Makefile2
-rw-r--r--src/backend/optimizer/util/pathnode.c1263
-rw-r--r--src/backend/optimizer/util/plancat.c31
-rw-r--r--src/backend/parser/analyze.c131
-rw-r--r--src/backend/parser/gram.y44
-rw-r--r--src/backend/parser/parse_agg.c61
-rw-r--r--src/backend/parser/parse_relation.c106
-rw-r--r--src/backend/parser/parse_utilcmd.c965
-rw-r--r--src/backend/pgxc/Makefile2
-rw-r--r--src/backend/pgxc/cluster/Makefile17
-rw-r--r--src/backend/pgxc/cluster/pause.c480
-rw-r--r--src/backend/pgxc/cluster/stormutils.c46
-rw-r--r--src/backend/pgxc/copy/remotecopy.c55
-rw-r--r--src/backend/pgxc/locator/locator.c1335
-rw-r--r--src/backend/pgxc/locator/redistrib.c125
-rw-r--r--src/backend/pgxc/nodemgr/nodemgr.c23
-rw-r--r--src/backend/pgxc/plan/Makefile19
-rw-r--r--src/backend/pgxc/plan/planner.c2282
-rw-r--r--src/backend/pgxc/pool/Makefile2
-rw-r--r--src/backend/pgxc/pool/execRemote.c5309
-rw-r--r--src/backend/pgxc/pool/pgxcnode.c827
-rw-r--r--src/backend/pgxc/pool/poolmgr.c601
-rw-r--r--src/backend/pgxc/pool/poolutils.c57
-rw-r--r--src/backend/pgxc/pool/postgresql_fdw.c132
-rw-r--r--src/backend/pgxc/squeue/Makefile19
-rw-r--r--src/backend/pgxc/squeue/squeue.c1509
-rw-r--r--src/backend/postmaster/autovacuum.c19
-rw-r--r--src/backend/postmaster/pgstat.c71
-rw-r--r--src/backend/postmaster/postmaster.c133
-rw-r--r--src/backend/rewrite/rewriteHandler.c7
-rw-r--r--src/backend/storage/buffer/bufmgr.c9
-rw-r--r--src/backend/storage/ipc/ipci.c27
-rw-r--r--src/backend/storage/ipc/procarray.c485
-rw-r--r--src/backend/storage/ipc/procsignal.c5
-rw-r--r--src/backend/storage/lmgr/lock.c78
-rw-r--r--src/backend/storage/lmgr/lwlock.c17
-rw-r--r--src/backend/storage/lmgr/proc.c13
-rw-r--r--src/backend/tcop/dest.c13
-rw-r--r--src/backend/tcop/postgres.c416
-rw-r--r--src/backend/tcop/pquery.c762
-rw-r--r--src/backend/tcop/utility.c347
-rw-r--r--src/backend/utils/adt/arrayfuncs.c41
-rw-r--r--src/backend/utils/adt/date.c16
-rw-r--r--src/backend/utils/adt/dbsize.c74
-rw-r--r--src/backend/utils/adt/lockfuncs.c127
-rw-r--r--src/backend/utils/adt/pseudotypes.c70
-rw-r--r--src/backend/utils/adt/ri_triggers.c16
-rw-r--r--src/backend/utils/adt/ruleutils.c334
-rw-r--r--src/backend/utils/adt/version.c7
-rw-r--r--src/backend/utils/cache/inval.c19
-rw-r--r--src/backend/utils/cache/lsyscache.c230
-rw-r--r--src/backend/utils/cache/plancache.c128
-rw-r--r--src/backend/utils/cache/relcache.c24
-rw-r--r--src/backend/utils/errcodes.txt1
-rw-r--r--src/backend/utils/init/globals.c13
-rw-r--r--src/backend/utils/init/miscinit.c122
-rw-r--r--src/backend/utils/init/postinit.c11
-rw-r--r--src/backend/utils/misc/guc.c384
-rw-r--r--src/backend/utils/misc/postgresql.conf.sample25
-rw-r--r--src/backend/utils/mmgr/portalmem.c107
-rw-r--r--src/backend/utils/sort/tuplesort.c208
-rw-r--r--src/backend/utils/sort/tuplestore.c356
-rw-r--r--src/backend/utils/time/combocid.c37
-rw-r--r--src/backend/utils/time/snapmgr.c9
-rw-r--r--src/bin/Makefile2
-rw-r--r--src/bin/initdb/initdb.c97
-rw-r--r--src/bin/initgtm/initgtm.c8
-rw-r--r--src/bin/pg_basebackup/streamutil.c2
-rw-r--r--src/bin/pg_ctl/pg_ctl.c18
-rw-r--r--src/bin/pg_dump/pg_dump.c165
-rw-r--r--src/bin/pg_dump/pg_dump.h1
-rw-r--r--src/bin/pg_dump/pg_dumpall.c105
-rw-r--r--src/bin/pg_resetxlog/po/sv.po463
-rw-r--r--src/bin/psql/command.c4
-rw-r--r--src/bin/psql/startup.c4
-rw-r--r--src/bin/psql/tab-complete.c15
-rw-r--r--src/gtm/Makefile35
-rw-r--r--src/gtm/client/Makefile18
-rw-r--r--src/gtm/client/fe-connect.c38
-rw-r--r--src/gtm/client/fe-protocol.c47
-rw-r--r--src/gtm/client/gtm_client.c300
-rw-r--r--src/gtm/client/strlcpy.c72
-rw-r--r--src/gtm/common/.gitignore1
-rw-r--r--src/gtm/common/Makefile37
-rw-r--r--src/gtm/common/gtm_opt_handler.c3509
-rw-r--r--src/gtm/common/gtm_opt_scanner.l92
-rw-r--r--src/gtm/common/gtm_serialize.c147
-rw-r--r--src/gtm/common/gtm_utils.c2
-rw-r--r--src/gtm/gtm_ctl/.gitignore1
-rw-r--r--src/gtm/gtm_ctl/Makefile34
-rw-r--r--src/gtm/gtm_ctl/gtm_ctl.c1317
-rw-r--r--src/gtm/libpq/Makefile14
-rw-r--r--src/gtm/libpq/pqcomm.c6
-rw-r--r--src/gtm/main/Makefile39
-rw-r--r--src/gtm/main/gtm_opt.c2
-rw-r--r--src/gtm/main/gtm_seq.c745
-rw-r--r--src/gtm/main/gtm_snap.c6
-rw-r--r--src/gtm/main/gtm_standby.c115
-rw-r--r--src/gtm/main/gtm_thread.c12
-rw-r--r--src/gtm/main/gtm_txn.c208
-rw-r--r--src/gtm/main/main.c94
-rw-r--r--src/gtm/path/Makefile15
-rw-r--r--src/gtm/proxy/Makefile39
-rw-r--r--src/gtm/proxy/gtm_proxy_opt.c4
-rw-r--r--src/gtm/proxy/proxy_main.c128
-rw-r--r--src/gtm/proxy/proxy_thread.c2
-rw-r--r--src/gtm/recovery/Makefile16
-rw-r--r--src/gtm/recovery/register_common.c120
-rw-r--r--src/gtm/recovery/register_gtm.c597
-rw-r--r--src/gtm/recovery/replication.c129
-rw-r--r--src/include/Makefile3
-rw-r--r--src/include/access/gtm.h13
-rw-r--r--src/include/access/htup.h21
-rw-r--r--src/include/access/transam.h9
-rw-r--r--src/include/access/xact.h7
-rw-r--r--src/include/bootstrap/bootstrap.h3
-rw-r--r--src/include/catalog/catalog.h10
-rw-r--r--src/include/catalog/namespace.h8
-rw-r--r--src/include/catalog/pg_aggregate.h264
-rw-r--r--src/include/catalog/pg_namespace.h10
-rw-r--r--src/include/catalog/pg_proc.h15
-rw-r--r--src/include/catalog/pg_type.h8
-rw-r--r--src/include/commands/sequence.h9
-rw-r--r--src/include/commands/tablecmds.h4
-rw-r--r--src/include/commands/trigger.h5
-rw-r--r--src/include/commands/vacuum.h9
-rw-r--r--src/include/commands/variable.h9
-rw-r--r--src/include/executor/execdesc.h16
-rw-r--r--src/include/executor/executor.h12
-rw-r--r--src/include/executor/producerReceiver.h33
-rw-r--r--src/include/executor/tuptable.h24
-rw-r--r--src/include/gtm/gtm.h9
-rw-r--r--src/include/gtm/gtm_c.h2
-rw-r--r--src/include/gtm/gtm_client.h38
-rw-r--r--src/include/gtm/gtm_msg.h13
-rw-r--r--src/include/gtm/gtm_seq.h38
-rw-r--r--src/include/gtm/gtm_serialize.h6
-rw-r--r--src/include/gtm/gtm_standby.h5
-rw-r--r--src/include/gtm/gtm_txn.h6
-rw-r--r--src/include/gtm/register.h25
-rw-r--r--src/include/libpq/hba.h3
-rw-r--r--src/include/miscadmin.h10
-rw-r--r--src/include/nodes/execnodes.h20
-rw-r--r--src/include/nodes/nodes.h31
-rw-r--r--src/include/nodes/params.h8
-rw-r--r--src/include/nodes/parsenodes.h24
-rw-r--r--src/include/nodes/plannodes.h45
-rw-r--r--src/include/nodes/primnodes.h7
-rw-r--r--src/include/nodes/relation.h87
-rw-r--r--src/include/optimizer/cost.h23
-rw-r--r--src/include/optimizer/pathnode.h17
-rw-r--r--src/include/optimizer/paths.h13
-rw-r--r--src/include/optimizer/planmain.h37
-rw-r--r--src/include/optimizer/planner.h4
-rw-r--r--src/include/parser/analyze.h8
-rw-r--r--src/include/parser/kwlist.h6
-rw-r--r--src/include/parser/parse_agg.h14
-rw-r--r--src/include/parser/parse_utilcmd.h12
-rw-r--r--src/include/pg_config.h.win328
-rw-r--r--src/include/pgstat.h10
-rw-r--r--src/include/pgxc/execRemote.h241
-rw-r--r--src/include/pgxc/locator.h153
-rw-r--r--src/include/pgxc/pause.h38
-rw-r--r--src/include/pgxc/pgxc.h23
-rw-r--r--src/include/pgxc/pgxcnode.h59
-rw-r--r--src/include/pgxc/planner.h236
-rw-r--r--src/include/pgxc/poolmgr.h38
-rw-r--r--src/include/pgxc/postgresql_fdw.h23
-rw-r--r--src/include/pgxc/remotecopy.h11
-rw-r--r--src/include/pgxc/squeue.h60
-rw-r--r--src/include/storage/backendid.h20
-rw-r--r--src/include/storage/lwlock.h8
-rw-r--r--src/include/storage/proc.h11
-rw-r--r--src/include/storage/procarray.h10
-rw-r--r--src/include/storage/procsignal.h5
-rw-r--r--src/include/storage/smgr.h11
-rw-r--r--src/include/tcop/dest.h9
-rw-r--r--src/include/tcop/pquery.h10
-rw-r--r--src/include/tcop/utility.h4
-rw-r--r--src/include/utils/builtins.h19
-rw-r--r--src/include/utils/guc.h9
-rw-r--r--src/include/utils/lsyscache.h19
-rw-r--r--src/include/utils/plancache.h9
-rw-r--r--src/include/utils/portal.h19
-rw-r--r--src/include/utils/rel.h54
-rw-r--r--src/include/utils/tuplesort.h9
-rw-r--r--src/include/utils/tuplestore.h15
-rw-r--r--src/pl/plperl/expected/plperl_lc.out23
-rw-r--r--src/pl/plperl/expected/plperl_lc_1.out31
-rw-r--r--src/pl/plperl/sql/plperl_lc.sql16
-rw-r--r--src/pl/plpgsql/src/pl_exec.c11
271 files changed, 42562 insertions, 3745 deletions
diff --git a/.gitignore b/.gitignore
index 1e15ce5fc1..689ac5bee9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,7 @@ lcov.info
win32ver.rc
*.exe
lib*dll.def
+*~
# Local excludes in root directory
/GNUmakefile
@@ -30,3 +31,7 @@ lib*dll.def
/pgsql.sln.cache
/Debug/
/Release/
+/StormDB*
+/cscope*
+/.gitignore
+
diff --git a/COPYRIGHT b/COPYRIGHT
index 345ee6ec3b..fafaa1e836 100644
--- a/COPYRIGHT
+++ b/COPYRIGHT
@@ -1,47 +1,381 @@
-Postgres-XC Cluster Database Management System
+Postgres-XL Cluster Database Management System
-Portions Copyright (c) 2010-2012, Postgres-XC Development Group
-Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+Portions Copyright (c) 2012-2014, TransLattice, Inc.
+Portions Copyright (c) 2010-2013, Postgres-XC Development Group
+Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
Portions Copyright (c) 1994, The Regents of the University of California
-Permission to use, copy, modify, and distribute this software and its
-documentation for any purpose, without fee, and without a written agreement
-is hereby granted, provided that the above copyright notice and this
-paragraph and the following two paragraphs appear in all copies.
-
-IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
-DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
-LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS
-DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGE.
-
-THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
-INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
-AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
-ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO
-PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
-
-IN NO EVENT SHALL POSTGRESQL GLOBAL DEVELOPMENT GROUP BE LIABLE TO ANY
-PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL
-DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS
-SOFTWARE AND ITS DOCUMENTATION, EVEN IF POSTGRESQL GLOBAL DEVELOPMENT
-GROUP HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-POSTGRESQL GLOBAL DEVELOPMENT GROUP SPECIFICALLY DISCLAIMS ANY WARRANTIES,
-INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
-AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
-ON AN "AS IS" BASIS, AND THE POSTGRESQL GLOBAL DEVELOPMENT GROUP HAS NO OBLIGATIONS TO
-PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
-
-IN NO EVENT SHALL POSTGRES-XC DEVELOPMENT GROUP BE LIABLE TO ANY
-PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL
-DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS
-SOFTWARE AND ITS DOCUMENTATION, EVEN IF POSTGRES-XC DEVELOPMENT
-GROUP HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-POSTGRES-XC DEVELOPMENT GROUP SPECIFICALLY DISCLAIMS ANY WARRANTIES,
-INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
-AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
-ON AN "AS IS" BASIS, AND THE POSTGRES-XC DEVELOPMENT GROUP HAS NO OBLIGATIONS TO
-PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+Mozilla Public License Version 2.0
+==================================
+
+1. Definitions
+--------------
+
+1.1. "Contributor"
+ means each individual or legal entity that creates, contributes to
+ the creation of, or owns Covered Software.
+
+1.2. "Contributor Version"
+ means the combination of the Contributions of others (if any) used
+ by a Contributor and that particular Contributor's Contribution.
+
+1.3. "Contribution"
+ means Covered Software of a particular Contributor.
+
+1.4. "Covered Software"
+ means Source Code Form to which the initial Contributor has attached
+ the notice in Exhibit A, the Executable Form of such Source Code
+ Form, and Modifications of such Source Code Form, in each case
+ including portions thereof.
+
+1.5. "Incompatible With Secondary Licenses"
+ means
+
+ (a) that the initial Contributor has attached the notice described
+ in Exhibit B to the Covered Software; or
+
+ (b) that the Covered Software was made available under the terms of
+ version 1.1 or earlier of the License, but not also under the
+ terms of a Secondary License.
+
+1.6. "Executable Form"
+ means any form of the work other than Source Code Form.
+
+1.7. "Larger Work"
+ means a work that combines Covered Software with other material, in
+ a separate file or files, that is not Covered Software.
+
+1.8. "License"
+ means this document.
+
+1.9. "Licensable"
+ means having the right to grant, to the maximum extent possible,
+ whether at the time of the initial grant or subsequently, any and
+ all of the rights conveyed by this License.
+
+1.10. "Modifications"
+ means any of the following:
+
+ (a) any file in Source Code Form that results from an addition to,
+ deletion from, or modification of the contents of Covered
+ Software; or
+
+ (b) any new file in Source Code Form that contains any Covered
+ Software.
+
+1.11. "Patent Claims" of a Contributor
+ means any patent claim(s), including without limitation, method,
+ process, and apparatus claims, in any patent Licensable by such
+ Contributor that would be infringed, but for the grant of the
+ License, by the making, using, selling, offering for sale, having
+ made, import, or transfer of either its Contributions or its
+ Contributor Version.
+
+1.12. "Secondary License"
+ means either the GNU General Public License, Version 2.0, the GNU
+ Lesser General Public License, Version 2.1, the GNU Affero General
+ Public License, Version 3.0, or any later versions of those
+ licenses.
+
+1.13. "Source Code Form"
+ means the form of the work preferred for making modifications.
+
+1.14. "You" (or "Your")
+ means an individual or a legal entity exercising rights under this
+ License. For legal entities, "You" includes any entity that
+ controls, is controlled by, or is under common control with You. For
+ purposes of this definition, "control" means (a) the power, direct
+ or indirect, to cause the direction or management of such entity,
+ whether by contract or otherwise, or (b) ownership of more than
+ fifty percent (50%) of the outstanding shares or beneficial
+ ownership of such entity.
+
+2. License Grants and Conditions
+--------------------------------
+
+2.1. Grants
+
+Each Contributor hereby grants You a world-wide, royalty-free,
+non-exclusive license:
+
+(a) under intellectual property rights (other than patent or trademark)
+ Licensable by such Contributor to use, reproduce, make available,
+ modify, display, perform, distribute, and otherwise exploit its
+ Contributions, either on an unmodified basis, with Modifications, or
+ as part of a Larger Work; and
+
+(b) under Patent Claims of such Contributor to make, use, sell, offer
+ for sale, have made, import, and otherwise transfer either its
+ Contributions or its Contributor Version.
+
+2.2. Effective Date
+
+The licenses granted in Section 2.1 with respect to any Contribution
+become effective for each Contribution on the date the Contributor first
+distributes such Contribution.
+
+2.3. Limitations on Grant Scope
+
+The licenses granted in this Section 2 are the only rights granted under
+this License. No additional rights or licenses will be implied from the
+distribution or licensing of Covered Software under this License.
+Notwithstanding Section 2.1(b) above, no patent license is granted by a
+Contributor:
+
+(a) for any code that a Contributor has removed from Covered Software;
+ or
+
+(b) for infringements caused by: (i) Your and any other third party's
+ modifications of Covered Software, or (ii) the combination of its
+ Contributions with other software (except as part of its Contributor
+ Version); or
+
+(c) under Patent Claims infringed by Covered Software in the absence of
+ its Contributions.
+
+This License does not grant any rights in the trademarks, service marks,
+or logos of any Contributor (except as may be necessary to comply with
+the notice requirements in Section 3.4).
+
+2.4. Subsequent Licenses
+
+No Contributor makes additional grants as a result of Your choice to
+distribute the Covered Software under a subsequent version of this
+License (see Section 10.2) or under the terms of a Secondary License (if
+permitted under the terms of Section 3.3).
+
+2.5. Representation
+
+Each Contributor represents that the Contributor believes its
+Contributions are its original creation(s) or it has sufficient rights
+to grant the rights to its Contributions conveyed by this License.
+
+2.6. Fair Use
+
+This License is not intended to limit any rights You have under
+applicable copyright doctrines of fair use, fair dealing, or other
+equivalents.
+
+2.7. Conditions
+
+Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
+in Section 2.1.
+
+3. Responsibilities
+-------------------
+
+3.1. Distribution of Source Form
+
+All distribution of Covered Software in Source Code Form, including any
+Modifications that You create or to which You contribute, must be under
+the terms of this License. You must inform recipients that the Source
+Code Form of the Covered Software is governed by the terms of this
+License, and how they can obtain a copy of this License. You may not
+attempt to alter or restrict the recipients' rights in the Source Code
+Form.
+
+3.2. Distribution of Executable Form
+
+If You distribute Covered Software in Executable Form then:
+
+(a) such Covered Software must also be made available in Source Code
+ Form, as described in Section 3.1, and You must inform recipients of
+ the Executable Form how they can obtain a copy of such Source Code
+ Form by reasonable means in a timely manner, at a charge no more
+ than the cost of distribution to the recipient; and
+
+(b) You may distribute such Executable Form under the terms of this
+ License, or sublicense it under different terms, provided that the
+ license for the Executable Form does not attempt to limit or alter
+ the recipients' rights in the Source Code Form under this License.
+
+3.3. Distribution of a Larger Work
+
+You may create and distribute a Larger Work under terms of Your choice,
+provided that You also comply with the requirements of this License for
+the Covered Software. If the Larger Work is a combination of Covered
+Software with a work governed by one or more Secondary Licenses, and the
+Covered Software is not Incompatible With Secondary Licenses, this
+License permits You to additionally distribute such Covered Software
+under the terms of such Secondary License(s), so that the recipient of
+the Larger Work may, at their option, further distribute the Covered
+Software under the terms of either this License or such Secondary
+License(s).
+
+3.4. Notices
+
+You may not remove or alter the substance of any license notices
+(including copyright notices, patent notices, disclaimers of warranty,
+or limitations of liability) contained within the Source Code Form of
+the Covered Software, except that You may alter any license notices to
+the extent required to remedy known factual inaccuracies.
+
+3.5. Application of Additional Terms
+
+You may choose to offer, and to charge a fee for, warranty, support,
+indemnity or liability obligations to one or more recipients of Covered
+Software. However, You may do so only on Your own behalf, and not on
+behalf of any Contributor. You must make it absolutely clear that any
+such warranty, support, indemnity, or liability obligation is offered by
+You alone, and You hereby agree to indemnify every Contributor for any
+liability incurred by such Contributor as a result of warranty, support,
+indemnity or liability terms You offer. You may include additional
+disclaimers of warranty and limitations of liability specific to any
+jurisdiction.
+
+4. Inability to Comply Due to Statute or Regulation
+---------------------------------------------------
+
+If it is impossible for You to comply with any of the terms of this
+License with respect to some or all of the Covered Software due to
+statute, judicial order, or regulation then You must: (a) comply with
+the terms of this License to the maximum extent possible; and (b)
+describe the limitations and the code they affect. Such description must
+be placed in a text file included with all distributions of the Covered
+Software under this License. Except to the extent prohibited by statute
+or regulation, such description must be sufficiently detailed for a
+recipient of ordinary skill to be able to understand it.
+
+5. Termination
+--------------
+
+5.1. The rights granted under this License will terminate automatically
+if You fail to comply with any of its terms. However, if You become
+compliant, then the rights granted under this License from a particular
+Contributor are reinstated (a) provisionally, unless and until such
+Contributor explicitly and finally terminates Your grants, and (b) on an
+ongoing basis, if such Contributor fails to notify You of the
+non-compliance by some reasonable means prior to 60 days after You have
+come back into compliance. Moreover, Your grants from a particular
+Contributor are reinstated on an ongoing basis if such Contributor
+notifies You of the non-compliance by some reasonable means, this is the
+first time You have received notice of non-compliance with this License
+from such Contributor, and You become compliant prior to 30 days after
+Your receipt of the notice.
+
+5.2. If You initiate litigation against any entity by asserting a patent
+infringement claim (excluding declaratory judgment actions,
+counter-claims, and cross-claims) alleging that a Contributor Version
+directly or indirectly infringes any patent, then the rights granted to
+You by any and all Contributors for the Covered Software under Section
+2.1 of this License shall terminate.
+
+5.3. In the event of termination under Sections 5.1 or 5.2 above, all
+end user license agreements (excluding distributors and resellers) which
+have been validly granted by You or Your distributors under this License
+prior to termination shall survive termination.
+
+************************************************************************
+* *
+* 6. Disclaimer of Warranty *
+* ------------------------- *
+* *
+* Covered Software is provided under this License on an "as is" *
+* basis, without warranty of any kind, either expressed, implied, or *
+* statutory, including, without limitation, warranties that the *
+* Covered Software is free of defects, merchantable, fit for a *
+* particular purpose or non-infringing. The entire risk as to the *
+* quality and performance of the Covered Software is with You. *
+* Should any Covered Software prove defective in any respect, You *
+* (not any Contributor) assume the cost of any necessary servicing, *
+* repair, or correction. This disclaimer of warranty constitutes an *
+* essential part of this License. No use of any Covered Software is *
+* authorized under this License except under this disclaimer. *
+* *
+************************************************************************
+
+************************************************************************
+* *
+* 7. Limitation of Liability *
+* -------------------------- *
+* *
+* Under no circumstances and under no legal theory, whether tort *
+* (including negligence), contract, or otherwise, shall any *
+* Contributor, or anyone who distributes Covered Software as *
+* permitted above, be liable to You for any direct, indirect, *
+* special, incidental, or consequential damages of any character *
+* including, without limitation, damages for lost profits, loss of *
+* goodwill, work stoppage, computer failure or malfunction, or any *
+* and all other commercial damages or losses, even if such party *
+* shall have been informed of the possibility of such damages. This *
+* limitation of liability shall not apply to liability for death or *
+* personal injury resulting from such party's negligence to the *
+* extent applicable law prohibits such limitation. Some *
+* jurisdictions do not allow the exclusion or limitation of *
+* incidental or consequential damages, so this exclusion and *
+* limitation may not apply to You. *
+* *
+************************************************************************
+
+8. Litigation
+-------------
+
+Any litigation relating to this License may be brought only in the
+courts of a jurisdiction where the defendant maintains its principal
+place of business and such litigation shall be governed by laws of that
+jurisdiction, without reference to its conflict-of-law provisions.
+Nothing in this Section shall prevent a party's ability to bring
+cross-claims or counter-claims.
+
+9. Miscellaneous
+----------------
+
+This License represents the complete agreement concerning the subject
+matter hereof. If any provision of this License is held to be
+unenforceable, such provision shall be reformed only to the extent
+necessary to make it enforceable. Any law or regulation which provides
+that the language of a contract shall be construed against the drafter
+shall not be used to construe this License against a Contributor.
+
+10. Versions of the License
+---------------------------
+
+10.1. New Versions
+
+Mozilla Foundation is the license steward. Except as provided in Section
+10.3, no one other than the license steward has the right to modify or
+publish new versions of this License. Each version will be given a
+distinguishing version number.
+
+10.2. Effect of New Versions
+
+You may distribute the Covered Software under the terms of the version
+of the License under which You originally received the Covered Software,
+or under the terms of any subsequent version published by the license
+steward.
+
+10.3. Modified Versions
+
+If you create software not governed by this License, and you want to
+create a new license for such software, you may create and use a
+modified version of this License if you rename the license and remove
+any references to the name of the license steward (except to note that
+such modified license differs from this License).
+
+10.4. Distributing Source Code Form that is Incompatible With Secondary
+Licenses
+
+If You choose to distribute Source Code Form that is Incompatible With
+Secondary Licenses under the terms of this version of the License, the
+notice described in Exhibit B of this License must be attached.
+
+Exhibit A - Source Code Form License Notice
+-------------------------------------------
+
+ This Source Code Form is subject to the terms of the Mozilla Public
+ License, v. 2.0. If a copy of the MPL was not distributed with this
+ file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+If it is not possible or desirable to put the notice in a particular
+file, then You may include the notice in a location (such as a LICENSE
+file in a relevant directory) where a recipient would be likely to look
+for such a notice.
+
+You may add additional accurate notices of copyright ownership.
+
+Exhibit B - "Incompatible With Secondary Licenses" Notice
+---------------------------------------------------------
+
+ This Source Code Form is "Incompatible With Secondary Licenses", as
+ defined by the Mozilla Public License, v. 2.0.
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000000..14e2f777f6
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,373 @@
+Mozilla Public License Version 2.0
+==================================
+
+1. Definitions
+--------------
+
+1.1. "Contributor"
+ means each individual or legal entity that creates, contributes to
+ the creation of, or owns Covered Software.
+
+1.2. "Contributor Version"
+ means the combination of the Contributions of others (if any) used
+ by a Contributor and that particular Contributor's Contribution.
+
+1.3. "Contribution"
+ means Covered Software of a particular Contributor.
+
+1.4. "Covered Software"
+ means Source Code Form to which the initial Contributor has attached
+ the notice in Exhibit A, the Executable Form of such Source Code
+ Form, and Modifications of such Source Code Form, in each case
+ including portions thereof.
+
+1.5. "Incompatible With Secondary Licenses"
+ means
+
+ (a) that the initial Contributor has attached the notice described
+ in Exhibit B to the Covered Software; or
+
+ (b) that the Covered Software was made available under the terms of
+ version 1.1 or earlier of the License, but not also under the
+ terms of a Secondary License.
+
+1.6. "Executable Form"
+ means any form of the work other than Source Code Form.
+
+1.7. "Larger Work"
+ means a work that combines Covered Software with other material, in
+ a separate file or files, that is not Covered Software.
+
+1.8. "License"
+ means this document.
+
+1.9. "Licensable"
+ means having the right to grant, to the maximum extent possible,
+ whether at the time of the initial grant or subsequently, any and
+ all of the rights conveyed by this License.
+
+1.10. "Modifications"
+ means any of the following:
+
+ (a) any file in Source Code Form that results from an addition to,
+ deletion from, or modification of the contents of Covered
+ Software; or
+
+ (b) any new file in Source Code Form that contains any Covered
+ Software.
+
+1.11. "Patent Claims" of a Contributor
+ means any patent claim(s), including without limitation, method,
+ process, and apparatus claims, in any patent Licensable by such
+ Contributor that would be infringed, but for the grant of the
+ License, by the making, using, selling, offering for sale, having
+ made, import, or transfer of either its Contributions or its
+ Contributor Version.
+
+1.12. "Secondary License"
+ means either the GNU General Public License, Version 2.0, the GNU
+ Lesser General Public License, Version 2.1, the GNU Affero General
+ Public License, Version 3.0, or any later versions of those
+ licenses.
+
+1.13. "Source Code Form"
+ means the form of the work preferred for making modifications.
+
+1.14. "You" (or "Your")
+ means an individual or a legal entity exercising rights under this
+ License. For legal entities, "You" includes any entity that
+ controls, is controlled by, or is under common control with You. For
+ purposes of this definition, "control" means (a) the power, direct
+ or indirect, to cause the direction or management of such entity,
+ whether by contract or otherwise, or (b) ownership of more than
+ fifty percent (50%) of the outstanding shares or beneficial
+ ownership of such entity.
+
+2. License Grants and Conditions
+--------------------------------
+
+2.1. Grants
+
+Each Contributor hereby grants You a world-wide, royalty-free,
+non-exclusive license:
+
+(a) under intellectual property rights (other than patent or trademark)
+ Licensable by such Contributor to use, reproduce, make available,
+ modify, display, perform, distribute, and otherwise exploit its
+ Contributions, either on an unmodified basis, with Modifications, or
+ as part of a Larger Work; and
+
+(b) under Patent Claims of such Contributor to make, use, sell, offer
+ for sale, have made, import, and otherwise transfer either its
+ Contributions or its Contributor Version.
+
+2.2. Effective Date
+
+The licenses granted in Section 2.1 with respect to any Contribution
+become effective for each Contribution on the date the Contributor first
+distributes such Contribution.
+
+2.3. Limitations on Grant Scope
+
+The licenses granted in this Section 2 are the only rights granted under
+this License. No additional rights or licenses will be implied from the
+distribution or licensing of Covered Software under this License.
+Notwithstanding Section 2.1(b) above, no patent license is granted by a
+Contributor:
+
+(a) for any code that a Contributor has removed from Covered Software;
+ or
+
+(b) for infringements caused by: (i) Your and any other third party's
+ modifications of Covered Software, or (ii) the combination of its
+ Contributions with other software (except as part of its Contributor
+ Version); or
+
+(c) under Patent Claims infringed by Covered Software in the absence of
+ its Contributions.
+
+This License does not grant any rights in the trademarks, service marks,
+or logos of any Contributor (except as may be necessary to comply with
+the notice requirements in Section 3.4).
+
+2.4. Subsequent Licenses
+
+No Contributor makes additional grants as a result of Your choice to
+distribute the Covered Software under a subsequent version of this
+License (see Section 10.2) or under the terms of a Secondary License (if
+permitted under the terms of Section 3.3).
+
+2.5. Representation
+
+Each Contributor represents that the Contributor believes its
+Contributions are its original creation(s) or it has sufficient rights
+to grant the rights to its Contributions conveyed by this License.
+
+2.6. Fair Use
+
+This License is not intended to limit any rights You have under
+applicable copyright doctrines of fair use, fair dealing, or other
+equivalents.
+
+2.7. Conditions
+
+Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
+in Section 2.1.
+
+3. Responsibilities
+-------------------
+
+3.1. Distribution of Source Form
+
+All distribution of Covered Software in Source Code Form, including any
+Modifications that You create or to which You contribute, must be under
+the terms of this License. You must inform recipients that the Source
+Code Form of the Covered Software is governed by the terms of this
+License, and how they can obtain a copy of this License. You may not
+attempt to alter or restrict the recipients' rights in the Source Code
+Form.
+
+3.2. Distribution of Executable Form
+
+If You distribute Covered Software in Executable Form then:
+
+(a) such Covered Software must also be made available in Source Code
+ Form, as described in Section 3.1, and You must inform recipients of
+ the Executable Form how they can obtain a copy of such Source Code
+ Form by reasonable means in a timely manner, at a charge no more
+ than the cost of distribution to the recipient; and
+
+(b) You may distribute such Executable Form under the terms of this
+ License, or sublicense it under different terms, provided that the
+ license for the Executable Form does not attempt to limit or alter
+ the recipients' rights in the Source Code Form under this License.
+
+3.3. Distribution of a Larger Work
+
+You may create and distribute a Larger Work under terms of Your choice,
+provided that You also comply with the requirements of this License for
+the Covered Software. If the Larger Work is a combination of Covered
+Software with a work governed by one or more Secondary Licenses, and the
+Covered Software is not Incompatible With Secondary Licenses, this
+License permits You to additionally distribute such Covered Software
+under the terms of such Secondary License(s), so that the recipient of
+the Larger Work may, at their option, further distribute the Covered
+Software under the terms of either this License or such Secondary
+License(s).
+
+3.4. Notices
+
+You may not remove or alter the substance of any license notices
+(including copyright notices, patent notices, disclaimers of warranty,
+or limitations of liability) contained within the Source Code Form of
+the Covered Software, except that You may alter any license notices to
+the extent required to remedy known factual inaccuracies.
+
+3.5. Application of Additional Terms
+
+You may choose to offer, and to charge a fee for, warranty, support,
+indemnity or liability obligations to one or more recipients of Covered
+Software. However, You may do so only on Your own behalf, and not on
+behalf of any Contributor. You must make it absolutely clear that any
+such warranty, support, indemnity, or liability obligation is offered by
+You alone, and You hereby agree to indemnify every Contributor for any
+liability incurred by such Contributor as a result of warranty, support,
+indemnity or liability terms You offer. You may include additional
+disclaimers of warranty and limitations of liability specific to any
+jurisdiction.
+
+4. Inability to Comply Due to Statute or Regulation
+---------------------------------------------------
+
+If it is impossible for You to comply with any of the terms of this
+License with respect to some or all of the Covered Software due to
+statute, judicial order, or regulation then You must: (a) comply with
+the terms of this License to the maximum extent possible; and (b)
+describe the limitations and the code they affect. Such description must
+be placed in a text file included with all distributions of the Covered
+Software under this License. Except to the extent prohibited by statute
+or regulation, such description must be sufficiently detailed for a
+recipient of ordinary skill to be able to understand it.
+
+5. Termination
+--------------
+
+5.1. The rights granted under this License will terminate automatically
+if You fail to comply with any of its terms. However, if You become
+compliant, then the rights granted under this License from a particular
+Contributor are reinstated (a) provisionally, unless and until such
+Contributor explicitly and finally terminates Your grants, and (b) on an
+ongoing basis, if such Contributor fails to notify You of the
+non-compliance by some reasonable means prior to 60 days after You have
+come back into compliance. Moreover, Your grants from a particular
+Contributor are reinstated on an ongoing basis if such Contributor
+notifies You of the non-compliance by some reasonable means, this is the
+first time You have received notice of non-compliance with this License
+from such Contributor, and You become compliant prior to 30 days after
+Your receipt of the notice.
+
+5.2. If You initiate litigation against any entity by asserting a patent
+infringement claim (excluding declaratory judgment actions,
+counter-claims, and cross-claims) alleging that a Contributor Version
+directly or indirectly infringes any patent, then the rights granted to
+You by any and all Contributors for the Covered Software under Section
+2.1 of this License shall terminate.
+
+5.3. In the event of termination under Sections 5.1 or 5.2 above, all
+end user license agreements (excluding distributors and resellers) which
+have been validly granted by You or Your distributors under this License
+prior to termination shall survive termination.
+
+************************************************************************
+* *
+* 6. Disclaimer of Warranty *
+* ------------------------- *
+* *
+* Covered Software is provided under this License on an "as is" *
+* basis, without warranty of any kind, either expressed, implied, or *
+* statutory, including, without limitation, warranties that the *
+* Covered Software is free of defects, merchantable, fit for a *
+* particular purpose or non-infringing. The entire risk as to the *
+* quality and performance of the Covered Software is with You. *
+* Should any Covered Software prove defective in any respect, You *
+* (not any Contributor) assume the cost of any necessary servicing, *
+* repair, or correction. This disclaimer of warranty constitutes an *
+* essential part of this License. No use of any Covered Software is *
+* authorized under this License except under this disclaimer. *
+* *
+************************************************************************
+
+************************************************************************
+* *
+* 7. Limitation of Liability *
+* -------------------------- *
+* *
+* Under no circumstances and under no legal theory, whether tort *
+* (including negligence), contract, or otherwise, shall any *
+* Contributor, or anyone who distributes Covered Software as *
+* permitted above, be liable to You for any direct, indirect, *
+* special, incidental, or consequential damages of any character *
+* including, without limitation, damages for lost profits, loss of *
+* goodwill, work stoppage, computer failure or malfunction, or any *
+* and all other commercial damages or losses, even if such party *
+* shall have been informed of the possibility of such damages. This *
+* limitation of liability shall not apply to liability for death or *
+* personal injury resulting from such party's negligence to the *
+* extent applicable law prohibits such limitation. Some *
+* jurisdictions do not allow the exclusion or limitation of *
+* incidental or consequential damages, so this exclusion and *
+* limitation may not apply to You. *
+* *
+************************************************************************
+
+8. Litigation
+-------------
+
+Any litigation relating to this License may be brought only in the
+courts of a jurisdiction where the defendant maintains its principal
+place of business and such litigation shall be governed by laws of that
+jurisdiction, without reference to its conflict-of-law provisions.
+Nothing in this Section shall prevent a party's ability to bring
+cross-claims or counter-claims.
+
+9. Miscellaneous
+----------------
+
+This License represents the complete agreement concerning the subject
+matter hereof. If any provision of this License is held to be
+unenforceable, such provision shall be reformed only to the extent
+necessary to make it enforceable. Any law or regulation which provides
+that the language of a contract shall be construed against the drafter
+shall not be used to construe this License against a Contributor.
+
+10. Versions of the License
+---------------------------
+
+10.1. New Versions
+
+Mozilla Foundation is the license steward. Except as provided in Section
+10.3, no one other than the license steward has the right to modify or
+publish new versions of this License. Each version will be given a
+distinguishing version number.
+
+10.2. Effect of New Versions
+
+You may distribute the Covered Software under the terms of the version
+of the License under which You originally received the Covered Software,
+or under the terms of any subsequent version published by the license
+steward.
+
+10.3. Modified Versions
+
+If you create software not governed by this License, and you want to
+create a new license for such software, you may create and use a
+modified version of this License if you rename the license and remove
+any references to the name of the license steward (except to note that
+such modified license differs from this License).
+
+10.4. Distributing Source Code Form that is Incompatible With Secondary
+Licenses
+
+If You choose to distribute Source Code Form that is Incompatible With
+Secondary Licenses under the terms of this version of the License, the
+notice described in Exhibit B of this License must be attached.
+
+Exhibit A - Source Code Form License Notice
+-------------------------------------------
+
+ This Source Code Form is subject to the terms of the Mozilla Public
+ License, v. 2.0. If a copy of the MPL was not distributed with this
+ file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+If it is not possible or desirable to put the notice in a particular
+file, then You may include the notice in a location (such as a LICENSE
+file in a relevant directory) where a recipient would be likely to look
+for such a notice.
+
+You may add additional accurate notices of copyright ownership.
+
+Exhibit B - "Incompatible With Secondary Licenses" Notice
+---------------------------------------------------------
+
+ This Source Code Form is "Incompatible With Secondary Licenses", as
+ defined by the Mozilla Public License, v. 2.0.
diff --git a/README b/README
index 7b2882aadf..5034578eed 100644
--- a/README
+++ b/README
@@ -1,32 +1,30 @@
-Postgres-XC Database Management System
-=====================================
+Postgres-XL Database Management System
+======================================
-This directory contains the source code distribution of the Postgres-XC
+This directory contains the source code distribution of the Postgres-XL
database management system.
-Postgres-XC is an advanced object-relational cluster database management
+Postgres-XL is an advanced object-relational cluster database management
system that supports an extended subset of the SQL standard, including
transactions, foreign keys, user-defined types and functions. This
distribution also contains C language bindings.
-Postgres-XC has many language interfaces similar to PostgreSQL, many of
+Postgres-XL has many language interfaces similar to PostgreSQL, many of
which are listed here:
http://www.postgresql.org/download
See the file INSTALL for instructions on how to build and install
-Postgres-XC. That file also lists supported operating systems and
+Postgres-XL. That file also lists supported operating systems and
hardware platforms and contains information regarding any other
-software packages that are required to build or run the Postgres-XC
-system. Changes between all Postgres-XC releases are recorded in the
+software packages that are required to build or run the Postgres-XL
+system. Changes between all Postgres-XL releases are recorded in the
file HISTORY. Copyright and license information can be found in the
file COPYRIGHT. A comprehensive documentation set is included in this
distribution; it can be read as described in the installation
instructions.
The latest version of this software may be obtained at
-http://sourceforge.net/projects/postgres-xc/. For more information look at our
-web site located at http://postgres-xc.sourceforge.net/.
+http://sourceforge.net/projects/postgres-xl/. For more information look at our
+web site located at http://postgres-xl.sourceforge.net/.
-More information about Postgres-XC Development Group is available at
-http://sourceforge.net/apps/mediawiki/postgres-xc/index.php?title=Charter.
diff --git a/configure b/configure
index 021eb9268e..fb6a7e76bb 100755
--- a/configure
+++ b/configure
@@ -1,8 +1,8 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.63 for Postgres-XC 1.1devel.
+# Generated by GNU Autoconf 2.63 for PostgreSQL 9.2beta2 (Postgres-XL 9.2.0).
#
-# Report bugs to <[email protected]>.
+# Report bugs to <[email protected]>.
#
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
# 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
@@ -31,6 +31,8 @@ esac
fi
+
+
# PATH needs CR
# Avoid depending upon Character Ranges.
as_cr_letters='abcdefghijklmnopqrstuvwxyz'
@@ -594,14 +596,12 @@ MAKEFLAGS=
SHELL=${CONFIG_SHELL-/bin/sh}
# Identity of this package.
-PACKAGE_NAME='Postgres-XC'
-PACKAGE_TARNAME='postgres-xc'
-# Package is based on former PostgreSQL, so base package version on that
-PACKAGE_VERSION='9.2beta2'
-# Postgres-XC 1.1devel is based on PostgreSQL 9.1beta2
-PACKAGE_XC_VERSION='1.1devel'
-PACKAGE_STRING='Postgres-XC 1.1devel'
-PACKAGE_BUGREPORT='[email protected]'
+PACKAGE_NAME='PostgreSQL'
+PACKAGE_TARNAME='postgresql'
+PACKAGE_VERSION='9.2beta2 (Postgres-XL 9.2.0)'
+PACKAGE_XC_VERSION='9.2.0'
+PACKAGE_STRING='PostgreSQL 9.2beta2 (Postgres-XL 9.2.0)'
+PACKAGE_BUGREPORT='[email protected]'
ac_unique_file="src/backend/access/common/heaptuple.c"
ac_default_prefix=/usr/local/pgsql
@@ -759,7 +759,6 @@ build_vendor
build_cpu
build
PG_MAJORVERSION
-PGXC_MAJORVERSION
configure_args
target_alias
host_alias
@@ -1412,7 +1411,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures Postgres-XC 1.1devel to adapt to many kinds of systems.
+\`configure' configures PostgreSQL 9.2beta2 (Postgres-XL 9.2.0) to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1477,7 +1476,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of Postgres-XC 1.1devel:";;
+ short | recursive ) echo "Configuration of PostgreSQL 9.2beta2 (Postgres-XL 9.2.0):";;
esac
cat <<\_ACEOF
@@ -1561,7 +1560,7 @@ Some influential environment variables:
Use these variables to override the choices made by `configure' or to help
it to find libraries and programs with nonstandard names/locations.
-Report bugs to <[email protected]>.
+Report bugs to <[email protected]>.
_ACEOF
ac_status=$?
fi
@@ -1624,7 +1623,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-Postgres-XC configure 1.1devel
+PostgreSQL configure 9.2beta2 (Postgres-XL 9.2.0)
generated by GNU Autoconf 2.63
Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1640,7 +1639,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by Postgres-XC $as_me 1.1devel, which was
+It was created by PostgreSQL $as_me 9.2beta2 (Postgres-XL 9.2.0), which was
generated by GNU Autoconf 2.63. Invocation command line was
$ $0 $@
@@ -2069,12 +2068,6 @@ cat >>confdefs.h <<_ACEOF
#define PGXC_VERSION "$PACKAGE_XC_VERSION"
_ACEOF
-PGXC_MAJORVERSION=`expr "$PACKAGE_XC_VERSION" : '\([0-9][0-9]*\.[0-9][0-9]*\)'`
-
-cat >>confdefs.h <<_ACEOF
-#define PGXC_MAJORVERSION "$PGXC_MAJORVERSION"
-_ACEOF
-
# Make sure we can run config.sub.
$SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 ||
@@ -10249,9 +10242,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -10472,9 +10465,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -10691,9 +10684,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -10839,9 +10832,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -11004,9 +10997,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -11152,9 +11145,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -11324,9 +11317,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -11472,9 +11465,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -11620,9 +11613,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -11792,9 +11785,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -11940,9 +11933,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -12088,9 +12081,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -12244,9 +12237,9 @@ $as_echo "$as_me: WARNING: zlib.h: proceeding with the preprocessor's result" >&
{ $as_echo "$as_me:$LINENO: WARNING: zlib.h: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: zlib.h: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -12400,9 +12393,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -12548,9 +12541,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -12704,9 +12697,9 @@ $as_echo "$as_me: WARNING: krb5.h: proceeding with the preprocessor's result" >&
{ $as_echo "$as_me:$LINENO: WARNING: krb5.h: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: krb5.h: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -12848,9 +12841,9 @@ $as_echo "$as_me: WARNING: openssl/ssl.h: proceeding with the preprocessor's res
{ $as_echo "$as_me:$LINENO: WARNING: openssl/ssl.h: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: openssl/ssl.h: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -12989,9 +12982,9 @@ $as_echo "$as_me: WARNING: openssl/err.h: proceeding with the preprocessor's res
{ $as_echo "$as_me:$LINENO: WARNING: openssl/err.h: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: openssl/err.h: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -13139,9 +13132,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -13287,9 +13280,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -13443,9 +13436,9 @@ $as_echo "$as_me: WARNING: libxml/parser.h: proceeding with the preprocessor's r
{ $as_echo "$as_me:$LINENO: WARNING: libxml/parser.h: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: libxml/parser.h: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -13587,9 +13580,9 @@ $as_echo "$as_me: WARNING: libxslt/xslt.h: proceeding with the preprocessor's re
{ $as_echo "$as_me:$LINENO: WARNING: libxslt/xslt.h: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: libxslt/xslt.h: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -13738,9 +13731,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -13960,9 +13953,9 @@ $as_echo "$as_me: WARNING: dns_sd.h: proceeding with the preprocessor's result"
{ $as_echo "$as_me:$LINENO: WARNING: dns_sd.h: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: dns_sd.h: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -14111,9 +14104,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -14260,9 +14253,9 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
{ $as_echo "$as_me:$LINENO: WARNING: $ac_header: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -14423,7 +14416,7 @@ $as_echo "$as_me: WARNING: $ac_header: proceeding with the preprocessor's result
$as_echo "$as_me: WARNING: $ac_header: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
## ---------------------------------------- ##
-## Report this to [email protected] ##
+## Report this to [email protected] ##
## ---------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
@@ -23221,9 +23214,9 @@ $as_echo "$as_me: WARNING: pthread.h: proceeding with the preprocessor's result"
{ $as_echo "$as_me:$LINENO: WARNING: pthread.h: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: pthread.h: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -28568,9 +28561,9 @@ $as_echo "$as_me: WARNING: libintl.h: proceeding with the preprocessor's result"
{ $as_echo "$as_me:$LINENO: WARNING: libintl.h: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: libintl.h: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -28941,9 +28934,9 @@ $as_echo "$as_me: WARNING: tcl.h: proceeding with the preprocessor's result" >&2
{ $as_echo "$as_me:$LINENO: WARNING: tcl.h: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: tcl.h: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -29089,9 +29082,9 @@ $as_echo "$as_me: WARNING: Python.h: proceeding with the preprocessor's result"
{ $as_echo "$as_me:$LINENO: WARNING: Python.h: in the future, the compiler will take precedence" >&5
$as_echo "$as_me: WARNING: Python.h: in the future, the compiler will take precedence" >&2;}
( cat <<\_ASBOX
-## ---------------------------------------- ##
-## Report this to [email protected] ##
-## ---------------------------------------- ##
+## --------------------------------------- ##
+## Report this to [email protected] ##
+## --------------------------------------- ##
_ASBOX
) | sed "s/^/$as_me: WARNING: /" >&2
;;
@@ -29743,9 +29736,9 @@ cat >>confdefs.h <<_ACEOF
#define PG_VERSION_STR "PostgreSQL $PACKAGE_VERSION on $host, compiled by $cc_string, `expr $ac_cv_sizeof_void_p \* 8`-bit"
_ACEOF
-# Supply additional version name for Postgres-XC
+# Supply additional version name for Postgres-XL
cat >>confdefs.h <<_ACEOF
-#define PGXC_VERSION_STR "Postgres-XC $PACKAGE_XC_VERSION on $host, based on PostgreSQL $PACKAGE_VERSION, compiled by $cc_string, `expr $ac_cv_sizeof_void_p \* 8`-bit"
+#define PGXC_VERSION_STR "Postgres-XL $PACKAGE_XC_VERSION on $host, based on PostgreSQL $PACKAGE_VERSION, compiled by $cc_string, `expr $ac_cv_sizeof_void_p \* 8`-bit"
_ACEOF
# Supply a numeric version string for use by 3rd party add-ons
@@ -29758,17 +29751,10 @@ cat >>confdefs.h <<_ACEOF
#define PG_VERSION_NUM $PG_VERSION_NUM
_ACEOF
-# Supply a numeric version string specific for Postgres-XC
-PGXC_VERSION_NUM="`echo "$PACKAGE_XC_VERSION" | sed 's/[A-Za-z].*$//' |
-tr '.' ' ' |
-$AWK '{printf "%d%02d%02d", $1, $2, (NF >= 3) ? $3 : 0}'`"
-
-cat >>confdefs.h <<_ACEOF
-#define PGXC_VERSION_NUM $PGXC_VERSION_NUM
-_ACEOF
# For PGXC, set -DPGXC by default. This can be overriden with -UPGXC if the user sets it.
-CFLAGS="-DPGXC $CFLAGS"
+# For Postgres-XL, set both -DPGXC and -DXCP
+CFLAGS="-DPGXC -DXCP $CFLAGS"
# Begin output steps
@@ -30240,7 +30226,7 @@ exec 6>&1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by Postgres-XC $as_me 1.1devel, which was
+This file was extended by PostgreSQL $as_me 9.2beta2 (Postgres-XL 9.2.0), which was
generated by GNU Autoconf 2.63. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -30307,7 +30293,7 @@ Report bugs to <[email protected]>."
_ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_version="\\
-Postgres-XC config.status 1.1devel
+PostgreSQL config.status 9.2beta2 (Postgres-XL 9.2)
configured by $0, generated by GNU Autoconf 2.63,
with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
@@ -30491,7 +30477,7 @@ $debug ||
if test -n "$CONFIG_FILES"; then
-ac_cr=' '
+ac_cr=''
ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' </dev/null 2>/dev/null`
if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then
ac_cs_awk_cr='\\r'
diff --git a/configure.in b/configure.in
index 6769f790dc..892cd09b23 100644
--- a/configure.in
+++ b/configure.in
@@ -17,7 +17,7 @@ dnl Read the Autoconf manual for details.
dnl
m4_pattern_forbid(^PGAC_)dnl to catch undefined macros
-AC_INIT([Postgres-XC], [1.1devel], [[email protected]])
+AC_INIT([PostgreSQL], [9.2beta2 (Postgres-XL 9.2)], [[email protected]])
m4_if(m4_defn([m4_PACKAGE_VERSION]), [2.63], [], [m4_fatal([Autoconf version 2.63 is required.
Untested combinations of 'autoconf' and PostgreSQL versions are not
diff --git a/contrib/Makefile b/contrib/Makefile
index ad449ef10c..61f61e6ecd 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -51,7 +51,8 @@ SUBDIRS = \
test_parser \
tsearch2 \
unaccent \
- vacuumlo
+ vacuumlo \
+ stormstats
ifeq ($(with_openssl),yes)
SUBDIRS += sslinfo
diff --git a/contrib/pgxc_ctl/pgxc_ctl_bash_conf_part b/contrib/pgxc_ctl/pgxc_ctl_bash_conf_part
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/contrib/pgxc_ctl/pgxc_ctl_bash_conf_part
diff --git a/contrib/stormstats/Makefile b/contrib/stormstats/Makefile
new file mode 100644
index 0000000000..961489a501
--- /dev/null
+++ b/contrib/stormstats/Makefile
@@ -0,0 +1,15 @@
+MODULE_big = stormstats
+OBJS = stormstats.o
+
+EXTENSION = stormstats
+DATA = stormstats--1.0.sql stormstats--unpackaged--1.0.sql
+
+ifdef USE_PGXS
+PGXS := $(shell pg_config --pgxs)
+include $(PGXS)
+else
+subdir = contrib/stormstats
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/stormstats/stormstats--1.0.sql b/contrib/stormstats/stormstats--1.0.sql
new file mode 100644
index 0000000000..2ea2b32a6e
--- /dev/null
+++ b/contrib/stormstats/stormstats--1.0.sql
@@ -0,0 +1,17 @@
+CREATE FUNCTION storm_database_stats(
+ OUT datname text,
+ OUT conn_cnt int8,
+ OUT select_cnt int8,
+ OUT insert_cnt int8,
+ OUT update_cnt int8,
+ OUT delete_cnt int8,
+ OUT ddl_cnt int8
+)
+RETURNS SETOF record
+AS 'MODULE_PATHNAME'
+LANGUAGE C;
+
+-- Register a view on the function for ease of use.
+CREATE VIEW storm_database_stats AS
+ SELECT * FROM storm_database_stats();
+
diff --git a/contrib/stormstats/stormstats--unpackaged--1.0.sql b/contrib/stormstats/stormstats--unpackaged--1.0.sql
new file mode 100644
index 0000000000..df9f3a033d
--- /dev/null
+++ b/contrib/stormstats/stormstats--unpackaged--1.0.sql
@@ -0,0 +1,5 @@
+/* contrib/stormstats/stormstats--unpackaged--1.0.sql */
+
+ALTER EXTENSION stormstats ADD function storm_database_stats();
+ALTER EXTENSION stormstats ADD view storm_database_stats;
+
diff --git a/contrib/stormstats/stormstats.c b/contrib/stormstats/stormstats.c
new file mode 100644
index 0000000000..3a32d7ede8
--- /dev/null
+++ b/contrib/stormstats/stormstats.c
@@ -0,0 +1,898 @@
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "catalog/pg_type.h"
+#include "executor/spi.h"
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/spin.h"
+#include "access/hash.h"
+
+#include "tcop/utility.h"
+#include "commands/dbcommands.h"
+#include "utils/builtins.h"
+#include "utils/syscache.h"
+#include "utils/snapmgr.h"
+#include "libpq/auth.h"
+#include "optimizer/planner.h"
+#include "nodes/makefuncs.h"
+#include "funcapi.h"
+#include "stormstats.h"
+
+#include "pgxc/pgxc.h"
+#include "pgxc/pgxcnode.h"
+#include "pgxc/planner.h"
+#include "pgxc/execRemote.h"
+
+/* mark this dynamic library to be compatible with PG */
+PG_MODULE_MAGIC;
+
+/* Location of stats file */
+#define STORM_DUMP_FILE "global/storm.stat"
+
+/* This constant defines the magic number in the stats file header */
+static const uint32 STORM_FILE_HEADER = 0x20120229;
+
+#define STORM_STATS_COLS 7
+
+typedef struct ssHashKey
+{
+ int dbname_len;
+ const char *dbname_ptr;
+} ssHashKey;
+
+typedef struct EventCounters
+{
+ int64 conn_cnt;
+ int64 select_cnt;
+ int64 insert_cnt;
+ int64 update_cnt;
+ int64 delete_cnt;
+ int64 ddl_cnt;
+} EventCounters;
+
+typedef struct StormStatsEntry
+{
+ ssHashKey key; /* hash key of entry - MUST BE FIRST */
+ EventCounters counters;
+ slock_t mutex;
+ char dbname[1]; /* VARIABLE LENGTH ARRAY - MUST BE LAST */
+
+} StormStatsEntry;
+
+/* Local hash table entry, no mutex needed */
+typedef struct LocalStatsEntry
+{
+ ssHashKey key; /* hash key of entry */
+ EventCounters counters;
+ char dbname[NAMEDATALEN];
+} LocalStatsEntry;
+
+typedef struct StormSharedState
+{
+ LWLockId lock;
+} StormSharedState;
+
+static bool sp_save; /* whether to save stats across shutdown */
+
+extern PlannedStmt *planner_callback(Query *parse, int cursorOptions, ParamListInfo boundParams);
+extern void auth_check(Port *port, int status);
+
+static void sp_shmem_startup(void);
+static void sp_shmem_shutdown(int code, Datum arg);
+static Size hash_memsize(void);
+
+static uint32 ss_hash_fn(const void *key, Size keysize);
+static int ss_match_fn(const void *key1, const void *key2, Size keysize);
+static void stats_store(const char *dbname, CmdType c, bool isConnEvent, bool isUtilEvent);
+
+static StormStatsEntry *alloc_event_entry(ssHashKey *key);
+
+/* Functions */
+Datum storm_database_stats(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(storm_database_stats);
+
+/* Shared Memory Objects */
+static HTAB *StatsEntryHash = NULL;
+static StormSharedState *shared_state = NULL;
+
+/* Session level objects */
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+
+static ClientAuthentication_hook_type original_client_auth_hook = NULL;
+
+static ProcessUtility_hook_type prev_ProcessUtility = NULL;
+
+static int max_tracked_dbs;
+
+static void
+ProcessUtility_callback(Node *parsetree,
+ const char *queryString,
+ ParamListInfo params,
+ bool isTopLevel,
+ DestReceiver *dest,
+#ifdef PGXC
+ bool sentToRemote,
+#endif /* PGXC */
+ char *completionTag)
+{
+ elog( DEBUG1, "STORMSTATS: using plugin." );
+
+ standard_ProcessUtility(parsetree, queryString, params, isTopLevel, dest,
+#ifdef PGXC
+ sentToRemote,
+#endif /* PGXC */
+ completionTag);
+
+ stats_store(get_database_name(MyDatabaseId), CMD_UNKNOWN, false, true);
+
+ /*
+ * Check if it's a CREATE/DROP DATABASE command. Update entries in the
+ * shared hash table accordingly.
+ */
+ switch (nodeTag(parsetree))
+ {
+ case T_CreatedbStmt:
+ {
+ ssHashKey key;
+ StormStatsEntry *entry;
+ CreatedbStmt *stmt = (CreatedbStmt *)parsetree;
+
+ /* Set up key for hashtable search */
+ key.dbname_len = strlen(stmt->dbname);
+ key.dbname_ptr = stmt->dbname;
+
+ /*
+ * Lookup the hash table entry with exclusive lock. We have to
+ * manipulate the entries immediately anyways..
+ */
+ LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
+
+ entry = (StormStatsEntry *) hash_search(StatsEntryHash, &key, HASH_FIND, NULL);
+
+ /* What do we do if we find an entry already? We WARN for now */
+ if (!entry)
+ entry = alloc_event_entry(&key);
+ else
+ ereport(WARNING,
+ (errmsg("entry exists already for database %s!",
+ entry->dbname)));
+ LWLockRelease(shared_state->lock);
+ break;
+ }
+ case T_DropdbStmt:
+ {
+ ssHashKey key;
+ StormStatsEntry *entry;
+ DropdbStmt *stmt = (DropdbStmt *)parsetree;
+
+ /* Set up key for hashtable search */
+ key.dbname_len = strlen(stmt->dbname);
+ key.dbname_ptr = stmt->dbname;
+
+ /*
+ * Lookup the hash table entry with exclusive lock. We have to
+ * manipulate the entries immediately anyways..
+ */
+ LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
+
+ entry = (StormStatsEntry *) hash_search(StatsEntryHash, &key, HASH_REMOVE, NULL);
+
+ /* What do we do if we do not find an entry? We WARN for now */
+ if (!entry && !stmt->missing_ok)
+ ereport(WARNING,
+ (errmsg("entry does not exist for database %s!",
+ stmt->dbname)));
+ LWLockRelease(shared_state->lock);
+ break;
+ }
+ default:
+ /* Nothing */;
+ }
+}
+
+void
+_PG_init(void)
+{
+ if (!process_shared_preload_libraries_in_progress)
+ return;
+
+ DefineCustomIntVariable("storm_stats.max_tracked_databases",
+ "Sets the maximum number of databases tracked.",
+ NULL,
+ &max_tracked_dbs,
+ 1000,
+ 1,
+ INT_MAX,
+ PGC_POSTMASTER,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ DefineCustomBoolVariable("storm_stats.save",
+ "Save statistics across server shutdowns.",
+ NULL,
+ &sp_save,
+ true,
+ PGC_SIGHUP,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ EmitWarningsOnPlaceholders("storm_stats");
+
+ RequestAddinShmemSpace(hash_memsize());
+ RequestAddinLWLocks(1);
+
+ prev_shmem_startup_hook = shmem_startup_hook;
+ shmem_startup_hook = sp_shmem_startup;
+ planner_hook = planner_callback;
+
+ original_client_auth_hook = ClientAuthentication_hook;
+ ClientAuthentication_hook = auth_check;
+
+ prev_ProcessUtility = ProcessUtility_hook;
+ ProcessUtility_hook = ProcessUtility_callback;
+
+ elog( DEBUG1, "STORMSTATS: plugin loaded" );
+}
+
+void
+_PG_fini(void)
+{
+ shmem_startup_hook = prev_shmem_startup_hook;
+ planner_hook = NULL;
+ ProcessUtility_hook = prev_ProcessUtility;
+
+ elog( DEBUG1, "STORMSTATS: plugin unloaded." );
+}
+
+static void sp_shmem_startup(void)
+{
+ HASHCTL event_ctl;
+ bool found;
+ FILE *file;
+ uint32 header;
+ int32 num;
+ int32 i;
+ int buffer_size;
+ char *buffer = NULL;
+
+ if (prev_shmem_startup_hook)
+ prev_shmem_startup_hook();
+
+ /*
+ * Create or attach to the shared memory state, including hash table
+ */
+ LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+
+ shared_state = ShmemInitStruct("storm_stats state", sizeof(StormSharedState), &found);
+ if (!shared_state)
+ elog(ERROR, "out of shared memory");
+
+ if (!found)
+ {
+ shared_state->lock = LWLockAssign();
+ }
+
+ memset(&event_ctl, 0, sizeof(event_ctl));
+
+ event_ctl.keysize = sizeof(ssHashKey);
+ event_ctl.entrysize = sizeof(StormStatsEntry) + NAMEDATALEN;
+ event_ctl.hash = ss_hash_fn;
+ event_ctl.match = ss_match_fn;
+
+ StatsEntryHash = ShmemInitHash("storm_stats event hash", max_tracked_dbs,
+ max_tracked_dbs, &event_ctl,
+ HASH_ELEM | HASH_FUNCTION | HASH_COMPARE);
+ if (!StatsEntryHash)
+ elog(ERROR, "out of shared memory");
+
+ LWLockRelease(AddinShmemInitLock);
+
+ /*
+ * If we're in the postmaster (or a standalone backend...), set up a shmem
+ * exit hook to dump the statistics to disk.
+ */
+ if (!IsUnderPostmaster)
+ on_shmem_exit(sp_shmem_shutdown, (Datum) 0);
+
+ /*
+ * Attempt to load old statistics from the dump file, if this is the first
+ * time through and we weren't told not to.
+ */
+ if (found || !sp_save)
+ return;
+
+ /*
+ * Note: we don't bother with locks here, because there should be no other
+ * processes running when this code is reached.
+ */
+ file = AllocateFile(STORM_DUMP_FILE, PG_BINARY_R);
+ if (file == NULL)
+ {
+ if (errno == ENOENT)
+ return; /* ignore not-found error */
+ goto error;
+ }
+
+ buffer_size = NAMEDATALEN;
+ buffer = (char *) palloc(buffer_size);
+
+ if (fread(&header, sizeof(uint32), 1, file) != 1 ||
+ header != STORM_FILE_HEADER ||
+ fread(&num, sizeof(int32), 1, file) != 1)
+ goto error;
+
+ for (i = 0; i < num; i++)
+ {
+ StormStatsEntry temp;
+ StormStatsEntry *entry;
+
+ if (fread(&temp, offsetof(StormStatsEntry, mutex), 1, file) != 1)
+ goto error;
+
+ if (temp.key.dbname_len >= buffer_size)
+ {
+ buffer = (char *) repalloc(buffer, temp.key.dbname_len + 1);
+ buffer_size = temp.key.dbname_len + 1;
+ }
+
+ if (fread(buffer, 1, temp.key.dbname_len, file) != temp.key.dbname_len)
+ goto error;
+ buffer[temp.key.dbname_len] = '\0';
+
+ temp.key.dbname_ptr = buffer;
+
+ /* make the hashtable entry (discards old entries if too many) */
+ entry = alloc_event_entry(&temp.key);
+
+ /* copy in the actual stats */
+ entry->counters = temp.counters;
+ }
+
+ pfree(buffer);
+ FreeFile(file);
+ return;
+
+error:
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not read stormstats file \"%s\": %m",
+ STORM_DUMP_FILE)));
+ if (buffer)
+ pfree(buffer);
+ if (file)
+ FreeFile(file);
+ /* If possible, throw away the bogus file; ignore any error */
+ unlink(STORM_DUMP_FILE);
+}
+
+/*
+ * shmem_shutdown hook: Dump statistics into file.
+ *
+ * Note: we don't bother with acquiring lock, because there should be no
+ * other processes running when this is called.
+ */
+static void
+sp_shmem_shutdown(int code, Datum arg)
+{
+ FILE *file;
+ HASH_SEQ_STATUS hash_seq;
+ int32 num_entries;
+ StormStatsEntry *entry;
+
+ /* Don't try to dump during a crash. */
+ if (code)
+ return;
+
+ /* Safety check ... shouldn't get here unless shmem is set up. */
+ if (!shared_state || !StatsEntryHash)
+ return;
+
+ /* Don't dump if told not to. */
+ if (!sp_save)
+ return;
+
+ file = AllocateFile(STORM_DUMP_FILE, PG_BINARY_W);
+ if (file == NULL)
+ goto error;
+
+ if (fwrite(&STORM_FILE_HEADER, sizeof(uint32), 1, file) != 1)
+ goto error;
+ num_entries = hash_get_num_entries(StatsEntryHash);
+ if (fwrite(&num_entries, sizeof(int32), 1, file) != 1)
+ goto error;
+
+ hash_seq_init(&hash_seq, StatsEntryHash);
+ while ((entry = hash_seq_search(&hash_seq)) != NULL)
+ {
+ int len = entry->key.dbname_len;
+
+ if (fwrite(entry, offsetof(StormStatsEntry, mutex), 1, file) != 1 ||
+ fwrite(entry->dbname, 1, len, file) != len)
+ goto error;
+ }
+
+ if (FreeFile(file))
+ {
+ file = NULL;
+ goto error;
+ }
+
+ return;
+
+error:
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not write stormstats file \"%s\": %m",
+ STORM_DUMP_FILE)));
+
+ if (file)
+ FreeFile(file);
+ unlink(STORM_DUMP_FILE);
+}
+
+PlannedStmt *planner_callback(Query *parse, int cursorOptions, ParamListInfo boundParams)
+{
+ PlannedStmt *plan;
+
+ elog( DEBUG1, "STORMSTATS: using plugin." );
+
+ /* Generate a plan */
+ plan = standard_planner(parse, cursorOptions, boundParams);
+
+ stats_store(get_database_name(MyDatabaseId), parse->commandType, false, false);
+
+ return plan;
+}
+
+void auth_check(Port *port, int status)
+{
+ elog( DEBUG1, "STORMSTATS: using plugin." );
+
+ /*
+ * Any other plugins which use ClientAuthentication_hook.
+ */
+ if (original_client_auth_hook)
+ original_client_auth_hook(port, status);
+
+ if (status == STATUS_OK)
+ {
+ stats_store(port->database_name, CMD_UNKNOWN, true, false);
+ }
+}
+
+static Size hash_memsize(void)
+{
+ Size size;
+ Size events_size;
+ Size state_size;
+
+ events_size = hash_estimate_size(max_tracked_dbs, MAXALIGN(sizeof(StormStatsEntry)));
+ state_size = MAXALIGN(sizeof(StormSharedState));
+
+ size = add_size(events_size, state_size);
+
+ return size;
+}
+
+static StormStatsEntry *alloc_event_entry(ssHashKey *key)
+{
+ StormStatsEntry *entry;
+ bool found;
+
+ if (hash_get_num_entries(StatsEntryHash) >= max_tracked_dbs)
+ {
+ elog(ERROR, "STORMSTATS: The maximum number of tracked databases have been reached");
+ return NULL;
+ }
+
+ /* Find or create an entry with desired hash code */
+ entry = (StormStatsEntry *) hash_search(StatsEntryHash, key, HASH_ENTER, &found);
+
+ if (!found)
+ {
+ entry->key.dbname_ptr = entry->dbname;
+ memset(&entry->counters, 0, sizeof(EventCounters));
+ SpinLockInit(&entry->mutex);
+
+ memcpy(entry->dbname, key->dbname_ptr, key->dbname_len);
+ entry->dbname[key->dbname_len] = '\0';
+ }
+
+ return entry;
+}
+
+/*
+ * Calculate hash value for a key
+ */
+static uint32
+ss_hash_fn(const void *key, Size keysize)
+{
+ const ssHashKey *k = (const ssHashKey *) key;
+
+ /* we don't bother to include encoding in the hash */
+ return DatumGetUInt32(hash_any((const unsigned char *) k->dbname_ptr,
+ k->dbname_len));
+}
+
+/*
+ * Compare two keys - zero means match
+ */
+static int
+ss_match_fn(const void *key1, const void *key2, Size keysize)
+{
+ const ssHashKey *k1 = (const ssHashKey *) key1;
+ const ssHashKey *k2 = (const ssHashKey *) key2;
+
+ if (k1->dbname_len == k2->dbname_len &&
+ memcmp(k1->dbname_ptr, k2->dbname_ptr, k1->dbname_len) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+static void
+stats_store(const char *dbname, CmdType c, bool isConnEvent, bool isUtilEvent)
+{
+ ssHashKey key;
+ StormStatsEntry *entry;
+
+ if (!shared_state || !StatsEntryHash)
+ return;
+
+ /* Set up key for hashtable search */
+ key.dbname_len = strlen(dbname);
+ key.dbname_ptr = dbname;
+
+ /* Lookup the hash table entry with shared lock. */
+ LWLockAcquire(shared_state->lock, LW_SHARED);
+
+ entry = (StormStatsEntry *) hash_search(StatsEntryHash, &key, HASH_FIND, NULL);
+ if (!entry)
+ {
+ /* Must acquire exclusive lock to add a new entry. */
+ LWLockRelease(shared_state->lock);
+ LWLockAcquire(shared_state->lock, LW_EXCLUSIVE);
+ entry = alloc_event_entry(&key);
+ }
+
+ /* Grab the spinlock while updating the counters. */
+ {
+ volatile StormStatsEntry *e = (volatile StormStatsEntry *) entry;
+
+ SpinLockAcquire(&e->mutex);
+
+ if (isConnEvent) {
+ e->counters.conn_cnt += 1;
+ } else if (isUtilEvent) {
+ e->counters.ddl_cnt += 1;
+ } else {
+ switch (c)
+ {
+ case CMD_SELECT:
+ e->counters.select_cnt += 1;
+ break;
+ case CMD_INSERT:
+ e->counters.insert_cnt += 1;
+ break;
+ case CMD_UPDATE:
+ e->counters.update_cnt += 1;
+ break;
+ case CMD_DELETE:
+ e->counters.delete_cnt += 1;
+ break;
+ case CMD_UTILITY:
+ case CMD_UNKNOWN:
+ case CMD_NOTHING:
+ break;
+ }
+ }
+ SpinLockRelease(&e->mutex);
+ }
+
+ LWLockRelease(shared_state->lock);
+}
+
+/*
+ * Gather statistics from remote coordinators
+ */
+static HTAB *
+storm_gather_remote_coord_info(Oid funcid)
+{
+ bool found;
+ EState *estate;
+ TupleTableSlot *result;
+ RemoteQuery *step;
+ RemoteQueryState *node;
+ int i, ncolumns;
+ HeapTuple tp;
+ TupleDesc tupdesc;
+ MemoryContext oldcontext;
+ HTAB *LocalStatsHash;
+ HASHCTL event_ctl;
+
+ /*
+ * We will sort output by database name, should make adding up info from
+ * multiple remote coordinators easier
+ */
+ char *query = "SELECT * FROM storm_database_stats() ORDER BY datname";
+
+ /* Build up RemoteQuery */
+ step = makeNode(RemoteQuery);
+
+ step->combine_type = COMBINE_TYPE_NONE;
+ step->exec_nodes = NULL;
+ step->sql_statement = query;
+ step->force_autocommit = false;
+ step->read_only = true;
+ step->exec_type = EXEC_ON_COORDS;
+
+ /* Build a local hash table to contain info from remote nodes */
+ memset(&event_ctl, 0, sizeof(event_ctl));
+
+ event_ctl.keysize = sizeof(ssHashKey);
+ event_ctl.entrysize = sizeof(LocalStatsEntry);
+ event_ctl.hash = ss_hash_fn;
+ event_ctl.match = ss_match_fn;
+
+ LocalStatsHash = hash_create("storm_stats local hash", max_tracked_dbs,
+ &event_ctl,
+ HASH_ELEM | HASH_FUNCTION | HASH_COMPARE);
+ if (!LocalStatsHash)
+ elog(ERROR, "out of memory");
+
+ /*
+ * Add targetlist entries. We use the proc oid to get the tupledesc for
+ * this. We could have hardcoded the types of existing set of columns, but
+ * if we change the columns later for whatever reasons, this keeps us sane
+ */
+ tp = SearchSysCache1(PROCOID, ObjectIdGetDatum(funcid));
+
+ /* Build a tupdesc of all the OUT parameters */
+ tupdesc = build_function_result_tupdesc_t(tp);
+ ncolumns = tupdesc->natts;
+
+ for (i = 0; i < ncolumns; ++i)
+ {
+ Var *var;
+ TargetEntry *tle;
+
+ var = makeVar(1,
+ tupdesc->attrs[i]->attnum,
+ tupdesc->attrs[i]->atttypid,
+ tupdesc->attrs[i]->atttypmod,
+ InvalidOid,
+ 0);
+
+ tle = makeTargetEntry((Expr *) var, tupdesc->attrs[i]->attnum, NULL, false);
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist, tle);
+ }
+ ReleaseSysCache(tp);
+
+ /* Execute query on the data nodes */
+ estate = CreateExecutorState();
+
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ estate->es_snapshot = GetActiveSnapshot();
+
+ node = ExecInitRemoteQuery(step, estate, 0);
+ MemoryContextSwitchTo(oldcontext);
+ /* get ready to combine results */
+ result = ExecRemoteQuery(node);
+ while (result != NULL && !TupIsNull(result))
+ {
+ Datum value;
+ bool isnull;
+ ssHashKey key;
+ LocalStatsEntry *entry;
+ char *dbname;
+
+ /* Process statistics from the coordinator nodes */
+ value = slot_getattr(result, 1, &isnull); /* datname */
+ if (isnull)
+ ereport(ERROR,
+ (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+ errmsg("database name must not be null")));
+
+ dbname = TextDatumGetCString(value);
+
+ /* Set up key for hashtable search */
+ key.dbname_len = strlen(dbname);
+ key.dbname_ptr = dbname;
+
+ /* Find or create an entry with desired hash code */
+ entry = (LocalStatsEntry *) hash_search(LocalStatsHash, &key, HASH_ENTER, &found);
+ if (!found)
+ {
+ entry->key.dbname_ptr = entry->dbname;
+ memset(&entry->counters, 0, sizeof(EventCounters));
+ memcpy(entry->dbname, key.dbname_ptr, key.dbname_len);
+ entry->dbname[key.dbname_len] = '\0';
+ }
+
+ value = slot_getattr(result, 2, &isnull); /* conn_cnt */
+ if (!isnull)
+ entry->counters.conn_cnt += DatumGetInt64(value);
+
+ value = slot_getattr(result, 3, &isnull); /* select_cnt */
+ if (!isnull)
+ entry->counters.select_cnt += DatumGetInt64(value);
+
+ value = slot_getattr(result, 4, &isnull); /* insert_cnt */
+ if (!isnull)
+ entry->counters.insert_cnt += DatumGetInt64(value);
+
+ value = slot_getattr(result, 5, &isnull); /* update_cnt */
+ if (!isnull)
+ entry->counters.update_cnt += DatumGetInt64(value);
+
+ value = slot_getattr(result, 6, &isnull); /* delete_cnt */
+ if (!isnull)
+ entry->counters.delete_cnt += DatumGetInt64(value);
+
+ value = slot_getattr(result, 7, &isnull); /* ddl_cnt */
+ if (!isnull)
+ entry->counters.ddl_cnt += DatumGetInt64(value);
+
+ /* fetch next */
+ result = ExecRemoteQuery(node);
+ }
+ ExecEndRemoteQuery(node);
+
+ return LocalStatsHash;
+}
+
+Datum storm_database_stats(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ TupleDesc tupdesc;
+ Tuplestorestate *tupstore;
+ MemoryContext per_query_ctx;
+ MemoryContext oldcontext;
+ HASH_SEQ_STATUS hash_seq;
+ StormStatsEntry *entry;
+ HTAB *LocalStatsHash = NULL;
+
+ if (IS_PGXC_DATANODE)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("invalid invocation on data node")));
+
+ if (!shared_state || !StatsEntryHash)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("storm_stats must be loaded via shared_preload_libraries")));
+
+ /* check to see if caller supports us returning a tuplestore */
+ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("set-valued function called in context that cannot accept a set")));
+ if (!(rsinfo->allowedModes & SFRM_Materialize))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("materialize mode required, but it is not " \
+ "allowed in this context")));
+
+ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+ oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+ /*
+ * Query the rest of the coordinators and get their stats. Do this only if
+ * you are the query originator. Otherwise just provide your local info and
+ * return
+ */
+ if (IsConnFromApp())
+ LocalStatsHash = storm_gather_remote_coord_info(fcinfo->flinfo->fn_oid);
+
+ tupdesc = CreateTemplateTupleDesc(STORM_STATS_COLS, false);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 1, "dbname", TEXTOID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 2, "conn_cnt", INT8OID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 3, "select_cnt", INT8OID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 4, "insert_cnt", INT8OID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 5, "update_cnt", INT8OID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 6, "delete_cnt", INT8OID, -1, 0);
+ TupleDescInitEntry(tupdesc, (AttrNumber) 7, "ddl_cnt", INT8OID, -1, 0);
+
+ tupstore = tuplestore_begin_heap(true, false, work_mem);
+ rsinfo->returnMode = SFRM_Materialize;
+ rsinfo->setResult = tupstore;
+ rsinfo->setDesc = tupdesc;
+
+ LWLockAcquire(shared_state->lock, LW_SHARED);
+
+ hash_seq_init(&hash_seq, StatsEntryHash);
+ while ((entry = hash_seq_search(&hash_seq)) != NULL)
+ {
+ Datum values[STORM_STATS_COLS];
+ bool nulls[STORM_STATS_COLS];
+ int i = 0;
+ EventCounters tmp, lcl;
+
+ /* generate junk in short-term context */
+ MemoryContextSwitchTo(oldcontext);
+
+ memset(values, 0, sizeof(values));
+ memset(nulls, 0, sizeof(nulls));
+ memset(&lcl, 0, sizeof(lcl));
+
+ values[i++] = CStringGetTextDatum(entry->dbname);
+
+ /* copy counters to a local variable to keep locking time short */
+ {
+ volatile StormStatsEntry *e = (volatile StormStatsEntry *) entry;
+
+ SpinLockAcquire(&e->mutex);
+ tmp = e->counters;
+ SpinLockRelease(&e->mutex);
+ }
+
+ /* See if LocalStatsHash has additional info to provide */
+ if (LocalStatsHash)
+ {
+ ssHashKey key;
+ LocalStatsEntry *le;
+ bool found;
+
+ /* Set up key for hashtable search */
+ key.dbname_len = strlen(entry->dbname);
+ key.dbname_ptr = entry->dbname;
+
+ /* Find an entry with desired hash code */
+ le = (LocalStatsEntry *) hash_search(LocalStatsHash, &key, HASH_FIND, &found);
+
+ /*
+ * What should we do if an entry is not found on the other
+ * coordinators? WARN for now..
+ */
+ if (!found)
+ {
+ ereport(WARNING,
+ (errmsg("no stats collected from remote coordinators for database %s!",
+ entry->dbname)));
+ }
+ else
+ {
+ tmp.ddl_cnt += le->counters.ddl_cnt;
+ tmp.conn_cnt += le->counters.conn_cnt;
+ tmp.select_cnt += le->counters.select_cnt;
+ tmp.insert_cnt += le->counters.insert_cnt;
+ tmp.update_cnt += le->counters.update_cnt;
+ tmp.delete_cnt += le->counters.delete_cnt;
+ }
+ }
+
+ values[i++] = Int64GetDatumFast(tmp.conn_cnt);
+ values[i++] = Int64GetDatumFast(tmp.select_cnt);
+ values[i++] = Int64GetDatumFast(tmp.insert_cnt);
+ values[i++] = Int64GetDatumFast(tmp.update_cnt);
+ values[i++] = Int64GetDatumFast(tmp.delete_cnt);
+ values[i++] = Int64GetDatumFast(tmp.ddl_cnt);
+
+ Assert(i == STORM_STATS_COLS);
+
+ /* switch to appropriate context while storing the tuple */
+ MemoryContextSwitchTo(per_query_ctx);
+ tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+ }
+
+ LWLockRelease(shared_state->lock);
+
+ /* clean up and return the tuplestore */
+ tuplestore_donestoring(tupstore);
+
+ /* destroy local hash table */
+ if (LocalStatsHash)
+ hash_destroy(LocalStatsHash);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ return (Datum) 0;
+}
diff --git a/contrib/stormstats/stormstats.control b/contrib/stormstats/stormstats.control
new file mode 100644
index 0000000000..b7816feef9
--- /dev/null
+++ b/contrib/stormstats/stormstats.control
@@ -0,0 +1,5 @@
+# stormstats extension
+comment = 'collect deeper database stats for StormDB'
+default_version = '1.0'
+module_pathname = '$libdir/stormstats'
+relocatable = true
diff --git a/contrib/stormstats/stormstats.h b/contrib/stormstats/stormstats.h
new file mode 100644
index 0000000000..c11846d0a5
--- /dev/null
+++ b/contrib/stormstats/stormstats.h
@@ -0,0 +1,9 @@
+#ifndef STORMSTATS_H
+#define STORMSTATS_H
+
+#include "postgres.h"
+
+extern void _PG_init(void);
+extern void _PG_fini(void);
+
+#endif /* STORMSTATS_H */
diff --git a/src/backend/Makefile b/src/backend/Makefile
index 828c084ceb..611d29fa87 100644
--- a/src/backend/Makefile
+++ b/src/backend/Makefile
@@ -50,8 +50,8 @@ OBJS = $(SUBDIROBJS) $(LOCALOBJS) \
$(top_builddir)/src/interfaces/libpq/pqexpbuffer.o \
$(top_builddir)/src/port/libpgport_srv.a \
$(top_builddir)/src/gtm/client/libgtmclient.a \
- $(top_builddir)/src/gtm/common/libgtmcommon.a \
- $(top_builddir)/src/interfaces/libpq/libpq.a
+ $(top_builddir)/src/gtm/common/libgtm.a \
+ $(top_builddir)/src/gtm/libpq/libpqcomm.a
# We put libpgport into OBJS, so remove it from LIBS; also add libldap
LIBS := $(filter-out -lpgport, $(LIBS)) $(LDAP_LIBS_BE)
@@ -147,15 +147,6 @@ catalog/schemapg.h: | submake-schemapg
$(top_builddir)/src/port/libpgport_srv.a: | submake-libpgport
-# Those are rules to create dependent GTM libraries automatically
-$(top_builddir)/src/interfaces/libpq/libpq.a:
- $(MAKE) -C $(top_builddir)/src/interfaces/libpq libpq.a
-
-$(top_builddir)/src/gtm/common/libgtmcommon.a:
- $(MAKE) -C $(top_builddir)/src/gtm/common libgtmcommon.a
-
-$(top_builddir)/src/gtm/client/libgtmclient.a:
- $(MAKE) -C $(top_builddir)/src/gtm/client libgtmclient.a
# The postgres.o target is needed by the rule in Makefile.global that
# creates the exports file when MAKE_EXPORTS = true.
diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c
index 998fc5d58d..62479c04bb 100644
--- a/src/backend/access/common/heaptuple.c
+++ b/src/backend/access/common/heaptuple.c
@@ -45,6 +45,11 @@
* and we'd like to still refer to them via C struct offsets.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -63,7 +68,9 @@
#include "access/sysattr.h"
#include "access/tuptoaster.h"
#include "executor/tuptable.h"
-
+#ifdef XCP
+#include "utils/memutils.h"
+#endif
/* Does att's datatype allow packing into the 1-byte-header varlena format? */
#define ATT_IS_PACKABLE(att) \
@@ -1154,14 +1161,23 @@ slot_deform_datarow(TupleTableSlot *slot)
int attnum;
int i;
int col_count;
+#ifdef XCP
+ char *cur = slot->tts_datarow->msg;
+#else
char *cur = slot->tts_dataRow;
+#endif
StringInfo buffer;
uint16 n16;
uint32 n32;
MemoryContext oldcontext;
+#ifdef XCP
+ if (slot->tts_tupleDescriptor == NULL || slot->tts_datarow == NULL)
+ return;
+#else
if (slot->tts_tupleDescriptor == NULL || slot->tts_dataRow == NULL)
return;
+#endif
attnum = slot->tts_tupleDescriptor->natts;
@@ -1169,7 +1185,10 @@ slot_deform_datarow(TupleTableSlot *slot)
if (slot->tts_nvalid == attnum)
return;
+#ifndef XCP
+ /* XCP: Can not happen, we return earlier if condition not true */
Assert(slot->tts_dataRow);
+#endif
memcpy(&n16, cur, 2);
cur += 2;
@@ -1180,6 +1199,30 @@ slot_deform_datarow(TupleTableSlot *slot)
(errcode(ERRCODE_DATA_CORRUPTED),
errmsg("Tuple does not match the descriptor")));
+#ifdef XCP
+ if (slot->tts_attinmeta == NULL)
+ {
+ /*
+ * Ensure info about input functions is available as long as slot lives
+ */
+ oldcontext = MemoryContextSwitchTo(slot->tts_mcxt);
+ slot->tts_attinmeta = TupleDescGetAttInMetadata(slot->tts_tupleDescriptor);
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ /*
+ * Store values in a separate context so they can be freed easily when the
+ * base datarow is freed
+ */
+ if (slot->tts_drowcxt == NULL)
+ {
+ slot->tts_drowcxt = AllocSetContextCreate(slot->tts_mcxt,
+ "Datarow",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ }
+#else
/*
* Ensure info about input functions is available as long as slot lives
* as well as deformed values
@@ -1188,10 +1231,12 @@ slot_deform_datarow(TupleTableSlot *slot)
if (slot->tts_attinmeta == NULL)
slot->tts_attinmeta = TupleDescGetAttInMetadata(slot->tts_tupleDescriptor);
+#endif
buffer = makeStringInfo();
for (i = 0; i < attnum; i++)
{
+ Form_pg_attribute attr = slot->tts_tupleDescriptor->attrs[i];
int len;
/* get size */
@@ -1217,6 +1262,48 @@ slot_deform_datarow(TupleTableSlot *slot)
slot->tts_isnull[i] = false;
resetStringInfo(buffer);
+
+#ifdef XCP
+ /*
+ * The input function was executed in caller's memory context,
+ * because it may be allocating working memory, and caller may
+ * want to clean it up.
+ * However returned Datums need to be in the special context, so
+ * if attribute is pass-by-reference, copy it.
+ */
+ if (!attr->attbyval)
+ {
+ Pointer val = DatumGetPointer(slot->tts_values[i]);
+ Size data_length;
+ void *data;
+
+ if (attr->attlen == -1)
+ {
+ /* varlena */
+ if (VARATT_IS_EXTERNAL(val))
+ /* no alignment, since it's short by definition */
+ data_length = VARSIZE_EXTERNAL(val);
+ else if (VARATT_IS_SHORT(val))
+ /* no alignment for short varlenas */
+ data_length = VARSIZE_SHORT(val);
+ else
+ data_length = VARSIZE(val);
+ }
+ else if (attr->attlen == -2)
+ {
+ /* cstring */
+ data_length = strlen(val) + 1;
+ }
+ else
+ {
+ /* fixed-length pass-by-reference */
+ data_length = attr->attlen;
+ }
+ data = MemoryContextAlloc(slot->tts_drowcxt, data_length);
+ memcpy(data, val, data_length);
+ slot->tts_values[i] = (Datum) data;
+ }
+#endif
}
}
pfree(buffer->data);
@@ -1224,7 +1311,9 @@ slot_deform_datarow(TupleTableSlot *slot)
slot->tts_nvalid = attnum;
+#ifndef XCP
MemoryContextSwitchTo(oldcontext);
+#endif
}
#endif
@@ -1279,7 +1368,11 @@ slot_getattr(TupleTableSlot *slot, int attnum, bool *isnull)
#ifdef PGXC
/* If it is a data row tuple extract all and return requested */
+#ifdef XCP
+ if (slot->tts_datarow)
+#else
if (slot->tts_dataRow)
+#endif
{
slot_deform_datarow(slot);
*isnull = slot->tts_isnull[attnum - 1];
@@ -1359,7 +1452,11 @@ slot_getallattrs(TupleTableSlot *slot)
#ifdef PGXC
/* Handle the DataRow tuple case */
+#ifdef XCP
+ if (slot->tts_datarow)
+#else
if (slot->tts_dataRow)
+#endif
{
slot_deform_datarow(slot);
return;
@@ -1411,7 +1508,11 @@ slot_getsomeattrs(TupleTableSlot *slot, int attnum)
#ifdef PGXC
/* Handle the DataRow tuple case */
+#ifdef XCP
+ if (slot->tts_datarow)
+#else
if (slot->tts_dataRow)
+#endif
{
slot_deform_datarow(slot);
return;
@@ -1487,7 +1588,11 @@ slot_attisnull(TupleTableSlot *slot, int attnum)
#ifdef PGXC
/* If it is a data row tuple extract all and return requested */
+#ifdef XCP
+ if (slot->tts_datarow)
+#else
if (slot->tts_dataRow)
+#endif
{
slot_deform_datarow(slot);
return slot->tts_isnull[attnum - 1];
diff --git a/src/backend/access/common/printtup.c b/src/backend/access/common/printtup.c
index a1a49f987d..10a45a3146 100644
--- a/src/backend/access/common/printtup.c
+++ b/src/backend/access/common/printtup.c
@@ -5,6 +5,11 @@
* clients and standalone backends are supported here).
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -314,12 +319,20 @@ printtup(TupleTableSlot *slot, DestReceiver *self)
* values, just send over the DataRow message as we received it from the
* Datanode
*/
+#ifdef XCP
+ if (slot->tts_datarow)
+ {
+ pq_putmessage('D', slot->tts_datarow->msg, slot->tts_datarow->msglen);
+ return;
+ }
+#else
if (slot->tts_dataRow)
{
pq_putmessage('D', slot->tts_dataRow, slot->tts_dataLen);
return;
}
#endif
+#endif
/* Set or update my derived attribute info, if needed */
if (myState->attrinfo != typeinfo || myState->nattrs != natts)
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 882fa6776b..d0dd340a71 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -23,6 +23,11 @@
* for aborts (whether sync or async), since the post-crash assumption would
* be that such transactions failed anyway.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -353,10 +358,21 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
* Current state change should be from 0 or subcommitted to target state
* or we should already be there when replaying changes during recovery.
*/
+#ifdef XCP
+ if (!(curval == 0 ||
+ (curval == TRANSACTION_STATUS_SUB_COMMITTED &&
+ status != TRANSACTION_STATUS_IN_PROGRESS) ||
+ curval == status))
+ {
+ elog(WARNING, "Unexpected clog condition. curval = %d, status = %d",
+ curval, status);
+ }
+#else
Assert(curval == 0 ||
(curval == TRANSACTION_STATUS_SUB_COMMITTED &&
status != TRANSACTION_STATUS_IN_PROGRESS) ||
curval == status);
+#endif
/* note this assumes exclusive access to the clog page */
byteval = *byteptr;
diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c
index b425ce434d..bcfc931b17 100644
--- a/src/backend/access/transam/gtm.c
+++ b/src/backend/access/transam/gtm.c
@@ -18,10 +18,24 @@
#include "utils/elog.h"
#include "miscadmin.h"
#include "pgxc/pgxc.h"
-
+#ifdef XCP
+#include "postgres.h"
+#include "gtm/gtm_c.h"
+#include "postmaster/autovacuum.h"
+#include "storage/backendid.h"
+#include "utils/lsyscache.h"
+
+/* To access sequences */
+#define MyCoordName \
+ OidIsValid(MyCoordId) ? get_pgxc_nodename(MyCoordId) : ""
+#endif
/* Configuration variables */
char *GtmHost = "localhost";
int GtmPort = 6666;
+#ifdef XCP
+bool IsXidFromGTM = false;
+#endif
+
extern bool FirstSnapshotSet;
static GTM_Conn *conn;
@@ -97,6 +111,11 @@ InitGTM(void)
CloseGTM();
}
+
+#ifdef XCP
+ else if (IS_PGXC_COORDINATOR)
+ register_session(conn, PGXCNodeName, MyProcPid, MyBackendId);
+#endif
}
void
@@ -136,6 +155,10 @@ BeginTranGTM(GTM_Timestamp *timestamp)
if (conn)
xid = begin_transaction(conn, GTM_ISOLATION_RC, timestamp);
}
+#ifdef XCP
+ if (xid)
+ IsXidFromGTM = true;
+#endif
currentGxid = xid;
return xid;
}
@@ -173,6 +196,10 @@ CommitTranGTM(GlobalTransactionId gxid)
if (!GlobalTransactionIdIsValid(gxid))
return 0;
CheckConnection();
+#ifdef XCP
+ ret = -1;
+ if (conn)
+#endif
ret = commit_transaction(conn, gxid);
/*
@@ -184,6 +211,10 @@ CommitTranGTM(GlobalTransactionId gxid)
{
CloseGTM();
InitGTM();
+#ifdef XCP
+ if (conn)
+ ret = commit_transaction(conn, gxid);
+#endif
}
/* Close connection in case commit is done by autovacuum worker or launcher */
@@ -206,6 +237,10 @@ CommitPreparedTranGTM(GlobalTransactionId gxid, GlobalTransactionId prepared_gxi
if (!GlobalTransactionIdIsValid(gxid) || !GlobalTransactionIdIsValid(prepared_gxid))
return ret;
CheckConnection();
+#ifdef XCP
+ ret = -1;
+ if (conn)
+#endif
ret = commit_prepared_transaction(conn, gxid, prepared_gxid);
/*
@@ -218,6 +253,10 @@ CommitPreparedTranGTM(GlobalTransactionId gxid, GlobalTransactionId prepared_gxi
{
CloseGTM();
InitGTM();
+#ifdef XCP
+ if (conn)
+ ret = commit_prepared_transaction(conn, gxid, prepared_gxid);
+#endif
}
currentGxid = InvalidGlobalTransactionId;
return ret;
@@ -244,6 +283,10 @@ RollbackTranGTM(GlobalTransactionId gxid)
{
CloseGTM();
InitGTM();
+#ifdef XCP
+ if (conn)
+ ret = abort_transaction(conn, gxid);
+#endif
}
currentGxid = InvalidGlobalTransactionId;
@@ -261,6 +304,10 @@ StartPreparedTranGTM(GlobalTransactionId gxid,
return 0;
CheckConnection();
+#ifdef XCP
+ ret = -1;
+ if (conn)
+#endif
ret = start_prepared_transaction(conn, gxid, gid, nodestring);
/*
@@ -272,6 +319,10 @@ StartPreparedTranGTM(GlobalTransactionId gxid,
{
CloseGTM();
InitGTM();
+#ifdef XCP
+ if (conn)
+ ret = start_prepared_transaction(conn, gxid, gid, nodestring);
+#endif
}
return ret;
@@ -285,6 +336,10 @@ PrepareTranGTM(GlobalTransactionId gxid)
if (!GlobalTransactionIdIsValid(gxid))
return 0;
CheckConnection();
+#ifdef XCP
+ ret = -1;
+ if (conn)
+#endif
ret = prepare_transaction(conn, gxid);
/*
@@ -296,6 +351,10 @@ PrepareTranGTM(GlobalTransactionId gxid)
{
CloseGTM();
InitGTM();
+#ifdef XCP
+ if (conn)
+ ret = prepare_transaction(conn, gxid);
+#endif
}
currentGxid = InvalidGlobalTransactionId;
return ret;
@@ -311,6 +370,10 @@ GetGIDDataGTM(char *gid,
int ret = 0;
CheckConnection();
+#ifdef XCP
+ ret = -1;
+ if (conn)
+#endif
ret = get_gid_data(conn, GTM_ISOLATION_RC, gid, gxid,
prepared_gxid, nodestring);
@@ -323,6 +386,11 @@ GetGIDDataGTM(char *gid,
{
CloseGTM();
InitGTM();
+#ifdef XCP
+ if (conn)
+ ret = get_gid_data(conn, GTM_ISOLATION_RC, gid, gxid,
+ prepared_gxid, nodestring);
+#endif
}
return ret;
@@ -339,6 +407,10 @@ GetSnapshotGTM(GlobalTransactionId gxid, bool canbe_grouped)
{
CloseGTM();
InitGTM();
+#ifdef XCP
+ if (conn)
+ ret_snapshot = get_snapshot(conn, gxid, canbe_grouped);
+#endif
}
return ret_snapshot;
}
@@ -374,26 +446,105 @@ AlterSequenceGTM(char *seqname, GTM_Sequence increment, GTM_Sequence minval,
return conn ? alter_sequence(conn, &seqkey, increment, minval, maxval, startval, lastval, cycle, is_restart) : 0;
}
+/*
+ * get the current sequence value
+ */
+
+GTM_Sequence
+GetCurrentValGTM(char *seqname)
+{
+ GTM_Sequence ret = -1;
+ GTM_SequenceKeyData seqkey;
+#ifdef XCP
+ char *coordName = IS_PGXC_COORDINATOR ? PGXCNodeName : MyCoordName;
+ int coordPid = IS_PGXC_COORDINATOR ? MyProcPid : MyCoordPid;
+ int status;
+#endif
+ CheckConnection();
+ seqkey.gsk_keylen = strlen(seqname) + 1;
+ seqkey.gsk_key = seqname;
+
+#ifdef XCP
+ if (conn)
+ status = get_current(conn, &seqkey, coordName, coordPid, &ret);
+ else
+ status = GTM_RESULT_COMM_ERROR;
+
+ /* retry once */
+ if (status == GTM_RESULT_COMM_ERROR)
+ {
+ CloseGTM();
+ InitGTM();
+ if (conn)
+ status = get_current(conn, &seqkey, coordName, coordPid, &ret);
+ }
+ if (status != GTM_RESULT_OK)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("%s", GTMPQerrorMessage(conn))));
+#else
+ if (conn)
+ ret = get_current(conn, &seqkey);
+
+ if (ret < 0)
+ {
+ CloseGTM();
+ InitGTM();
+ }
+#endif
+ return ret;
+}
/*
* Get the next sequence value
*/
GTM_Sequence
+#ifdef XCP
+GetNextValGTM(char *seqname, GTM_Sequence range, GTM_Sequence *rangemax)
+#else
GetNextValGTM(char *seqname)
+#endif
{
GTM_Sequence ret = -1;
GTM_SequenceKeyData seqkey;
+#ifdef XCP
+ char *coordName = IS_PGXC_COORDINATOR ? PGXCNodeName : MyCoordName;
+ int coordPid = IS_PGXC_COORDINATOR ? MyProcPid : MyCoordPid;
+ int status;
+#endif
CheckConnection();
seqkey.gsk_keylen = strlen(seqname) + 1;
seqkey.gsk_key = seqname;
+#ifdef XCP
+ if (conn)
+ status = get_next(conn, &seqkey, coordName,
+ coordPid, range, &ret, rangemax);
+ else
+ status = GTM_RESULT_COMM_ERROR;
+
+ /* retry once */
+ if (status == GTM_RESULT_COMM_ERROR)
+ {
+ CloseGTM();
+ InitGTM();
+ if (conn)
+ status = get_next(conn, &seqkey, coordName, coordPid,
+ range, &ret, rangemax);
+ }
+ if (status != GTM_RESULT_OK)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("%s", GTMPQerrorMessage(conn))));
+#else
if (conn)
- ret = get_next(conn, &seqkey);
+ ret = get_next(conn, &seqkey);
if (ret < 0)
{
CloseGTM();
InitGTM();
}
+#endif
return ret;
}
@@ -404,11 +555,19 @@ int
SetValGTM(char *seqname, GTM_Sequence nextval, bool iscalled)
{
GTM_SequenceKeyData seqkey;
+#ifdef XCP
+ char *coordName = IS_PGXC_COORDINATOR ? PGXCNodeName : MyCoordName;
+ int coordPid = IS_PGXC_COORDINATOR ? MyProcPid : MyCoordPid;
+#endif
CheckConnection();
seqkey.gsk_keylen = strlen(seqname) + 1;
seqkey.gsk_key = seqname;
+#ifdef XCP
+ return conn ? set_val(conn, &seqkey, coordName, coordPid, nextval, iscalled) : -1;
+#else
return conn ? set_val(conn, &seqkey, nextval, iscalled) : -1;
+#endif
}
/*
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index 768781edf9..5fe2369344 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -3,6 +3,11 @@
* varsup.c
* postgres OID & XID variables support routines
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Copyright (c) 2000-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
@@ -72,6 +77,33 @@ GetForceXidFromGTM(void)
}
#endif /* PGXC */
+
+#ifdef XCP
+/*
+ * Check if GlobalTransactionId associated with the current distributed session
+ * equals the specified xid.
+ * It is for tuple visibility checks in secondary datanode sessions, which are
+ * not associating next_xid with the current transaction.
+ */
+bool
+TransactionIdIsCurrentGlobalTransactionId(TransactionId xid)
+{
+ return TransactionIdIsValid(next_xid) && TransactionIdEquals(xid, next_xid);
+}
+
+
+/*
+ * Returns GlobalTransactionId associated with the current distributed session
+ * without assigning it to the transaction.
+ */
+TransactionId
+GetNextTransactionId(void)
+{
+ return next_xid;
+}
+#endif
+
+
/*
* Allocate the next XID for a new transaction or subtransaction.
*
@@ -87,15 +119,17 @@ TransactionId
GetNewTransactionId(bool isSubXact, bool *timestamp_received, GTM_Timestamp *timestamp)
#else
GetNewTransactionId(bool isSubXact)
-#endif
+#endif /* PGXC */
{
TransactionId xid;
#ifdef PGXC
bool increment_xid = true;
-
*timestamp_received = false;
+#ifdef XCP
+ /* Will be set if we obtain from GTM */
+ IsXidFromGTM = false;
#endif
-
+#endif /* PGXC */
/*
* During bootstrap initialization, we return the special bootstrap
* transaction id.
@@ -128,7 +162,7 @@ GetNewTransactionId(bool isSubXact)
xid = (TransactionId) BeginTranGTM(timestamp);
*timestamp_received = true;
}
-#endif
+#endif /* PGXC */
LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
@@ -165,8 +199,16 @@ GetNewTransactionId(bool isSubXact)
}
}
else if(IS_PGXC_DATANODE || IsConnFromCoord())
- {
+ {
+#ifdef XCP
+ /*
+ * (IS_PGXC_DATANODE && IsInitProcessingMode() && IsPostmasterEnvironment)
+ * handles new connections, ensures XID is consumed then, but not during initdb
+ */
+ if (IsAutoVacuumWorkerProcess() || IsAutoVacuumLauncherProcess() || (IS_PGXC_DATANODE && IsInitProcessingMode() && IsPostmasterEnvironment))
+#else
if (IsAutoVacuumWorkerProcess())
+#endif
{
/*
* For an autovacuum worker process, get transaction ID directly from GTM.
@@ -180,7 +222,7 @@ GetNewTransactionId(bool isSubXact)
next_xid = (TransactionId) BeginTranGTM(timestamp);
}
else if (GetForceXidFromGTM())
- {
+ {
elog (DEBUG1, "Force get XID from GTM");
/* try and get gxid directly from GTM */
next_xid = (TransactionId) BeginTranGTM(NULL);
@@ -204,12 +246,16 @@ GetNewTransactionId(bool isSubXact)
}
else
ShmemVariableCache->nextXid = xid;
- }
- else
+ }
+ else
{
- /* Fallback to default */
- elog(LOG, "Falling back to local Xid. Was = %d, now is = %d",
- next_xid, ShmemVariableCache->nextXid);
+ if (IsConnFromCoord())
+ {
+ elog(ERROR, "Coordinator has not provided xid for the command");
+ }
+ /* Fallback to default, needed for initdb */
+ elog(LOG, "Falling back to local Xid. Was = %d, now is = %d. autovacLaunch = %d",
+ next_xid, ShmemVariableCache->nextXid, IsAutoVacuumLauncherProcess());
xid = ShmemVariableCache->nextXid;
}
}
@@ -217,7 +263,6 @@ GetNewTransactionId(bool isSubXact)
xid = ShmemVariableCache->nextXid;
#endif /* PGXC */
-
/*----------
* Check to see if it's safe to assign another XID. This protects against
* catastrophic data loss due to XID wraparound. The basic rules are:
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index d55005f43f..055bd97f18 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -5,6 +5,11 @@
*
* See src/backend/access/transam/README for more information.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -24,10 +29,12 @@
#ifdef PGXC
#include "pgxc/pgxc.h"
#include "access/gtm.h"
-#include "pgxc/xc_maintenance_mode.h"
/* PGXC_COORD */
#include "gtm/gtm_c.h"
#include "pgxc/execRemote.h"
+#ifdef XCP
+#include "pgxc/pause.h"
+#endif
/* PGXC_DATANODE */
#include "postmaster/autovacuum.h"
#include "libpq/pqformat.h"
@@ -67,6 +74,11 @@
#include "pg_trace.h"
+#ifdef XCP
+#define implicit2PC_head "_$XC$"
+#endif
+
+
/*
* User-tweakable parameters
*/
@@ -148,8 +160,10 @@ typedef struct TransactionStateData
GlobalTransactionId transactionId;
GlobalTransactionId topGlobalTransansactionId;
GlobalTransactionId auxilliaryTransactionId;
+#ifndef XCP
bool isLocalParameterUsed; /* Check if a local parameter is active
* in transaction block (SET LOCAL, DEFERRED) */
+#endif
#else
TransactionId transactionId; /* my XID, or Invalid if none */
#endif
@@ -184,7 +198,9 @@ static TransactionStateData TopTransactionStateData = {
0, /* global transaction id */
0, /* prepared global transaction id */
0, /* commit prepared global transaction id */
+#ifndef XCP
false, /* isLocalParameterUsed */
+#endif
#else
0, /* transaction id */
#endif
@@ -270,6 +286,9 @@ static TimestampTz GTMdeltaTimestamp = 0;
*/
static char *prepareGID;
static char *savePrepareGID;
+#ifdef XCP
+static char *saveNodeString = NULL;
+#endif
static bool XactLocalNodePrepared;
static bool XactReadLocalNode;
static bool XactWriteLocalNode;
@@ -455,6 +474,15 @@ GetCurrentTransactionId(void)
{
TransactionState s = CurrentTransactionState;
+#ifdef XCP
+ /*
+ * Never assign an xid to the secondary session; that causes conflicts when
+ * writing to the clog at the transaction end.
+ */
+ if (IsConnFromDatanode())
+ return GetNextTransactionId();
+#endif
+
if (!TransactionIdIsValid(s->transactionId))
AssignTransactionId(s);
return s->transactionId;
@@ -501,6 +529,7 @@ GetStableLatestTransactionId(void)
}
#ifdef PGXC
+#ifndef XCP
/*
* GetCurrentLocalParamStatus
*
@@ -524,6 +553,7 @@ SetCurrentLocalParamStatus(bool status)
CurrentTransactionState->isLocalParameterUsed = status;
}
#endif
+#endif
/*
* AssignTransactionId
@@ -595,7 +625,7 @@ AssignTransactionId(TransactionState s)
}
#else
s->transactionId = GetNewTransactionId(isSubXact);
-#endif
+#endif /* PGXC */
if (isSubXact)
SubTransSetParent(s->transactionId, s->parent->transactionId, false);
@@ -722,6 +752,7 @@ GetCurrentSubTransactionId(void)
return s->subTransactionId;
}
+
/*
* GetCurrentCommandId
*
@@ -735,7 +766,11 @@ GetCurrentCommandId(bool used)
{
#ifdef PGXC
/* If coordinator has sent a command id, remote node should use it */
+#ifdef XCP
+ if (isCommandIdReceived)
+#else
if (IsConnFromCoord() && isCommandIdReceived)
+#endif
{
/*
* Indicate to successive calls of this function that the sent command id has
@@ -910,6 +945,16 @@ TransactionIdIsCurrentTransactionId(TransactionId xid)
if (!TransactionIdIsNormal(xid))
return false;
+#ifdef XCP
+ /*
+ * The current TransactionId of secondary datanode session is never
+ * associated with the current transaction, so if it is a secondary
+ * Datanode session look into xid sent from the parent.
+ */
+ if (IsConnFromDatanode() && TransactionIdIsCurrentGlobalTransactionId(xid))
+ return true;
+#endif
+
/*
* We will return true for the Xid of the current subtransaction, any of
* its subcommitted children, any of its parents, or any of their
@@ -1968,8 +2013,10 @@ StartTransaction(void)
*/
s->state = TRANS_START;
#ifdef PGXC
+#ifndef XCP
s->isLocalParameterUsed = false;
#endif
+#endif
s->transactionId = InvalidTransactionId; /* until assigned */
/*
* Make sure we've reset xact state variables
@@ -1990,8 +2037,7 @@ StartTransaction(void)
XactReadOnly = DefaultXactReadOnly;
#ifdef PGXC
/* Save Postgres-XC session as read-only if necessary */
- if (!xc_maintenance_mode)
- XactReadOnly |= IsPGXCNodeXactReadOnly();
+ XactReadOnly |= IsPGXCNodeXactReadOnly();
#endif
}
XactDeferrable = DefaultXactDeferrable;
@@ -2153,6 +2199,15 @@ CommitTransaction(void)
savePrepareGID = NULL;
}
+#ifdef XCP
+ if (saveNodeString)
+ {
+ pfree(saveNodeString);
+ saveNodeString = NULL;
+ }
+#endif
+
+#ifndef XCP
/*
* Check if there are any ON COMMIT actions or if temporary objects are in use.
* If session is set-up to enforce 2PC for such transactions, return an error.
@@ -2168,16 +2223,28 @@ CommitTransaction(void)
errmsg("cannot PREPARE a transaction that has operated on temporary tables"),
errdetail("Disabling enforce_two_phase_commit is recommended to enforce COMMIT")));
}
+#endif
/*
* If the local node has done some write activity, prepare the local node
* first. If that fails, the transaction is aborted on all the remote
* nodes
*/
+#ifdef XCP
+ /*
+ * Fired OnCommit actions would fail 2PC process
+ */
+ if (!IsOnCommitActions() && IsTwoPhaseCommitRequired(XactWriteLocalNode))
+#else
if (IsTwoPhaseCommitRequired(XactWriteLocalNode))
+#endif
{
prepareGID = MemoryContextAlloc(TopTransactionContext, 256);
+#ifdef XCP
+ sprintf(prepareGID, implicit2PC_head"%u", GetTopTransactionId());
+#else
sprintf(prepareGID, "T%u", GetTopTransactionId());
+#endif
savePrepareGID = MemoryContextStrdup(TopMemoryContext, prepareGID);
@@ -2205,7 +2272,14 @@ CommitTransaction(void)
s->auxilliaryTransactionId = GetTopTransactionId();
}
else
+#ifdef XCP
+ {
+ s->auxilliaryTransactionId = InvalidGlobalTransactionId;
+ PrePrepare_Remote(prepareGID, false, true);
+ }
+#else
s->auxilliaryTransactionId = InvalidGlobalTransactionId;
+#endif
}
}
#endif
@@ -2266,14 +2340,21 @@ CommitTransaction(void)
PreCommit_Notify();
#ifdef PGXC
+#ifdef XCP
+ if (IS_PGXC_DATANODE || !IsConnFromCoord())
+#else
if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
+#endif
{
/*
* Now run 2PC on the remote nodes. Any errors will be reported via
* ereport and we will run error recovery as part of AbortTransaction
*/
+#ifdef XCP
+ PreCommit_Remote(savePrepareGID, saveNodeString, XactLocalNodePrepared);
+#else
PreCommit_Remote(savePrepareGID, XactLocalNodePrepared);
-
+#endif
/*
* Now that all the remote nodes have successfully prepared and
* commited, commit the local transaction as well. Remember, any errors
@@ -2392,6 +2473,16 @@ CommitTransaction(void)
AtEOXact_MultiXact();
+#ifdef XCP
+ /* If the cluster lock was held at commit time, keep it locked! */
+ if (cluster_ex_lock_held)
+ {
+ elog(DEBUG2, "PAUSE CLUSTER still held at commit");
+ /*if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
+ RequestClusterPause(false, NULL);*/
+ }
+#endif
+
ResourceOwnerRelease(TopTransactionResourceOwner,
RESOURCE_RELEASE_LOCKS,
true, true);
@@ -2436,7 +2527,9 @@ CommitTransaction(void)
s->maxChildXids = 0;
#ifdef PGXC
+#ifndef XCP
s->isLocalParameterUsed = false;
+#endif
ForgetTransactionLocalNode();
/*
@@ -2506,6 +2599,27 @@ AtEOXact_GlobalTxn(bool commit)
RollbackTranGTM(s->topGlobalTransansactionId);
}
}
+#ifdef XCP
+ /*
+ * If GTM is connected the current gxid is acquired from GTM directly.
+ * So directly report transaction end. However this applies only if
+ * the connection is directly from a client.
+ */
+ else if (IsXidFromGTM)
+ {
+ IsXidFromGTM = false;
+ if (commit)
+ CommitTranGTM(s->topGlobalTransansactionId);
+ else
+ RollbackTranGTM(s->topGlobalTransansactionId);
+
+ if (IsGTMConnected() &&
+ !IsConnFromCoord() && !IsConnFromDatanode())
+ {
+ CloseGTM();
+ }
+ }
+#else
else if (IS_PGXC_DATANODE || IsConnFromCoord())
{
/* If we are autovacuum, commit on GTM */
@@ -2525,7 +2639,7 @@ AtEOXact_GlobalTxn(bool commit)
RollbackTranGTM(currentGxid);
}
}
-
+#endif
s->topGlobalTransansactionId = InvalidGlobalTransactionId;
s->auxilliaryTransactionId = InvalidGlobalTransactionId;
@@ -2552,8 +2666,10 @@ PrepareTransaction(void)
TimestampTz prepared_at;
#ifdef PGXC
bool isImplicit = !(s->blockState == TBLOCK_PREPARE);
+#ifndef XCP
char *nodestring = NULL;
#endif
+#endif
ShowTransactionState("PrepareTransaction");
@@ -2566,6 +2682,7 @@ PrepareTransaction(void)
Assert(s->parent == NULL);
#ifdef PGXC
+#ifndef XCP
if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
{
if (savePrepareGID)
@@ -2581,6 +2698,7 @@ PrepareTransaction(void)
CallGTMCallbacks(GTM_EVENT_PREPARE);
}
#endif
+#endif
/*
* Do pre-commit processing that involves calling user-defined code, such
@@ -2604,6 +2722,35 @@ PrepareTransaction(void)
break;
}
+#ifdef XCP
+ /*
+ * Remote nodes must be done AFTER portals. If portal is still active it may
+ * need to send down a message to close remote objects on Datanode, but
+ * PrePrepare_Remote releases connections to remote nodes.
+ */
+ if (IS_PGXC_DATANODE || !IsConnFromCoord())
+ {
+ char *nodestring;
+ if (saveNodeString)
+ pfree(saveNodeString);
+
+ /* Needed in PrePrepare_Remote to submit nodes to GTM */
+ s->topGlobalTransansactionId = s->transactionId;
+ if (savePrepareGID)
+ pfree(savePrepareGID);
+ savePrepareGID = MemoryContextStrdup(TopMemoryContext, prepareGID);
+ nodestring = PrePrepare_Remote(savePrepareGID, XactWriteLocalNode, isImplicit);
+ if (nodestring)
+ saveNodeString = MemoryContextStrdup(TopMemoryContext, nodestring);
+
+ /*
+ * Callback on GTM if necessary, this needs to be done before HOLD_INTERRUPTS
+ * as this is not a part of the end of transaction processing involving clean up.
+ */
+ CallGTMCallbacks(GTM_EVENT_PREPARE);
+ }
+#endif
+
/*
* The remaining actions cannot call any user-defined code, so it's safe
* to start shutting down within-transaction services. But note that most
@@ -2827,7 +2974,11 @@ PrepareTransaction(void)
*/
if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
{
+#ifdef XCP
+ PostPrepare_Remote(savePrepareGID, isImplicit);
+#else
PostPrepare_Remote(savePrepareGID, nodestring, isImplicit);
+#endif
if (!isImplicit)
s->topGlobalTransansactionId = InvalidGlobalTransactionId;
ForgetTransactionLocalNode();
@@ -5798,7 +5949,9 @@ IsTransactionLocalNode(bool write)
bool
IsXidImplicit(const char *xid)
{
+#ifndef XCP
#define implicit2PC_head "_$XC$"
+#endif
const size_t implicit2PC_head_len = strlen(implicit2PC_head);
if (strncmp(xid, implicit2PC_head, implicit2PC_head_len))
@@ -5820,7 +5973,9 @@ SaveReceivedCommandId(CommandId cid)
* Change command ID information status to report any changes in remote ID
* for a remote node. A new command ID has also been received.
*/
+#ifndef XCP
if (IsConnFromCoord())
+#endif
{
SetSendCommandId(true);
isCommandIdReceived = true;
@@ -5899,9 +6054,9 @@ IsPGXCNodeXactReadOnly(void)
* For the time being a Postgres-XC session is read-only
* under very specific conditions.
* This is the case of an application accessing directly
- * a Datanode.
+ * a Datanode provided the server was not started in restore mode.
*/
- return IsPGXCNodeXactDatanodeDirect();
+ return IsPGXCNodeXactDatanodeDirect() && !isRestoreMode;
}
/*
@@ -5929,6 +6084,9 @@ IsPGXCNodeXactDatanodeDirect(void)
(IsPostmasterEnvironment || !useLocalXid) &&
IsNormalProcessingMode() &&
!IsAutoVacuumLauncherProcess() &&
+#ifdef XCP
+ !IsConnFromDatanode() &&
+#endif
!IsConnFromCoord();
}
#endif
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 841ebec63b..03ff91e046 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -4,6 +4,11 @@
* routines to support running postgres in 'bootstrap' mode
* bootstrap mode is used to create the initial template database
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile
index fb3ca97994..ec884289cb 100644
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -72,6 +72,7 @@ install-data: $(BKIFILES) installdirs
$(INSTALL_DATA) $(call vpathsearch,postgres.shdescription) '$(DESTDIR)$(datadir)/postgres.shdescription'
$(INSTALL_DATA) $(srcdir)/system_views.sql '$(DESTDIR)$(datadir)/system_views.sql'
$(INSTALL_DATA) $(srcdir)/information_schema.sql '$(DESTDIR)$(datadir)/information_schema.sql'
+ $(INSTALL_DATA) $(srcdir)/storm_catalog.sql '$(DESTDIR)$(datadir)/storm_catalog.sql'
$(INSTALL_DATA) $(srcdir)/sql_features.txt '$(DESTDIR)$(datadir)/sql_features.txt'
installdirs:
diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c
index 96aba2e0ae..c13a3f39f6 100644
--- a/src/backend/catalog/catalog.c
+++ b/src/backend/catalog/catalog.c
@@ -5,6 +5,11 @@
* bits of hard-wired knowledge
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -602,6 +607,11 @@ GetNewRelFileNode(Oid reltablespace, Relation pg_class, char relpersistence)
switch (relpersistence)
{
case RELPERSISTENCE_TEMP:
+#ifdef XCP
+ if (OidIsValid(MyCoordId))
+ backend = MyFirstBackendId;
+ else
+#endif
backend = MyBackendId;
break;
case RELPERSISTENCE_UNLOGGED:
diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c
index 29f324369e..c6637e0f71 100644
--- a/src/backend/catalog/dependency.c
+++ b/src/backend/catalog/dependency.c
@@ -4,6 +4,11 @@
* Routines to support inter-object dependencies.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -446,7 +451,7 @@ performRename(const ObjectAddress *object, const char *oldname, const char *newn
NULL, /* empty stack */
targetObjects,
NULL,
- depRel);
+ &depRel);
/* Check Objects one by one to see if some of them have to be renamed on GTM */
for (i = 0; i < targetObjects->numrefs; i++)
@@ -1264,6 +1269,7 @@ doDeletion(const ObjectAddress *object, int flags)
break;
case RELKIND_RELATION:
case RELKIND_VIEW:
+#ifndef XCP
/*
* Flag temporary objects in use in case a temporary table or view
* is dropped by dependency. This check is particularly useful with
@@ -1273,6 +1279,7 @@ doDeletion(const ObjectAddress *object, int flags)
*/
if (IsTempTable(object->objectId))
ExecSetTempObjectIncluded();
+#endif
break;
default:
break;
diff --git a/src/backend/catalog/genbki.pl b/src/backend/catalog/genbki.pl
index e1e5374884..adb3d81cec 100644
--- a/src/backend/catalog/genbki.pl
+++ b/src/backend/catalog/genbki.pl
@@ -220,7 +220,7 @@ foreach my $catname ( @{ $catalogs->{names} } )
{cmax => 'cid'},
{tableoid => 'oid'}
#PGXC_BEGIN
- ,{xc_node_id => 'int4'}
+ ,{ xc_node_id => 'int4' }
#PGXC_END
);
foreach my $attr (@SYS_ATTRS)
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index e5d05418bd..6741f90a3f 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -77,6 +77,7 @@
#include "pgxc/nodemgr.h"
#include "pgxc/pgxc.h"
#include "pgxc/pgxcnode.h"
+#include "pgxc/postgresql_fdw.h"
#endif
@@ -1033,7 +1034,7 @@ GetRelationDistributionItems(Oid relid,
for (i = 0; i < descriptor->natts; i++)
{
attr = descriptor->attrs[i];
- if (IsTypeDistributable(attr->atttypid))
+ if (IsTypeHashDistributable(attr->atttypid))
{
/* distribute on this column */
local_attnum = i + 1;
@@ -1065,7 +1066,7 @@ GetRelationDistributionItems(Oid relid,
errmsg("Invalid distribution column specified")));
}
- if (!IsTypeDistributable(descriptor->attrs[local_attnum - 1]->atttypid))
+ if (!IsTypeHashDistributable(descriptor->attrs[local_attnum - 1]->atttypid))
{
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
@@ -1088,7 +1089,7 @@ GetRelationDistributionItems(Oid relid,
errmsg("Invalid distribution column specified")));
}
- if (!IsTypeDistributable(descriptor->attrs[local_attnum - 1]->atttypid))
+ if (!IsTypeModuloDistributable(descriptor->attrs[local_attnum - 1]->atttypid))
{
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c
index 997a6f35cd..dd5a9da65b 100644
--- a/src/backend/catalog/namespace.c
+++ b/src/backend/catalog/namespace.c
@@ -9,6 +9,11 @@
* and implementing search-path-controlled searches.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -47,6 +52,9 @@
#include "parser/parse_func.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
+#ifdef XCP
+#include "storage/proc.h"
+#endif
#include "storage/sinval.h"
#include "utils/acl.h"
#include "utils/builtins.h"
@@ -196,6 +204,9 @@ static void RemoveTempRelationsCallback(int code, Datum arg);
static void NamespaceCallback(Datum arg, int cacheid, uint32 hashvalue);
static bool MatchNamedCall(HeapTuple proctup, int nargs, List *argnames,
int **argnumbers);
+#ifdef XCP
+static void FindTemporaryNamespace(void);
+#endif
/* These don't really need to appear in any header file */
Datum pg_table_is_visible(PG_FUNCTION_ARGS);
@@ -704,7 +715,11 @@ RelationIsVisible(Oid relid)
* list_member_oid() for them.
*/
relnamespace = relform->relnamespace;
+#ifdef XCP
+ if (relnamespace != PG_CATALOG_NAMESPACE && relnamespace != STORM_CATALOG_NAMESPACE &&
+#else
if (relnamespace != PG_CATALOG_NAMESPACE &&
+#endif
!list_member_oid(activeSearchPath, relnamespace))
visible = false;
else
@@ -799,7 +814,11 @@ TypeIsVisible(Oid typid)
* list_member_oid() for them.
*/
typnamespace = typform->typnamespace;
+#ifdef XCP
+ if (typnamespace != PG_CATALOG_NAMESPACE && typnamespace != STORM_CATALOG_NAMESPACE &&
+#else
if (typnamespace != PG_CATALOG_NAMESPACE &&
+#endif
!list_member_oid(activeSearchPath, typnamespace))
visible = false;
else
@@ -1389,7 +1408,11 @@ FunctionIsVisible(Oid funcid)
* list_member_oid() for them.
*/
pronamespace = procform->pronamespace;
+#ifdef XCP
+ if (pronamespace != PG_CATALOG_NAMESPACE && pronamespace != STORM_CATALOG_NAMESPACE &&
+#else
if (pronamespace != PG_CATALOG_NAMESPACE &&
+#endif
!list_member_oid(activeSearchPath, pronamespace))
visible = false;
else
@@ -1713,7 +1736,11 @@ OperatorIsVisible(Oid oprid)
* list_member_oid() for them.
*/
oprnamespace = oprform->oprnamespace;
+#ifdef XCP
+ if (oprnamespace != PG_CATALOG_NAMESPACE && oprnamespace != STORM_CATALOG_NAMESPACE &&
+#else
if (oprnamespace != PG_CATALOG_NAMESPACE &&
+#endif
!list_member_oid(activeSearchPath, oprnamespace))
visible = false;
else
@@ -1799,7 +1826,11 @@ OpclassIsVisible(Oid opcid)
* list_member_oid() for them.
*/
opcnamespace = opcform->opcnamespace;
+#ifdef XCP
+ if (opcnamespace != PG_CATALOG_NAMESPACE && opcnamespace != STORM_CATALOG_NAMESPACE &&
+#else
if (opcnamespace != PG_CATALOG_NAMESPACE &&
+#endif
!list_member_oid(activeSearchPath, opcnamespace))
visible = false;
else
@@ -1882,7 +1913,11 @@ OpfamilyIsVisible(Oid opfid)
* list_member_oid() for them.
*/
opfnamespace = opfform->opfnamespace;
+#ifdef XCP
+ if (opfnamespace != PG_CATALOG_NAMESPACE && opfnamespace != STORM_CATALOG_NAMESPACE &&
+#else
if (opfnamespace != PG_CATALOG_NAMESPACE &&
+#endif
!list_member_oid(activeSearchPath, opfnamespace))
visible = false;
else
@@ -1972,7 +2007,11 @@ CollationIsVisible(Oid collid)
* list_member_oid() for them.
*/
collnamespace = collform->collnamespace;
+#ifdef XCP
+ if (collnamespace != PG_CATALOG_NAMESPACE && collnamespace != STORM_CATALOG_NAMESPACE &&
+#else
if (collnamespace != PG_CATALOG_NAMESPACE &&
+#endif
!list_member_oid(activeSearchPath, collnamespace))
visible = false;
else
@@ -2054,7 +2093,11 @@ ConversionIsVisible(Oid conid)
* list_member_oid() for them.
*/
connamespace = conform->connamespace;
+#ifdef XCP
+ if (connamespace != PG_CATALOG_NAMESPACE && connamespace != STORM_CATALOG_NAMESPACE &&
+#else
if (connamespace != PG_CATALOG_NAMESPACE &&
+#endif
!list_member_oid(activeSearchPath, connamespace))
visible = false;
else
@@ -2156,7 +2199,11 @@ TSParserIsVisible(Oid prsId)
* list_member_oid() for them.
*/
namespace = form->prsnamespace;
+#ifdef XCP
+ if (namespace != PG_CATALOG_NAMESPACE && namespace != STORM_CATALOG_NAMESPACE &&
+#else
if (namespace != PG_CATALOG_NAMESPACE &&
+#endif
!list_member_oid(activeSearchPath, namespace))
visible = false;
else
@@ -2280,7 +2327,11 @@ TSDictionaryIsVisible(Oid dictId)
* list_member_oid() for them.
*/
namespace = form->dictnamespace;
+#ifdef XCP
+ if (namespace != PG_CATALOG_NAMESPACE && namespace != STORM_CATALOG_NAMESPACE &&
+#else
if (namespace != PG_CATALOG_NAMESPACE &&
+#endif
!list_member_oid(activeSearchPath, namespace))
visible = false;
else
@@ -2403,7 +2454,11 @@ TSTemplateIsVisible(Oid tmplId)
* list_member_oid() for them.
*/
namespace = form->tmplnamespace;
+#ifdef XCP
+ if (namespace != PG_CATALOG_NAMESPACE && namespace != STORM_CATALOG_NAMESPACE &&
+#else
if (namespace != PG_CATALOG_NAMESPACE &&
+#endif
!list_member_oid(activeSearchPath, namespace))
visible = false;
else
@@ -2527,7 +2582,11 @@ TSConfigIsVisible(Oid cfgid)
* list_member_oid() for them.
*/
namespace = form->cfgnamespace;
+#ifdef XCP
+ if (namespace != PG_CATALOG_NAMESPACE && namespace != STORM_CATALOG_NAMESPACE &&
+#else
if (namespace != PG_CATALOG_NAMESPACE &&
+#endif
!list_member_oid(activeSearchPath, namespace))
visible = false;
else
@@ -2639,13 +2698,22 @@ LookupNamespaceNoError(const char *nspname)
{
if (OidIsValid(myTempNamespace))
return myTempNamespace;
-
+#ifdef XCP
+ /*
+ * Try to find temporary namespace created by other backend of
+ * the same distributed session. If not found myTempNamespace will
+ * be InvalidOid, that is correct result.
+ */
+ FindTemporaryNamespace();
+ return myTempNamespace;
+#else
/*
* Since this is used only for looking up existing objects, there is
* no point in trying to initialize the temp namespace here; and doing
* so might create problems for some callers. Just report "not found".
*/
return InvalidOid;
+#endif
}
return get_namespace_oid(nspname, true);
@@ -2670,6 +2738,16 @@ LookupExplicitNamespace(const char *nspname)
if (OidIsValid(myTempNamespace))
return myTempNamespace;
+#ifdef XCP
+ /*
+ * Try to find temporary namespace created by other backend of
+ * the same distributed session.
+ */
+ FindTemporaryNamespace();
+ if (OidIsValid(myTempNamespace))
+ return myTempNamespace;
+#endif
+
/*
* Since this is used only for looking up existing objects, there is
* no point in trying to initialize the temp namespace here; and doing
@@ -3068,7 +3146,16 @@ GetOverrideSearchPath(MemoryContext context)
result->addTemp = true;
else
{
+#ifdef XCP
+ /*
+ * The while loop assumes that you can only have one catalog schema
+ * in the namespace. Not quite..
+ */
+ Assert(linitial_oid(schemas) == STORM_CATALOG_NAMESPACE ||
+ linitial_oid(schemas) == PG_CATALOG_NAMESPACE);
+#else
Assert(linitial_oid(schemas) == PG_CATALOG_NAMESPACE);
+#endif
result->addCatalog = true;
}
schemas = list_delete_first(schemas);
@@ -3145,7 +3232,14 @@ PushOverrideSearchPath(OverrideSearchPath *newpath)
* permissions for these.
*/
if (newpath->addCatalog)
+#ifdef XCP
+ {
+ oidlist = lcons_oid(PG_CATALOG_NAMESPACE, oidlist);
+ oidlist = lcons_oid(STORM_CATALOG_NAMESPACE, oidlist);
+ }
+#else
oidlist = lcons_oid(PG_CATALOG_NAMESPACE, oidlist);
+#endif
if (newpath->addTemp && OidIsValid(myTempNamespace))
oidlist = lcons_oid(myTempNamespace, oidlist);
@@ -3472,6 +3566,11 @@ recomputeNamespacePath(void)
if (!list_member_oid(oidlist, PG_CATALOG_NAMESPACE))
oidlist = lcons_oid(PG_CATALOG_NAMESPACE, oidlist);
+#ifdef XCP
+ if (!list_member_oid(oidlist, STORM_CATALOG_NAMESPACE))
+ oidlist = lcons_oid(STORM_CATALOG_NAMESPACE, oidlist);
+#endif
+
if (OidIsValid(myTempNamespace) &&
!list_member_oid(oidlist, myTempNamespace))
oidlist = lcons_oid(myTempNamespace, oidlist);
@@ -3550,6 +3649,16 @@ InitTempTableNamespace(void)
(errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION),
errmsg("cannot create temporary tables during recovery")));
+#ifdef XCP
+ /*
+ * In case of distributed session use MyFirstBackendId for temp objects
+ */
+ if (OidIsValid(MyCoordId))
+ snprintf(namespaceName, sizeof(namespaceName), "pg_temp_%d",
+ MyFirstBackendId);
+ else
+ /* fallback to default */
+#endif
snprintf(namespaceName, sizeof(namespaceName), "pg_temp_%d", MyBackendId);
namespaceId = get_namespace_oid(namespaceName, true);
@@ -3582,6 +3691,16 @@ InitTempTableNamespace(void)
* it. (We assume there is no need to clean it out if it does exist, since
* dropping a parent table should make its toast table go away.)
*/
+#ifdef XCP
+ /*
+ * In case of distributed session use MyFirstBackendId for temp objects
+ */
+ if (OidIsValid(MyCoordId))
+ snprintf(namespaceName, sizeof(namespaceName), "pg_toast_temp_%d",
+ MyFirstBackendId);
+ else
+ /* fallback to default */
+#endif
snprintf(namespaceName, sizeof(namespaceName), "pg_toast_temp_%d",
MyBackendId);
@@ -3604,6 +3723,9 @@ InitTempTableNamespace(void)
/* It should not be done already. */
AssertState(myTempNamespaceSubID == InvalidSubTransactionId);
+#ifdef XCP
+ if (!OidIsValid(MyCoordId))
+#endif
myTempNamespaceSubID = GetCurrentSubTransactionId();
baseSearchPathValid = false; /* need to rebuild list */
@@ -3626,7 +3748,20 @@ AtEOXact_Namespace(bool isCommit)
if (myTempNamespaceSubID != InvalidSubTransactionId)
{
if (isCommit)
+#ifdef XCP
+ {
+ /*
+ * During backend lifetime it may be assigned to different
+ * distributed sessions, and each of them may create temp
+ * namespace and set a callback. That may cause memory leak.
+ * XXX is it ever possible to remove callbacks?
+ */
+ if (!OidIsValid(MyCoordId))
+ on_shmem_exit(RemoveTempRelationsCallback, 0);
+ }
+#else
on_shmem_exit(RemoveTempRelationsCallback, 0);
+#endif
else
{
myTempNamespace = InvalidOid;
@@ -3783,9 +3918,46 @@ ResetTempTableNamespace(void)
{
if (OidIsValid(myTempNamespace))
RemoveTempRelations(myTempNamespace);
+#ifdef XCP
+ else if (OidIsValid(MyCoordId))
+ {
+ char namespaceName[NAMEDATALEN];
+ Oid namespaceId;
+
+ snprintf(namespaceName, sizeof(namespaceName), "pg_temp_%d",
+ MyFirstBackendId);
+
+ namespaceId = get_namespace_oid(namespaceName, true);
+ if (OidIsValid(namespaceId))
+ RemoveTempRelations(namespaceId);
+ }
+#endif
}
+#ifdef XCP
+/*
+ * Reset myTempNamespace so it will be reinitialized after backend is assigned
+ * to a different session.
+ */
+void
+ForgetTempTableNamespace(void)
+{
+	/* If the namespace exists and needs to be cleaned up, do that now */
+ if (OidIsValid(myTempNamespace) &&
+ myTempNamespaceSubID != InvalidSubTransactionId)
+ {
+ elog(WARNING, "leaked temp namespace clean up callback");
+ RemoveTempRelations(myTempNamespace);
+ }
+ myTempNamespace = InvalidOid;
+ myTempToastNamespace = InvalidOid;
+ baseSearchPathValid = false; /* need to rebuild list */
+ myTempNamespaceSubID = InvalidSubTransactionId;
+}
+#endif
+
+
/*
* Routines for handling the GUC variable 'search_path'.
*/
@@ -4121,3 +4293,43 @@ pg_is_other_temp_schema(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(isOtherTempNamespace(oid));
}
+
+
+#ifdef XCP
+/*
+ * FindTemporaryNamespace
+ * If this is a secondary backend of a distributed session, check whether the
+ * primary backend of the same session already created a temporary namespace
+ * and, if so, wire it up instead of creating a new one.
+ */
+static void
+FindTemporaryNamespace(void)
+{
+ char namespaceName[NAMEDATALEN];
+
+ Assert(!OidIsValid(myTempNamespace));
+
+ /*
+ * We need distribution session identifier to find the namespace.
+ */
+ if (!OidIsValid(MyCoordId))
+ return;
+
+ /*
+	 * Look up namespace by name. This code should be in sync with
+ * InitTempTableNamespace.
+ */
+ snprintf(namespaceName, sizeof(namespaceName), "pg_temp_%d",
+ MyFirstBackendId);
+ myTempNamespace = get_namespace_oid(namespaceName, true);
+ /* Same for the toast namespace */
+ if (OidIsValid(myTempNamespace))
+ {
+ snprintf(namespaceName, sizeof(namespaceName), "pg_toast_temp_%d",
+ MyFirstBackendId);
+ myTempToastNamespace = get_namespace_oid(namespaceName, true);
+ baseSearchPathValid = false; /* need to rebuild list */
+ }
+}
+#endif
+
diff --git a/src/backend/catalog/pg_aggregate.c b/src/backend/catalog/pg_aggregate.c
index 856adc3b0f..768732888c 100644
--- a/src/backend/catalog/pg_aggregate.c
+++ b/src/backend/catalog/pg_aggregate.c
@@ -3,6 +3,11 @@
* pg_aggregate.c
* routines to support manipulation of the pg_aggregate relation
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -53,6 +58,9 @@ AggregateCreate(const char *aggName,
List *aggfinalfnName,
List *aggsortopName,
Oid aggTransType,
+#ifdef XCP
+ Oid aggCollectType,
+#endif
#ifdef PGXC
const char *agginitval,
const char *agginitcollect)
@@ -172,6 +180,27 @@ AggregateCreate(const char *aggName,
ReleaseSysCache(tup);
#ifdef PGXC
+#ifdef XCP
+ if (aggcollectfnName)
+ {
+ /*
+ * Collection function must be of two arguments
+ * First must be of aggCollectType, second must be of aggTransType
+ * Return value must be of aggCollectType
+ */
+ fnArgs[0] = aggCollectType;
+ fnArgs[1] = aggTransType;
+ collectfn = lookup_agg_function(aggcollectfnName, 2, fnArgs,
+ &rettype);
+ if (rettype != aggCollectType)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATATYPE_MISMATCH),
+ errmsg("return type of collection function %s is not %s",
+ NameListToString(aggcollectfnName),
+ format_type_be(aggCollectType)
+ )));
+ }
+#else
if (aggcollectfnName)
{
/*
@@ -189,11 +218,16 @@ AggregateCreate(const char *aggName,
NameListToString(aggcollectfnName),
format_type_be(aggTransType))));
}
-
+#endif
#endif
/* handle finalfn, if supplied */
if (aggfinalfnName)
{
+#ifdef XCP
+ if (OidIsValid(aggCollectType))
+ fnArgs[0] = aggCollectType;
+ else
+#endif
fnArgs[0] = aggTransType;
finalfn = lookup_agg_function(aggfinalfnName, 1, fnArgs,
&finaltype);
@@ -203,6 +237,11 @@ AggregateCreate(const char *aggName,
/*
* If no finalfn, aggregate result type is type of the state value
*/
+#ifdef XCP
+ if (OidIsValid(aggCollectType))
+ finaltype = aggCollectType;
+ else
+#endif
finaltype = aggTransType;
}
Assert(OidIsValid(finaltype));
@@ -319,6 +358,9 @@ AggregateCreate(const char *aggName,
#ifdef PGXC
values[Anum_pg_aggregate_aggcollectfn - 1] = ObjectIdGetDatum(collectfn);
#endif
+#ifdef XCP
+ values[Anum_pg_aggregate_aggcollecttype - 1] = ObjectIdGetDatum(aggCollectType);
+#endif
if (agginitval)
values[Anum_pg_aggregate_agginitval - 1] = CStringGetTextDatum(agginitval);
else
diff --git a/src/backend/catalog/pg_proc.c b/src/backend/catalog/pg_proc.c
index c758f63224..6363081b1f 100644
--- a/src/backend/catalog/pg_proc.c
+++ b/src/backend/catalog/pg_proc.c
@@ -3,6 +3,11 @@
* pg_proc.c
* routines to support manipulation of the pg_proc relation
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -39,6 +44,7 @@
#ifdef PGXC
#include "pgxc/execRemote.h"
#include "pgxc/pgxc.h"
+#include "pgxc/planner.h"
#endif
@@ -903,6 +909,7 @@ fmgr_sql_validator(PG_FUNCTION_ARGS)
pinfo);
#ifdef PGXC
+#ifndef XCP
/* Check if the list of queries contains temporary objects */
if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
{
@@ -915,6 +922,7 @@ fmgr_sql_validator(PG_FUNCTION_ARGS)
ExecSetTempObjectIncluded();
}
#endif
+#endif
querytree_list = list_concat(querytree_list,
querytree_sublist);
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 993bc49c2a..f9ee56d5ab 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -3,6 +3,11 @@
* storage.c
* code to create and destroy physical storage for relations
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -106,6 +111,11 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence)
switch (relpersistence)
{
case RELPERSISTENCE_TEMP:
+#ifdef XCP
+ if (OidIsValid(MyCoordId))
+ backend = MyFirstBackendId;
+ else
+#endif
backend = MyBackendId;
needs_wal = false;
break;
diff --git a/src/backend/catalog/storm_catalog.sql b/src/backend/catalog/storm_catalog.sql
new file mode 100644
index 0000000000..47776ba00a
--- /dev/null
+++ b/src/backend/catalog/storm_catalog.sql
@@ -0,0 +1,307 @@
+CREATE VIEW storm_catalog.pg_roles AS
+ SELECT *
+ FROM pg_catalog.pg_roles
+ WHERE rolname = current_user
+ OR split_part(rolname, '@', 2) = current_database();
+
+GRANT SELECT on storm_catalog.pg_roles TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_roles FROM public;
+
+CREATE VIEW storm_catalog.pg_shdescription AS
+ SELECT d.objoid, d.classoid, d.description
+ FROM pg_catalog.pg_shdescription d, pg_catalog.pg_class c
+ WHERE d.classoid = c.oid
+ AND c.relname = 'pg_database'
+ AND d.objoid = (SELECT oid FROM pg_database WHERE datname = current_database())
+ UNION
+ SELECT d.objoid, d.classoid, d.description
+ FROM pg_catalog.pg_shdescription d, pg_catalog.pg_class c
+ WHERE d.classoid = c.oid
+ AND c.relname = 'pg_authid'
+ AND d.objoid = (SELECT oid FROM storm_catalog.pg_roles WHERE rolname = current_user);
+
+GRANT SELECT on storm_catalog.pg_shdescription TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_shdescription FROM public;
+
+CREATE VIEW storm_catalog.pg_database AS
+ SELECT tableoid, oid, datname, datdba, encoding, datcollate, datctype,
+ datistemplate, datallowconn, datconnlimit, datlastsysoid,
+ datfrozenxid, dattablespace, datacl
+ FROM pg_catalog.pg_database
+ WHERE datallowconn AND (has_database_privilege(datname, 'CREATE') OR
+ split_part(current_user, '@', 2) = datname);
+
+GRANT SELECT on storm_catalog.pg_database TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_database FROM public;
+
+CREATE VIEW storm_catalog.pg_db_role_setting AS
+ SELECT setdatabase, setrole, setconfig
+ FROM pg_catalog.pg_db_role_setting
+ WHERE setdatabase = (SELECT oid FROM pg_database WHERE datname = current_database())
+ UNION
+ SELECT setdatabase, setrole, setconfig
+ FROM pg_db_role_setting
+ WHERE setrole = (SELECT oid FROM storm_catalog.pg_roles WHERE rolname = current_user);
+
+GRANT SELECT on storm_catalog.pg_db_role_setting TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_db_role_setting FROM public;
+
+CREATE VIEW storm_catalog.pg_tablespace AS
+ SELECT oid, spcname, spcowner, ''::text as spclocation, ''::text as spcacl,
+ ''::text as spcoptions FROM pg_catalog.pg_tablespace;
+
+GRANT SELECT on storm_catalog.pg_tablespace TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_tablespace FROM public;
+
+CREATE VIEW storm_catalog.pg_auth_members AS
+ SELECT roleid, member, grantor, admin_option
+ FROM pg_catalog.pg_auth_members
+ WHERE roleid = (SELECT oid FROM storm_catalog.pg_roles WHERE rolname = current_user)
+ UNION
+ SELECT roleid, member, grantor, admin_option
+ FROM pg_catalog.pg_auth_members
+ WHERE grantor = (SELECT oid FROM storm_catalog.pg_roles WHERE rolname = current_user);
+
+GRANT SELECT on storm_catalog.pg_auth_members TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_auth_members FROM public;
+
+CREATE VIEW storm_catalog.pg_shdepend AS
+ SELECT dbid, classid, objid, objsubid, refclassid, refobjid, deptype
+ FROM pg_catalog.pg_shdepend
+ WHERE dbid = (SELECT oid FROM pg_database WHERE datname = current_database());
+
+GRANT SELECT on storm_catalog.pg_shdepend TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_shdepend FROM public;
+
+CREATE VIEW storm_catalog.pg_stat_database AS
+ SELECT *
+ FROM pg_catalog.pg_stat_database
+ WHERE datid = (SELECT oid FROM pg_database WHERE datname = current_database());
+
+GRANT SELECT on storm_catalog.pg_stat_database TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_stat_database FROM public;
+
+CREATE VIEW storm_catalog.pg_stat_database_conflicts AS
+ SELECT *
+ FROM pg_catalog.pg_stat_database_conflicts
+ WHERE datid = (SELECT oid FROM pg_database WHERE datname = current_database());
+
+GRANT SELECT on storm_catalog.pg_stat_database_conflicts TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_stat_database_conflicts FROM public;
+
+
+CREATE VIEW storm_catalog.pg_prepared_xacts AS
+ SELECT *
+ FROM pg_catalog.pg_prepared_xacts
+ WHERE database = current_database();
+
+GRANT SELECT on storm_catalog.pg_prepared_xacts TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_prepared_xacts FROM public;
+
+CREATE VIEW storm_catalog.pg_user AS
+ SELECT *
+ FROM pg_catalog.pg_user
+ WHERE usename = current_user
+ OR split_part(usename, '@', 2) = current_database();
+
+GRANT SELECT on storm_catalog.pg_user TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_user FROM public;
+
+CREATE VIEW storm_catalog.pg_group AS
+ SELECT *
+ FROM pg_catalog.pg_group
+ WHERE split_part(groname, '@', 2) = current_database();
+
+GRANT SELECT on storm_catalog.pg_group TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_group FROM public;
+
+CREATE VIEW storm_catalog.pg_shadow AS
+ SELECT *
+ FROM pg_catalog.pg_shadow
+ WHERE usename = current_user
+ OR split_part(usename, '@', 2) = current_database();
+
+GRANT SELECT on storm_catalog.pg_shadow TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_shadow FROM public;
+
+CREATE VIEW storm_catalog.pg_user_mappings AS
+ SELECT *
+ FROM pg_catalog.pg_user_mappings
+ WHERE usename = current_user
+ OR split_part(usename, '@', 2) = current_database();
+
+GRANT SELECT on storm_catalog.pg_user_mappings TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_user_mappings FROM public;
+
+REVOKE ALL on pg_catalog.pg_stat_bgwriter FROM public;
+
+REVOKE ALL on pg_catalog.pg_seclabels FROM public;
+
+REVOKE ALL on FUNCTION pg_catalog.pg_conf_load_time() FROM PUBLIC;
+
+REVOKE ALL on FUNCTION pg_catalog.pg_current_xlog_insert_location() FROM PUBLIC;
+
+REVOKE ALL on FUNCTION pg_catalog.pg_current_xlog_location() FROM PUBLIC;
+
+REVOKE ALL on FUNCTION pg_catalog.pg_is_in_recovery() FROM PUBLIC;
+
+REVOKE ALL on FUNCTION pg_catalog.pg_last_xlog_receive_location() FROM PUBLIC;
+
+REVOKE ALL on FUNCTION pg_catalog.pg_last_xlog_replay_location() FROM PUBLIC;
+
+REVOKE ALL on FUNCTION pg_catalog.pg_postmaster_start_time() FROM PUBLIC;
+
+REVOKE ALL on FUNCTION pg_catalog.pg_tablespace_databases(oid) FROM PUBLIC;
+
+REVOKE ALL on FUNCTION pg_catalog.pg_tablespace_size(oid) FROM PUBLIC;
+
+REVOKE ALL on FUNCTION pg_catalog.pg_tablespace_size(name) FROM PUBLIC;
+
+CREATE FUNCTION storm_catalog.pg_database_size(name) RETURNS bigint AS
+$BODY$
+BEGIN
+ IF $1 = current_database() THEN
+ return pg_catalog.pg_database_size($1);
+ END IF;
+
+ return 0;
+END
+$BODY$
+LANGUAGE 'plpgsql' ;
+
+GRANT EXECUTE on FUNCTION storm_catalog.pg_database_size(name) TO PUBLIC;
+
+REVOKE ALL on FUNCTION pg_catalog.pg_database_size(name) FROM PUBLIC;
+
+CREATE OR REPLACE FUNCTION storm_catalog.pg_database_size(oid) RETURNS bigint AS
+$BODY$
+DECLARE
+ is_current_db boolean;
+BEGIN
+ SELECT $1 = oid
+ INTO is_current_db
+ FROM pg_catalog.pg_database
+ WHERE datname = current_database();
+
+ IF is_current_db THEN
+ return pg_catalog.pg_database_size($1);
+ END IF;
+
+ return 0;
+END
+$BODY$
+LANGUAGE 'plpgsql' ;
+
+GRANT EXECUTE on FUNCTION storm_catalog.pg_database_size(oid) TO PUBLIC;
+
+REVOKE ALL on FUNCTION pg_catalog.pg_database_size(oid) FROM PUBLIC;
+
+CREATE FUNCTION storm_catalog.pg_show_all_settings(
+ OUT name text, OUT setting text, OUT unit text, OUT category text,
+ OUT short_desc text, OUT extra_desc text, OUT context text,
+ OUT vartype text, OUT source text, OUT min_val text, OUT max_val text,
+ OUT enumvals text[], OUT boot_val text, OUT reset_val text,
+ OUT sourcefile text, OUT sourceline integer)
+RETURNS SETOF record AS
+$BODY$
+BEGIN
+ RETURN QUERY
+ SELECT *
+ FROM pg_catalog.pg_show_all_settings() s
+ WHERE s.context != 'postmaster'
+ AND s.context != 'sighup';
+END
+$BODY$
+LANGUAGE 'plpgsql' SECURITY DEFINER;
+
+GRANT EXECUTE on FUNCTION storm_catalog.pg_show_all_settings() TO PUBLIC;
+
+REVOKE ALL on FUNCTION pg_catalog.pg_show_all_settings() FROM PUBLIC;
+
+CREATE VIEW storm_catalog.pg_settings AS
+ SELECT *
+ FROM pg_show_all_settings();
+
+GRANT SELECT on storm_catalog.pg_settings TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_settings FROM public;
+
+CREATE FUNCTION storm_catalog.pg_stat_get_activity(
+ procpid integer, OUT datid oid, OUT pid integer, OUT usesysid oid,
+ OUT application_name text, OUT state text, OUT query text,
+ OUT waiting boolean, OUT xact_start timestamp with time zone,
+ OUT query_start timestamp with time zone,
+ OUT backend_start timestamp with time zone,
+ OUT state_change timestamp with time zone,
+ OUT client_addr inet,
+ OUT client_hostname text, OUT client_port integer)
+RETURNS SETOF record AS
+$BODY$
+BEGIN
+ RETURN QUERY
+ SELECT *
+ FROM pg_catalog.pg_stat_get_activity($1) s
+ WHERE s.datid = (SELECT oid
+ FROM pg_database
+ WHERE datname = current_database());
+END
+$BODY$
+LANGUAGE 'plpgsql' SECURITY DEFINER;
+
+GRANT EXECUTE on FUNCTION storm_catalog.pg_stat_get_activity(integer) TO PUBLIC;
+
+REVOKE ALL on FUNCTION pg_catalog.pg_stat_get_activity(integer) FROM PUBLIC;
+
+CREATE VIEW storm_catalog.pg_stat_activity AS
+ SELECT *
+ FROM storm_catalog.pg_stat_get_activity(NULL);
+
+GRANT SELECT on storm_catalog.pg_stat_activity TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_stat_activity FROM public;
+
+CREATE FUNCTION storm_catalog.pg_lock_status(
+ OUT locktype text, OUT database oid, OUT relation oid,
+ OUT page integer, OUT tuple smallint, OUT virtualxid text,
+ OUT transactionid xid, OUT classid oid, OUT objid oid,
+ OUT objsubid smallint, OUT virtualtransaction text,
+ OUT pid integer, OUT mode text, OUT granted boolean,
+ OUT fastpath boolean)
+RETURNS SETOF record AS
+$BODY$
+BEGIN
+ RETURN QUERY
+ SELECT *
+ FROM pg_catalog.pg_lock_status() l
+ WHERE l.database = (SELECT oid
+ FROM pg_database
+ WHERE datname = current_database());
+END
+$BODY$
+LANGUAGE 'plpgsql' SECURITY DEFINER;
+
+GRANT EXECUTE on FUNCTION storm_catalog.pg_lock_status() TO PUBLIC;
+
+REVOKE ALL on FUNCTION pg_catalog.pg_lock_status() FROM PUBLIC;
+
+CREATE VIEW storm_catalog.pg_locks AS
+ SELECT *
+ FROM storm_catalog.pg_lock_status();
+
+GRANT SELECT on storm_catalog.pg_locks TO PUBLIC;
+
+REVOKE ALL on pg_catalog.pg_locks FROM public;
diff --git a/src/backend/commands/aggregatecmds.c b/src/backend/commands/aggregatecmds.c
index ddf029e857..9b7b05bfe4 100644
--- a/src/backend/commands/aggregatecmds.c
+++ b/src/backend/commands/aggregatecmds.c
@@ -4,6 +4,11 @@
*
* Routines for aggregate-manipulation commands
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -56,6 +61,9 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters)
List *sortoperatorName = NIL;
TypeName *baseType = NULL;
TypeName *transType = NULL;
+#ifdef XCP
+ TypeName *collectType = NULL;
+#endif
char *initval = NULL;
#ifdef PGXC
List *collectfuncName = NIL;
@@ -64,6 +72,9 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters)
Oid *aggArgTypes;
int numArgs;
Oid transTypeId;
+#ifdef XCP
+ Oid collectTypeId;
+#endif
ListCell *pl;
/* Convert list of names to a name and namespace */
@@ -97,6 +108,10 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters)
transType = defGetTypeName(defel);
else if (pg_strcasecmp(defel->defname, "stype1") == 0)
transType = defGetTypeName(defel);
+#ifdef XCP
+ else if (pg_strcasecmp(defel->defname, "ctype") == 0)
+ collectType = defGetTypeName(defel);
+#endif
else if (pg_strcasecmp(defel->defname, "initcond") == 0)
initval = defGetString(defel);
else if (pg_strcasecmp(defel->defname, "initcond1") == 0)
@@ -126,6 +141,17 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters)
(errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
errmsg("aggregate sfunc must be specified")));
+#ifdef XCP
+ if (collectfuncName && collectType == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+ errmsg("if aggregate cfunc is defined aggregate ctype must be specified")));
+ if (collectType && collectfuncName == NIL)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+ errmsg("if aggregate ctype is defined aggregate cfunc must be specified")));
+#endif
+
/*
* look up the aggregate's input datatype(s).
*/
@@ -202,6 +228,31 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters)
format_type_be(transTypeId))));
}
+#ifdef XCP
+ /*
+ * look up the aggregate's collecttype.
+ *
+ * The same restrictions apply to the collecttype as to the transtype.
+ */
+ if (collectType)
+ {
+ collectTypeId = typenameTypeId(NULL, collectType);
+ if (get_typtype(collectTypeId) == TYPTYPE_PSEUDO &&
+ !IsPolymorphicType(collectTypeId))
+ {
+ if (collectTypeId == INTERNALOID && superuser())
+ /* okay */ ;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+ errmsg("aggregate collection data type cannot be %s",
+ format_type_be(collectTypeId))));
+ }
+ }
+ else
+ collectTypeId = InvalidOid;
+#endif
+
/*
* Most of the argument-checking is done inside of AggregateCreate
*/
@@ -216,6 +267,9 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters)
finalfuncName, /* final function name */
sortoperatorName, /* sort operator name */
transTypeId, /* transition data type */
+#ifdef XCP
+ collectTypeId, /* collection data type */
+#endif
#ifdef PGXC
initval, /* initial condition */
initcollect); /* initial condition for collection function */
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 9612a276f3..0a88c4ea4d 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -3,6 +3,11 @@
* analyze.c
* the Postgres statistics generator
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -53,6 +58,14 @@
#include "utils/timestamp.h"
#include "utils/tqual.h"
+#ifdef XCP
+#include "catalog/pg_operator.h"
+#include "nodes/makefuncs.h"
+#include "pgxc/execRemote.h"
+#include "pgxc/pgxc.h"
+#include "pgxc/planner.h"
+#include "utils/snapmgr.h"
+#endif
/* Data structure for Algorithm S from Knuth 3.4.2 */
typedef struct
@@ -108,6 +121,10 @@ static void update_attstats(Oid relid, bool inh,
static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
+#ifdef XCP
+static void analyze_rel_coordinator(Relation onerel, bool inh, int attr_cnt,
+ VacAttrStats **vacattrstats);
+#endif
/*
* analyze_rel() -- analyze one relation
@@ -404,6 +421,31 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt,
attr_cnt = tcnt;
}
+#ifdef XCP
+ if (IS_PGXC_COORDINATOR && onerel->rd_locator_info)
+ {
+ /*
+ * Fetch attribute statistics from remote nodes.
+ */
+ analyze_rel_coordinator(onerel, inh, attr_cnt, vacattrstats);
+ /*
+ * If this is a VACUUM, or an inherited relation is being processed,
+ * precise values for relpages and reltuples are set elsewhere.
+ * Otherwise request doing it now.
+ */
+ if (!inh && !(vacstmt->options & VACOPT_VACUUM))
+ vacuum_rel_coordinator(onerel);
+ /*
+ * Skip acquiring local stats. Coordinator does not store data of
+ * distributed tables.
+ */
+ nindexes = 0;
+ hasindex = false;
+ Irel = NULL;
+ goto cleanup;
+ }
+#endif
+
/*
* Open all indexes of the relation, and see if there are any analyzable
* columns in the indexes. We do not analyze index columns if there was
@@ -604,6 +646,12 @@ do_analyze_rel(Relation onerel, VacuumStmt *vacstmt,
}
}
+#ifdef XCP
+ /*
+ * Coordinator skips getting local stats of distributed table up to here
+ */
+cleanup:
+#endif
/*
* Report ANALYZE to the stats collector, too. However, if doing
* inherited stats we shouldn't report, because the stats collector only
@@ -2790,3 +2838,423 @@ compare_mcvs(const void *a, const void *b)
return da - db;
}
+
+
+#ifdef XCP
+static void
+analyze_rel_coordinator(Relation onerel, bool inh, int attr_cnt,
+ VacAttrStats **vacattrstats)
+{
+ char *nspname;
+ char *relname;
+ /* Fields to run query to read statistics from data nodes */
+ StringInfoData query;
+ EState *estate;
+ MemoryContext oldcontext;
+ RemoteQuery *step;
+ RemoteQueryState *node;
+ TupleTableSlot *result;
+ int i;
+ /* Number of data nodes from which attribute statistics are received. */
+ int *numnodes;
+
+ /* Get the relation identifier */
+ relname = RelationGetRelationName(onerel);
+ nspname = get_namespace_name(RelationGetNamespace(onerel));
+
+ elog(LOG, "Getting detailed statistics for %s.%s", nspname, relname);
+
+ /* Make up query string */
+ initStringInfo(&query);
+ /* Generic statistic fields */
+ appendStringInfoString(&query, "SELECT s.staattnum, "
+// assume the number of tuples approximately the same on all nodes
+// to build more precise statistics get this number
+// "c.reltuples, "
+ "s.stanullfrac, "
+ "s.stawidth, "
+ "s.stadistinct");
+ /* Detailed statistic slots */
+ for (i = 1; i <= STATISTIC_NUM_SLOTS; i++)
+ appendStringInfo(&query, ", s.stakind%d"
+ ", o%d.oprname"
+ ", no%d.nspname"
+ ", t%dl.typname"
+ ", nt%dl.nspname"
+ ", t%dr.typname"
+ ", nt%dr.nspname"
+ ", s.stanumbers%d"
+ ", s.stavalues%d",
+ i, i, i, i, i, i, i, i, i);
+
+ /* Common part of FROM clause */
+ appendStringInfoString(&query, " FROM pg_statistic s JOIN pg_class c "
+ " ON s.starelid = c.oid "
+ "JOIN pg_namespace nc "
+ " ON c.relnamespace = nc.oid ");
+ /* Info about involved operations */
+ for (i = 1; i <= STATISTIC_NUM_SLOTS; i++)
+ appendStringInfo(&query, "LEFT JOIN (pg_operator o%d "
+ " JOIN pg_namespace no%d "
+ " ON o%d.oprnamespace = no%d.oid "
+ " JOIN pg_type t%dl "
+ " ON o%d.oprleft = t%dl.oid "
+ " JOIN pg_namespace nt%dl "
+ " ON t%dl.typnamespace = nt%dl.oid "
+ " JOIN pg_type t%dr "
+ " ON o%d.oprright = t%dr.oid "
+ " JOIN pg_namespace nt%dr "
+ " ON t%dr.typnamespace = nt%dr.oid) "
+ " ON s.staop%d = o%d.oid ",
+ i, i, i, i, i, i, i, i, i,
+ i, i, i, i, i, i, i, i, i);
+ appendStringInfo(&query, "WHERE nc.nspname = '%s' "
+ "AND c.relname = '%s'",
+ nspname, relname);
+
+ /* Build up RemoteQuery */
+ step = makeNode(RemoteQuery);
+ step->combine_type = COMBINE_TYPE_NONE;
+ step->exec_nodes = NULL;
+ step->sql_statement = query.data;
+ step->force_autocommit = true;
+ step->exec_type = EXEC_ON_DATANODES;
+
+ /* Add targetlist entries */
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(StatisticRelationId,
+ "pg_statistic",
+ "staattnum"));
+// step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+// make_relation_tle(RelationRelationId,
+// "pg_class",
+// "reltuples"));
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(StatisticRelationId,
+ "pg_statistic",
+ "stanullfrac"));
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(StatisticRelationId,
+ "pg_statistic",
+ "stawidth"));
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(StatisticRelationId,
+ "pg_statistic",
+ "stadistinct"));
+ for (i = 1; i <= STATISTIC_NUM_SLOTS; i++)
+ {
+ /* 16 characters would be enough */
+ char colname[16];
+
+ sprintf(colname, "stakind%d", i);
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(StatisticRelationId,
+ "pg_statistic",
+ colname));
+
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(OperatorRelationId,
+ "pg_operator",
+ "oprname"));
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(NamespaceRelationId,
+ "pg_namespace",
+ "nspname"));
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(TypeRelationId,
+ "pg_type",
+ "typname"));
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(NamespaceRelationId,
+ "pg_namespace",
+ "nspname"));
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(TypeRelationId,
+ "pg_type",
+ "typname"));
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(NamespaceRelationId,
+ "pg_namespace",
+ "nspname"));
+
+ sprintf(colname, "stanumbers%d", i);
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(StatisticRelationId,
+ "pg_statistic",
+ colname));
+
+ sprintf(colname, "stavalues%d", i);
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(StatisticRelationId,
+ "pg_statistic",
+ colname));
+ }
+ /* Execute query on the data nodes */
+ estate = CreateExecutorState();
+
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ estate->es_snapshot = GetActiveSnapshot();
+
+ node = ExecInitRemoteQuery(step, estate, 0);
+ MemoryContextSwitchTo(oldcontext);
+
+ /* get ready to combine results */
+ numnodes = (int *) palloc(attr_cnt * sizeof(int));
+ for (i = 0; i < attr_cnt; i++)
+ numnodes[i] = 0;
+
+ result = ExecRemoteQuery(node);
+ while (result != NULL && !TupIsNull(result))
+ {
+ Datum value;
+ bool isnull;
+ int colnum = 1;
+ int2 attnum;
+// float4 reltuples;
+ float4 nullfrac;
+ int4 width;
+ float4 distinct;
+ VacAttrStats *stats = NULL;
+
+
+ /* Process statistics from the data node */
+ value = slot_getattr(result, colnum++, &isnull); /* staattnum */
+ attnum = DatumGetInt16(value);
+ for (i = 0; i < attr_cnt; i++)
+ if (vacattrstats[i]->attr->attnum == attnum)
+ {
+ stats = vacattrstats[i];
+ stats->stats_valid = true;
+ numnodes[i]++;
+ break;
+ }
+
+// value = slot_getattr(result, colnum++, &isnull); /* reltuples */
+// reltuples = DatumGetFloat4(value);
+
+ if (stats)
+ {
+ value = slot_getattr(result, colnum++, &isnull); /* stanullfrac */
+ nullfrac = DatumGetFloat4(value);
+ stats->stanullfrac += nullfrac;
+
+ value = slot_getattr(result, colnum++, &isnull); /* stawidth */
+ width = DatumGetInt32(value);
+ stats->stawidth += width;
+
+ value = slot_getattr(result, colnum++, &isnull); /* stadistinct */
+ distinct = DatumGetFloat4(value);
+ stats->stadistinct += distinct;
+
+ /* Detailed statistics */
+ for (i = 1; i <= STATISTIC_NUM_SLOTS; i++)
+ {
+ int2 kind;
+ float4 *numbers;
+ Datum *values;
+ int nnumbers, nvalues;
+ int k;
+
+ value = slot_getattr(result, colnum++, &isnull); /* kind */
+ kind = DatumGetInt16(value);
+
+ if (kind == 0)
+ {
+ /*
+ * Empty slot - skip next 8 fields: 6 fields of the
+ * operation identifier and two data fields (numbers and
+ * values)
+ */
+ colnum += 8;
+ continue;
+ }
+ else
+ {
+ Oid oprid;
+
+ /* Get operator */
+ value = slot_getattr(result, colnum++, &isnull); /* oprname */
+ if (isnull)
+ {
+ /*
+ * Operator is not specified for that kind; skip the
+ * remaining operator-lookup fields
+ */
+ oprid = InvalidOid;
+ colnum += 5; /* skip operation nsp and types */
+ }
+ else
+ {
+ char *oprname;
+ char *oprnspname;
+ Oid ltypid, rtypid;
+ char *ltypname,
+ *rtypname;
+ char *ltypnspname,
+ *rtypnspname;
+ oprname = DatumGetCString(value);
+ value = slot_getattr(result, colnum++, &isnull); /* oprnspname */
+ oprnspname = DatumGetCString(value);
+ /* Get left operand data type */
+ value = slot_getattr(result, colnum++, &isnull); /* typname */
+ ltypname = DatumGetCString(value);
+ value = slot_getattr(result, colnum++, &isnull); /* typnspname */
+ ltypnspname = DatumGetCString(value);
+ ltypid = get_typname_typid(ltypname,
+ get_namespaceid(ltypnspname));
+ /* Get right operand data type */
+ value = slot_getattr(result, colnum++, &isnull); /* typname */
+ rtypname = DatumGetCString(value);
+ value = slot_getattr(result, colnum++, &isnull); /* typnspname */
+ rtypnspname = DatumGetCString(value);
+ rtypid = get_typname_typid(rtypname,
+ get_namespaceid(rtypnspname));
+ /* lookup operator */
+ oprid = get_operid(oprname, ltypid, rtypid,
+ get_namespaceid(oprnspname));
+ }
+ /*
+ * Look up a statistics slot. If there is an entry of the
+ * same kind already, leave it, assuming the statistics
+ * is approximately the same on all nodes, so values from
+ * one node are representing entire relation well.
+ * If empty slot is found store values here. If no more
+ * slots skip remaining values.
+ */
+ for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
+ {
+ if (stats->stakind[k] == 0 ||
+ (stats->stakind[k] == kind && stats->staop[k] == oprid))
+ break;
+ }
+
+ if (k >= STATISTIC_NUM_SLOTS)
+ {
+ /* No empty slots */
+ break;
+ }
+
+ /*
+ * If it is an existing slot which has numbers or values
+ * continue to the next set. If slot exists but without
+ * numbers and values, try to acquire them now
+ */
+ if (stats->stakind[k] != 0 && (stats->numnumbers[k] > 0 ||
+ stats->numvalues[k] > 0))
+ {
+ colnum += 2; /* skip numbers and values */
+ continue;
+ }
+
+ /*
+ * Initialize slot
+ */
+ stats->stakind[k] = kind;
+ stats->staop[k] = oprid;
+ stats->numnumbers[k] = 0;
+ stats->stanumbers[k] = NULL;
+ stats->numvalues[k] = 0;
+ stats->stavalues[k] = NULL;
+ stats->statypid[k] = InvalidOid;
+ stats->statyplen[k] = -1;
+ stats->statypalign[k] = 'i';
+ stats->statypbyval[k] = true;
+ }
+
+
+ /* get numbers */
+ value = slot_getattr(result, colnum++, &isnull); /* numbers */
+ if (!isnull)
+ {
+ ArrayType *arry = DatumGetArrayTypeP(value);
+
+ /*
+ * We expect the array to be a 1-D float4 array; verify that. We don't
+ * need to use deconstruct_array() since the array data is just going
+ * to look like a C array of float4 values.
+ */
+ nnumbers = ARR_DIMS(arry)[0];
+ if (ARR_NDIM(arry) != 1 || nnumbers <= 0 ||
+ ARR_HASNULL(arry) ||
+ ARR_ELEMTYPE(arry) != FLOAT4OID)
+ elog(ERROR, "stanumbers is not a 1-D float4 array");
+ numbers = (float4 *) palloc(nnumbers * sizeof(float4));
+ memcpy(numbers, ARR_DATA_PTR(arry),
+ nnumbers * sizeof(float4));
+
+ /*
+ * Free arry if it's a detoasted copy.
+ */
+ if ((Pointer) arry != DatumGetPointer(value))
+ pfree(arry);
+
+ stats->numnumbers[k] = nnumbers;
+ stats->stanumbers[k] = numbers;
+ }
+ /* get values */
+ value = slot_getattr(result, colnum++, &isnull); /* values */
+ if (!isnull)
+ {
+ int j;
+ ArrayType *arry;
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ arry = DatumGetArrayTypeP(value);
+ /* We could cache this data, but not clear it's worth it */
+ get_typlenbyvalalign(ARR_ELEMTYPE(arry),
+ &elmlen, &elmbyval, &elmalign);
+ /* Deconstruct array into Datum elements; NULLs not expected */
+ deconstruct_array(arry,
+ ARR_ELEMTYPE(arry),
+ elmlen, elmbyval, elmalign,
+ &values, NULL, &nvalues);
+
+ /*
+ * If the element type is pass-by-reference, we now have a bunch of
+ * Datums that are pointers into the syscache value. Copy them to
+ * avoid problems if syscache decides to drop the entry.
+ */
+ if (!elmbyval)
+ {
+ for (j = 0; j < nvalues; j++)
+ values[j] = datumCopy(values[j], elmbyval, elmlen);
+ }
+
+ /*
+ * Free statarray if it's a detoasted copy.
+ */
+ if ((Pointer) arry != DatumGetPointer(value))
+ pfree(arry);
+
+ stats->numvalues[k] = nvalues;
+ stats->stavalues[k] = values;
+ /* store details about values data type */
+ stats->statypid[k] = ARR_ELEMTYPE(arry);
+ stats->statyplen[k] = elmlen;
+ stats->statypalign[k] = elmalign;
+ stats->statypbyval[k] = elmbyval;
+ }
+ }
+ }
+
+ /* fetch next */
+ result = ExecRemoteQuery(node);
+ }
+ ExecEndRemoteQuery(node);
+
+ for (i = 0; i < attr_cnt; i++)
+ {
+ VacAttrStats *stats = vacattrstats[i];
+
+ if (numnodes[i] > 0)
+ {
+ stats->stanullfrac /= numnodes[i];
+ stats->stawidth /= numnodes[i];
+ stats->stadistinct /= numnodes[i];
+ }
+ }
+ update_attstats(RelationGetRelid(onerel), inh, attr_cnt, vacattrstats);
+}
+#endif
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 0944131313..eee79ce74a 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -3,6 +3,11 @@
* copy.c
* Implements the COPY utility command
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -25,6 +30,10 @@
#include "access/xact.h"
#include "catalog/namespace.h"
#include "catalog/pg_type.h"
+#ifdef XCP
+#include "catalog/dependency.h"
+#include "commands/sequence.h"
+#endif
#include "commands/copy.h"
#include "commands/defrem.h"
#include "commands/trigger.h"
@@ -37,13 +46,13 @@
#include "optimizer/planner.h"
#include "parser/parse_relation.h"
#ifdef PGXC
-#include "optimizer/pgxcship.h"
#include "pgxc/pgxc.h"
#include "pgxc/execRemote.h"
#include "pgxc/locator.h"
#include "pgxc/remotecopy.h"
#include "nodes/nodes.h"
#include "pgxc/poolmgr.h"
+#include "pgxc/postgresql_fdw.h"
#include "catalog/pgxc_node.h"
#endif
#include "rewrite/rewriteHandler.h"
@@ -783,6 +792,9 @@ DoCopy(const CopyStmt *stmt, const char *queryString)
bool pipe = (stmt->filename == NULL);
Relation rel;
uint64 processed;
+#ifdef XCP
+ int oldSeqRangeVal = SequenceRangeVal;
+#endif
/* Disallow file COPY except to superusers. */
if (!pipe && !superuser())
@@ -813,10 +825,12 @@ DoCopy(const CopyStmt *stmt, const char *queryString)
rte->requiredPerms = required_access;
#ifdef PGXC
+#ifndef XCP
/* In case COPY is used on a temporary table, never use 2PC for implicit commits */
if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
ExecSetTempObjectIncluded();
#endif
+#endif
tupDesc = RelationGetDescr(rel);
attnums = CopyGetAttnums(tupDesc, rel, stmt->attlist);
@@ -839,6 +853,26 @@ DoCopy(const CopyStmt *stmt, const char *queryString)
rel = NULL;
}
+#ifdef XCP
+ /*
+ * The COPY might involve sequences. We want to cache a range of
+ * sequence values to avoid contacting the GTM repeatedly. This
+ * improves the COPY performance by quite a margin. We set the
+ * SequenceRangeVal GUC parameter to bring about this effect.
+ * Note that we could have checked the attribute list to ascertain
+ * if this GUC is really needed or not. However since this GUC
+ * only affects nextval calculations; if sequences are not present,
+ * no harm is done.
+ *
+ * The user might have set the GUC value himself. Honor that if so
+ */
+
+#define MAX_CACHEVAL 1024
+ if (rel && getOwnedSequences(RelationGetRelid(rel)) != NIL &&
+ SequenceRangeVal == DEFAULT_CACHEVAL)
+ SequenceRangeVal = MAX_CACHEVAL;
+#endif
+
if (is_from)
{
Assert(rel);
@@ -850,6 +884,15 @@ DoCopy(const CopyStmt *stmt, const char *queryString)
cstate = BeginCopyFrom(rel, stmt->filename,
stmt->attlist, stmt->options);
processed = CopyFrom(cstate); /* copy from file to database */
+#ifdef XCP
+ /*
+ * We should record insert to distributed table.
+ * Bulk inserts into local tables are recorded when heap tuples are
+ * written.
+ */
+ if (IS_PGXC_COORDINATOR && rel->rd_locator_info)
+ pgstat_count_remote_insert(rel, (int) processed);
+#endif
EndCopyFrom(cstate);
}
else
@@ -860,6 +903,11 @@ DoCopy(const CopyStmt *stmt, const char *queryString)
EndCopyTo(cstate);
}
+#ifdef XCP
+ /* Set the SequenceRangeVal GUC to its earlier value */
+ SequenceRangeVal = oldSeqRangeVal;
+#endif
+
/*
* Close the relation. If reading, we can release the AccessShareLock we
* got; if writing, we should hold the lock until end of transaction to
@@ -1418,10 +1466,15 @@ BeginCopy(bool is_from,
*/
if (remoteCopyState && remoteCopyState->rel_loc)
{
+#ifdef XCP
+ DataNodeCopyBegin(remoteCopyState);
+ if (!remoteCopyState->locator)
+#else
remoteCopyState->connections = DataNodeCopyBegin(remoteCopyState->query_buf.data,
remoteCopyState->exec_nodes->nodeList,
GetActiveSnapshot());
if (!remoteCopyState->connections)
+#endif
ereport(ERROR,
(errcode(ERRCODE_CONNECTION_EXCEPTION),
errmsg("Failed to initialize Datanodes for COPY")));
@@ -1711,7 +1764,13 @@ CopyTo(CopyState cstate)
cstate->remoteCopyState &&
cstate->remoteCopyState->rel_loc)
{
- RemoteCopyData *remoteCopyState = cstate->remoteCopyState;
+ RemoteCopyData *rcstate = cstate->remoteCopyState;
+#ifdef XCP
+ processed = DataNodeCopyOut(
+ (PGXCNodeHandle **) getLocatorNodeMap(rcstate->locator),
+ getLocatorNodeCount(rcstate->locator),
+ cstate->copy_dest == COPY_FILE ? cstate->copy_file : NULL);
+#else
RemoteCopyType remoteCopyType;
/* Set up remote COPY to correct operation */
@@ -1732,6 +1791,7 @@ CopyTo(CopyState cstate)
cstate->copy_file,
NULL,
remoteCopyType);
+#endif
}
else
{
@@ -2193,7 +2253,28 @@ CopyFrom(CopyState cstate)
*/
if (IS_PGXC_COORDINATOR && cstate->remoteCopyState->rel_loc)
{
- Form_pg_attribute *attr = tupDesc->attrs;
+#ifdef XCP
+ Datum value = (Datum) 0;
+ bool isnull = true;
+ RemoteCopyData *rcstate = cstate->remoteCopyState;
+ AttrNumber dist_col = rcstate->rel_loc->partAttrNum;
+
+ if (AttributeNumberIsValid(dist_col))
+ {
+ value = values[dist_col-1];
+ isnull = nulls[dist_col-1];
+ }
+
+ if (DataNodeCopyIn(cstate->line_buf.data,
+ cstate->line_buf.len,
+ GET_NODES(rcstate->locator, value, isnull, NULL),
+ (PGXCNodeHandle**) getLocatorResults(rcstate->locator)))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_EXCEPTION),
+ errmsg("Copy failed on a data node")));
+ processed++;
+#else
+ Form_pg_attribute *attr = tupDesc->attrs;
Datum dist_col_value;
bool dist_col_is_null;
Oid dist_col_type;
@@ -2225,11 +2306,11 @@ CopyFrom(CopyState cstate)
(errcode(ERRCODE_CONNECTION_EXCEPTION),
errmsg("Copy failed on a Datanode")));
processed++;
+#endif
}
else
{
#endif
-
/* And now we can form the input tuple. */
tuple = heap_form_tuple(tupDesc, values, nulls);
@@ -2321,6 +2402,25 @@ CopyFrom(CopyState cstate)
resultRelInfo, myslot, bistate,
nBufferedTuples, bufferedTuples);
+#ifdef XCP
+ /*
+ * If the line buffer still contains data at this point, it is an EOF
+ * marker. We should send it to all the participating datanodes.
+ */
+ if (cstate->line_buf.len > 0)
+ {
+ RemoteCopyData *rcstate = cstate->remoteCopyState;
+ if (DataNodeCopyIn(cstate->line_buf.data,
+ cstate->line_buf.len,
+ getLocatorNodeCount(rcstate->locator),
+ (PGXCNodeHandle **) getLocatorNodeMap(rcstate->locator)))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_EXCEPTION),
+ errmsg("Copy failed on a data node")));
+
+ }
+#endif
+
/* Done, clean up */
error_context_stack = errcontext.previous;
@@ -2658,8 +2758,14 @@ BeginCopyFrom(Relation rel,
tmp = htonl(tmp);
appendBinaryStringInfo(&cstate->line_buf, (char *) &tmp, 4);
+#ifdef XCP
+ if (DataNodeCopyInBinaryForAll(cstate->line_buf.data, 19,
+ getLocatorNodeCount(remoteCopyState->locator),
+ (PGXCNodeHandle **) getLocatorNodeMap(remoteCopyState->locator)))
+#else
if (DataNodeCopyInBinaryForAll(cstate->line_buf.data, 19, remoteCopyState->connections))
- ereport(ERROR,
+#endif
+ ereport(ERROR,
(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
errmsg("invalid COPY file header (COPY SEND)")));
}
@@ -3105,11 +3211,16 @@ EndCopyFrom(CopyState cstate)
/* For PGXC related COPY, free also relation location data */
if (IS_PGXC_COORDINATOR && remoteCopyState->rel_loc)
{
+#ifdef XCP
+ DataNodeCopyFinish(getLocatorNodeCount(remoteCopyState->locator),
+ (PGXCNodeHandle **) getLocatorNodeMap(remoteCopyState->locator));
+#else
bool replicated = remoteCopyState->rel_loc->locatorType == LOCATOR_TYPE_REPLICATED;
DataNodeCopyFinish(
remoteCopyState->connections,
replicated ? PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE) : -1,
replicated ? COMBINE_TYPE_SAME : COMBINE_TYPE_SUM);
+#endif
FreeRemoteCopyData(remoteCopyState);
}
#endif
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 05240d581a..8759898686 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -91,10 +91,11 @@ static bool have_createdb_privilege(void);
static void remove_dbtablespaces(Oid db_id);
static bool check_db_file_conflict(Oid db_id);
static int errdetail_busy_db(int notherbackends, int npreparedxacts);
+#ifdef PGXC
static void createdb_xact_callback(bool isCommit, void *arg);
static void movedb_xact_callback(bool isCommit, void *arg);
static void movedb_success_callback(Oid db_id, Oid tblspcoid);
-
+#endif
/*
* CREATE DATABASE
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index a464090002..8c11d476f0 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -3,6 +3,11 @@
* explain.c
* Explain query execution plans
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994-5, Regents of the University of California
*
@@ -779,6 +784,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
case T_ForeignScan:
pname = sname = "Foreign Scan";
break;
+#ifdef XCP
+ case T_RemoteSubplan:
+ pname = sname = "Remote Subquery Scan";
+ break;
+#endif /* XCP */
case T_Material:
pname = sname = "Materialize";
break;
@@ -809,6 +819,21 @@ ExplainNode(PlanState *planstate, List *ancestors,
strategy = "???";
break;
}
+#ifdef XCP
+ switch (((Agg *) plan)->aggstrategy)
+ {
+ case AGG_SLAVE:
+ operation = "Transition";
+ break;
+ case AGG_MASTER:
+ operation = "Collection";
+ break;
+ default:
+ operation = NULL;
+ break;
+ }
+#endif
+
break;
case T_WindowAgg:
pname = sname = "WindowAgg";
@@ -902,6 +927,66 @@ ExplainNode(PlanState *planstate, List *ancestors,
ExplainScanTarget((Scan *) plan, es);
break;
#endif
+#ifdef XCP
+ case T_RemoteSubplan:
+ {
+ RemoteSubplan *rsubplan = (RemoteSubplan *) plan;
+ List *nodeNameList = NIL;
+ ListCell *lc;
+
+ foreach(lc, rsubplan->nodeList)
+ {
+ char *nodename = get_pgxc_nodename(
+ PGXCNodeGetNodeOid(lfirst_int(lc),
+ PGXC_NODE_DATANODE));
+ nodeNameList = lappend(nodeNameList, nodename);
+ }
+
+ /* print out destination nodes */
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ {
+ if (nodeNameList)
+ {
+ if (es->nodes)
+ {
+ bool first = true;
+ ListCell *lc;
+ foreach(lc, nodeNameList)
+ {
+ char *nodename = (char *) lfirst(lc);
+ if (first)
+ {
+ appendStringInfo(es->str, " on %s (%s",
+ rsubplan->execOnAll ? "all" : "any",
+ nodename);
+ first = false;
+ }
+ else
+ appendStringInfo(es->str, ",%s", nodename);
+ }
+ appendStringInfoChar(es->str, ')');
+ }
+ else
+ {
+ appendStringInfo(es->str, " on %s",
+ rsubplan->execOnAll ? "all" : "any");
+ }
+ }
+ else
+ {
+ appendStringInfo(es->str, " on local node");
+ }
+ }
+ else
+ {
+ ExplainPropertyText("Replicated",
+ rsubplan->execOnAll ? "no" : "yes",
+ es);
+ ExplainPropertyList("Node List", nodeNameList, es);
+ }
+ }
+ break;
+#endif /* XCP */
case T_IndexScan:
{
IndexScan *indexscan = (IndexScan *) plan;
@@ -1130,6 +1215,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
"Index Cond", planstate, ancestors, es);
break;
#ifdef PGXC
+#ifndef XCP
case T_ModifyTable:
{
/* Remote query planning on DMLs */
@@ -1139,12 +1225,48 @@ ExplainNode(PlanState *planstate, List *ancestors,
ExplainRemoteQuery((RemoteQuery *) lfirst(elt), planstate, ancestors, es);
}
break;
+#endif
case T_RemoteQuery:
/* Remote query */
ExplainRemoteQuery((RemoteQuery *)plan, planstate, ancestors, es);
show_scan_qual(plan->qual, "Coordinator quals", planstate, ancestors, es);
break;
#endif
+#ifdef XCP
+ case T_RemoteSubplan:
+ {
+ RemoteSubplan *rsubplan = (RemoteSubplan *) plan;
+
+ /* print out destination nodes */
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ {
+ if (list_length(rsubplan->distributionNodes) > 0)
+ {
+ char label[24];
+ AttrNumber dkey = rsubplan->distributionKey;
+ sprintf(label, "Distribute results by %c",
+ rsubplan->distributionType);
+ if (dkey == InvalidAttrNumber)
+ {
+ appendStringInfoSpaces(es->str, es->indent * 2);
+ appendStringInfo(es->str, "%s\n", label);
+ }
+ else
+ {
+ TargetEntry *tle = NULL;
+ if (plan->targetlist)
+ tle = (TargetEntry *) list_nth(plan->targetlist,
+ dkey-1);
+ if (IsA(tle, TargetEntry))
+ show_expression((Node *) tle->expr, label,
+ planstate, ancestors,
+ false, es);
+ }
+ }
+ }
+ }
+ break;
+#endif
case T_BitmapHeapScan:
show_scan_qual(((BitmapHeapScan *) plan)->bitmapqualorig,
"Recheck Cond", planstate, ancestors, es);
@@ -1922,7 +2044,7 @@ ExplainTargetRel(Plan *plan, Index rti, ExplainState *es)
case T_RemoteQuery:
/* get the object name from RTE itself */
Assert(rte->rtekind == RTE_REMOTE_DUMMY);
- objectname = rte->relname;
+ objectname = get_rel_name(rte->relid);
objecttag = "RemoteQuery name";
break;
default:
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index 0628e20422..261c9705c6 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -3,6 +3,11 @@
* indexcmds.c
* POSTGRES define and remove index code.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -38,7 +43,6 @@
#include "parser/parse_func.h"
#include "parser/parse_oper.h"
#ifdef PGXC
-#include "optimizer/pgxcship.h"
#include "parser/parse_utilcmd.h"
#include "pgxc/pgxc.h"
#endif
@@ -545,6 +549,56 @@ DefineIndex(RangeVar *heapRelation,
(void) index_reloptions(amoptions, reloptions, true);
+#ifdef PGXC
+ /* Make sure we can locally enforce the index */
+ if (IS_PGXC_COORDINATOR && (primary || unique))
+ {
+ ListCell *elem;
+ bool isSafe = false;
+
+ foreach(elem, attributeList)
+ {
+ IndexElem *key = (IndexElem *) lfirst(elem);
+
+#ifdef XCP
+ if (rel->rd_locator_info == NULL)
+ {
+ isSafe = true;
+ break;
+ }
+#endif
+
+ if (CheckLocalIndexColumn(rel->rd_locator_info->locatorType,
+ rel->rd_locator_info->partAttrName, key->name))
+ {
+ isSafe = true;
+ break;
+ }
+ }
+ if (!isSafe)
+#ifdef XCP
+ {
+ if (loose_constraints)
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+ errmsg("Unique index of partitioned table must contain the hash/modulo distribution column.")));
+ /* create index still, just that it won't be unique */
+ unique = false;
+ isconstraint = false;
+ }
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+ errmsg("Unique index of partitioned table must contain the hash/modulo distribution column.")));
+ }
+#else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+ errmsg("Unique index of partitioned table must contain the hash/modulo distribution column.")));
+#endif
+ }
+#endif
/*
* Prepare arguments for index_create, primarily an IndexInfo structure.
* Note that ii_Predicate must be in implicit-AND format.
@@ -575,37 +629,6 @@ DefineIndex(RangeVar *heapRelation,
accessMethodName, accessMethodId,
amcanorder, isconstraint);
-#ifdef PGXC
- /* Check if index is safely shippable */
- if (IS_PGXC_COORDINATOR)
- {
- List *indexAttrs = NIL;
-
- /* Prepare call for shippability evaluation */
- for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
- {
- /*
- * Expression attributes are set at 0, and do not make sense
- * when comparing them to distribution columns, so bypass.
- */
- if (indexInfo->ii_KeyAttrNumbers[i] > 0)
- indexAttrs = lappend_int(indexAttrs, indexInfo->ii_KeyAttrNumbers[i]);
- }
-
- /* Finalize check */
- if (!pgxc_check_index_shippability(GetRelationLocInfo(relationId),
- primary,
- unique,
- exclusionOpNames != NULL,
- indexAttrs,
- indexInfo->ii_Expressions))
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("Cannot create index whose evaluation cannot be "
- "enforced to remote nodes")));
-}
-#endif
-
/*
* Extra checks when creating a PRIMARY KEY index.
*/
diff --git a/src/backend/commands/portalcmds.c b/src/backend/commands/portalcmds.c
index 20476c3bf1..ad3a96b541 100644
--- a/src/backend/commands/portalcmds.c
+++ b/src/backend/commands/portalcmds.c
@@ -9,6 +9,11 @@
* storage management for portals (but doesn't run any queries in them).
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -269,6 +274,98 @@ PortalCleanup(Portal portal)
queryDesc = PortalGetQueryDesc(portal);
if (queryDesc)
{
+#ifdef XCP
+ if (portal->strategy == PORTAL_DISTRIBUTED)
+ {
+ /* If portal is producing it has an executor which should be
+ * shut down */
+ if (queryDesc->myindex == -1)
+ {
+ if (portal->status == PORTAL_FAILED)
+ {
+ /*
+ * Failed portal is not producing, we may remove it from the
+ * producers list.
+ */
+ removeProducingPortal(portal);
+ /* If cleanup fails below prevent double cleanup */
+ portal->queryDesc = NULL;
+ /*
+ * Inform consumers about failed producer if they are
+ * still waiting
+ */
+ if (queryDesc->squeue)
+ SharedQueueReset(queryDesc->squeue, -1);
+ }
+ /* executor may be finished already, if so estate will be null */
+ if (queryDesc->estate)
+ {
+ ResourceOwner saveResourceOwner;
+
+ /* We must make the portal's resource owner current to
+ * release resources properly */
+ saveResourceOwner = CurrentResourceOwner;
+ PG_TRY();
+ {
+ CurrentResourceOwner = portal->resowner;
+ /* Finish executor if it is not yet finished */
+ if (!queryDesc->estate->es_finished)
+ ExecutorFinish(queryDesc);
+ /* Destroy executor if not yet destroyed */
+ if (queryDesc->estate)
+ ExecutorEnd(queryDesc);
+ if (portal->status == PORTAL_FAILED)
+ {
+ /*
+ * If the portal has failed we can allow being blocked
+ * here while UnBind is waiting for consumers to
+ * finish.
+ */
+ if (queryDesc->squeue)
+ SharedQueueUnBind(queryDesc->squeue);
+ FreeQueryDesc(queryDesc);
+ }
+ }
+ PG_CATCH();
+ {
+ /* Ensure CurrentResourceOwner is restored on error */
+ CurrentResourceOwner = saveResourceOwner;
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+ CurrentResourceOwner = saveResourceOwner;
+ }
+ }
+ else
+ {
+ /* Cleaning up consumer */
+ ResourceOwner saveResourceOwner;
+
+ /* We must make the portal's resource owner current */
+ saveResourceOwner = CurrentResourceOwner;
+ PG_TRY();
+ {
+ CurrentResourceOwner = portal->resowner;
+ /* Prevent double cleanup in case of error below */
+ portal->queryDesc = NULL;
+ /* Reset the squeue if exists */
+ if (queryDesc->squeue)
+ SharedQueueReset(queryDesc->squeue, queryDesc->myindex);
+ FreeQueryDesc(queryDesc);
+ }
+ PG_CATCH();
+ {
+ /* Ensure CurrentResourceOwner is restored on error */
+ CurrentResourceOwner = saveResourceOwner;
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+ CurrentResourceOwner = saveResourceOwner;
+ }
+ }
+ else
+ {
+#endif
/*
* Reset the queryDesc before anything else. This prevents us from
* trying to shut down the executor twice, in case of an error below.
@@ -299,6 +396,9 @@ PortalCleanup(Portal portal)
PG_END_TRY();
CurrentResourceOwner = saveResourceOwner;
}
+#ifdef XCP
+ }
+#endif
}
}
diff --git a/src/backend/commands/prepare.c b/src/backend/commands/prepare.c
index 41b00ba1f0..219608b571 100644
--- a/src/backend/commands/prepare.c
+++ b/src/backend/commands/prepare.c
@@ -503,7 +503,7 @@ SetRemoteStatementName(Plan *plan, const char *stmt_name, int num_params,
char name[NAMEDATALEN];
/* Nothing to do if parameters are already set for this query */
- if (remotequery->rq_num_params != 0)
+ if (remotequery->remote_num_params != 0)
return 0;
if (stmt_name)
@@ -546,8 +546,8 @@ SetRemoteStatementName(Plan *plan, const char *stmt_name, int num_params,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Passing parameters in PREPARE statement is not supported")));
- remotequery->rq_num_params = num_params;
- remotequery->rq_param_types = param_types;
+ remotequery->remote_num_params = num_params;
+ remotequery->remote_param_types = param_types;
}
else if (IsA(plan, ModifyTable))
{
diff --git a/src/backend/commands/schemacmds.c b/src/backend/commands/schemacmds.c
index 6cc7cee3cf..ea00e171c1 100644
--- a/src/backend/commands/schemacmds.c
+++ b/src/backend/commands/schemacmds.c
@@ -3,6 +3,11 @@
* schemacmds.c
* schema creation/manipulation commands
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -32,7 +37,7 @@
#ifdef PGXC
#include "pgxc/pgxc.h"
-#include "optimizer/pgxcplan.h"
+#include "pgxc/planner.h"
#endif
static void AlterSchemaOwner_internal(HeapTuple tup, Relation rel, Oid newOwnerId);
@@ -132,9 +137,14 @@ CreateSchemaCommand(CreateSchemaStmt *stmt, const char *queryString)
* if not done already.
*/
if (!sentToRemote)
+#ifdef XCP
+ parsetree_list = AddRemoteQueryNode(parsetree_list, queryString,
+ EXEC_ON_ALL_NODES);
+#else
parsetree_list = AddRemoteQueryNode(parsetree_list, queryString,
EXEC_ON_ALL_NODES, false);
#endif
+#endif
/*
* Execute each command contained in the CREATE SCHEMA. Since the grammar
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
index 7e66ac99c7..3835dd92a3 100644
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -3,6 +3,11 @@
* sequence.c
* PostgreSQL sequences support code.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -41,6 +46,9 @@
/* PGXC_COORD */
#include "access/gtm.h"
#include "utils/memutils.h"
+#ifdef XCP
+#include "utils/timestamp.h"
+#endif
#endif
/*
@@ -55,6 +63,12 @@
*/
#define SEQ_MAGIC 0x1717
+/* Configuration options */
+#ifdef XCP
+
+int SequenceRangeVal = 1;
+#endif
+
typedef struct sequence_magic
{
uint32 magic;
@@ -82,6 +96,10 @@ typedef struct SeqTableData
/* if last != cached, we have not used up all the cached values */
int64 increment; /* copy of sequence's increment field */
/* note that increment is zero until we first do read_info() */
+#ifdef XCP
+ TimestampTz last_call_time; /* the time when the last call was made */
+ int64 range_multiplier; /* multiply this value with 2 next time */
+#endif
} SeqTableData;
typedef SeqTableData *SeqTable;
@@ -125,7 +143,7 @@ static void init_params(List *options, bool isInit,
Form_pg_sequence new, List **owned_by, bool *is_restart);
#else
static void init_params(List *options, bool isInit,
- Form_pg_sequence new, List **owned_by);
+ Form_pg_sequence new, List **owned_by);
#endif
static void do_setval(Oid relid, int64 next, bool iscalled);
static void process_owned_by(Relation seqrel, List *owned_by);
@@ -562,7 +580,6 @@ AlterSequence(AlterSeqStmt *stmt)
/* Now okay to update the on-disk tuple */
memcpy(seq, &new, sizeof(FormData_pg_sequence));
-
#ifdef PGXC
increment = new.increment_by;
min_value = new.min_value;
@@ -731,19 +748,85 @@ nextval_internal(Oid relid)
page = BufferGetPage(buf);
#ifdef PGXC /* PGXC_COORD */
+#ifdef XCP
+ /* Allow nextval executed on datanodes */
+ if (!is_temp)
+#else
if (IS_PGXC_COORDINATOR && !is_temp)
+#endif
{
+#ifdef XCP
+ int64 range = seq->cache_value; /* how many values to ask from GTM? */
+ int64 rangemax; /* the max value returned from the GTM for our request */
+#endif
char *seqname = GetGlobalSeqName(seqrel, NULL, NULL);
/*
* Above, we still use the page as a locking mechanism to handle
* concurrency
*/
+#ifdef XCP
+ /*
+ * If the user has set a CACHE parameter, we use that. Else we pass in
+ * the SequenceRangeVal value
+ */
+ if (range == DEFAULT_CACHEVAL && SequenceRangeVal > range)
+ {
+ TimestampTz curtime = GetCurrentTimestamp();
+
+ if (!TimestampDifferenceExceeds(elm->last_call_time,
+ curtime, 1000))
+ {
+ /*
+ * The previous GetNextValGTM call was made just a while back.
+ * Request double the range of what was requested in the
+ * earlier call. Honor the SequenceRangeVal boundary
+ * value to limit very large range requests!
+ */
+ elm->range_multiplier *= 2;
+ if (elm->range_multiplier < SequenceRangeVal)
+ range = elm->range_multiplier;
+ else
+ elm->range_multiplier = range = SequenceRangeVal;
+
+ elog(DEBUG1, "increase sequence range %ld", range);
+ }
+ else if (TimestampDifferenceExceeds(elm->last_call_time,
+ curtime, 5000))
+ {
+ /* The previous GetNextValGTM call was pretty old */
+ range = elm->range_multiplier = DEFAULT_CACHEVAL;
+ elog(DEBUG1, "reset sequence range %ld", range);
+ }
+ else if (TimestampDifferenceExceeds(elm->last_call_time,
+ curtime, 3000))
+ {
+ /*
+ * The previous GetNextValGTM call was made quite some time
+ * ago. Try to reduce the range request to reduce the gap
+ */
+ if (elm->range_multiplier != DEFAULT_CACHEVAL)
+ {
+ range = elm->range_multiplier =
+ rint(elm->range_multiplier/2);
+ elog(DEBUG1, "decrease sequence range %ld", range);
+ }
+ }
+ else
+ {
+ /*
+ * Current range_multiplier alllows to cache sequence values
+ * for 1-3 seconds of work. Keep that rate.
+ */
+ range = elm->range_multiplier;
+ }
+ elm->last_call_time = curtime;
+ }
+
+ result = (int64) GetNextValGTM(seqname, range, &rangemax);
+#else
result = (int64) GetNextValGTM(seqname);
- if (result < 0)
- ereport(ERROR,
- (errcode(ERRCODE_CONNECTION_FAILURE),
- errmsg("GTM error, could not obtain sequence value")));
+#endif
pfree(seqname);
/* Update the on-disk data */
@@ -752,7 +835,11 @@ nextval_internal(Oid relid)
/* save info in local cache */
elm->last = result; /* last returned number */
+#ifdef XCP
+ elm->cached = rangemax; /* last fetched range max limit */
+#else
elm->cached = result; /* last fetched number */
+#endif
elm->last_valid = true;
last_used_seq = elm;
@@ -875,11 +962,11 @@ nextval_internal(Oid relid)
/* Temporary sequences can go through normal process */
if (is_temp)
{
-#endif
/*
* This part is not taken into account,
* result has been received from GTM
*/
+#endif
last = next;
if (rescnt == 1) /* if it's first result - */
result = next; /* it's what to return */
@@ -896,8 +983,8 @@ nextval_internal(Oid relid)
/* Temporary sequences go through normal process */
if (is_temp)
{
-#endif
/* Result has been received from GTM */
+#endif
/* save info in local cache */
elm->last = result; /* last returned number */
elm->cached = last; /* last fetched number */
@@ -978,13 +1065,47 @@ currval_oid(PG_FUNCTION_ARGS)
errmsg("permission denied for sequence %s",
RelationGetRelationName(seqrel))));
+#ifdef XCP
+ {
+ /*
+ * Always contact GTM for currval regardless of valid
+ * elm->last_valid value
+ */
+ {
+ char *seqname = GetGlobalSeqName(seqrel, NULL, NULL);
+ result = (int64) GetCurrentValGTM(seqname);
+ pfree(seqname);
+ }
+ }
+#else
if (!elm->last_valid)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("currval of sequence \"%s\" is not yet defined in this session",
RelationGetRelationName(seqrel))));
+#endif
+
+#ifndef XCP
+#ifdef PGXC
+ if (IS_PGXC_COORDINATOR &&
+ seqrel->rd_backend != MyBackendId)
+ {
+ char *seqname = GetGlobalSeqName(seqrel, NULL, NULL);
+ result = (int64) GetCurrentValGTM(seqname);
+ if (result < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_FAILURE),
+ errmsg("GTM error, could not obtain sequence value")));
+ pfree(seqname);
+ }
+ else {
+#endif
result = elm->last;
+#ifdef PGXC
+ }
+#endif
+#endif
relation_close(seqrel, NoLock);
PG_RETURN_INT64(result);
@@ -1086,7 +1207,12 @@ do_setval(Oid relid, int64 next, bool iscalled)
}
#ifdef PGXC
+#ifdef XCP
+ /* Allow to execute on datanodes */
+ if (!is_temp)
+#else
if (IS_PGXC_COORDINATOR && !is_temp)
+#endif
{
char *seqname = GetGlobalSeqName(seqrel, NULL, NULL);
@@ -1286,6 +1412,10 @@ init_sequence(Oid relid, SeqTable *p_elm, Relation *p_rel)
elm->lxid = InvalidLocalTransactionId;
elm->last_valid = false;
elm->last = elm->cached = elm->increment = 0;
+#ifdef XCP
+ elm->last_call_time = 0;
+ elm->range_multiplier = DEFAULT_CACHEVAL;
+#endif
elm->next = seqtab;
seqtab = elm;
}
@@ -1561,8 +1691,8 @@ init_params(List *options, bool isInit,
snprintf(bufm, sizeof(bufm), INT64_FORMAT, new->max_value);
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("START value (%s) cannot be greater than MAXVALUE (%s)",
- bufs, bufm)));
+ errmsg("START value (%s) cannot be greater than MAXVALUE (%s)",
+ bufs, bufm)));
}
/* RESTART [WITH] */
@@ -1595,8 +1725,8 @@ init_params(List *options, bool isInit,
snprintf(bufm, sizeof(bufm), INT64_FORMAT, new->min_value);
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("RESTART value (%s) cannot be less than MINVALUE (%s)",
- bufs, bufm)));
+ errmsg("RESTART value (%s) cannot be less than MINVALUE (%s)",
+ bufs, bufm)));
}
if (new->last_value > new->max_value)
{
@@ -1607,8 +1737,8 @@ init_params(List *options, bool isInit,
snprintf(bufm, sizeof(bufm), INT64_FORMAT, new->max_value);
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("RESTART value (%s) cannot be greater than MAXVALUE (%s)",
- bufs, bufm)));
+ errmsg("RESTART value (%s) cannot be greater than MAXVALUE (%s)",
+ bufs, bufm)));
}
/* CACHE */
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 25da892c7b..76f7a1858d 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -3,6 +3,11 @@
* tablecmds.c
* Commands for creating and altering table structures and settings
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -92,7 +97,6 @@
#include "catalog/pgxc_class.h"
#include "catalog/pgxc_node.h"
#include "commands/sequence.h"
-#include "optimizer/pgxcship.h"
#include "pgxc/execRemote.h"
#include "pgxc/redistrib.h"
#endif
@@ -672,8 +676,25 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId)
/*
* Add to pgxc_class.
* we need to do this after CommandCounterIncrement
- */
+ * Distribution info is to be added under the following conditions:
+ * 1. The create table command is being run on a coordinator
+ * 2. The create table command is being run in restore mode and
+ * the statement contains distribute by clause.
+ * While adding a new datanode to the cluster an existing dump
+ * that was taken from a datanode is used, and
+ * While adding a new coordinator to the cluster an existing dump
+ * that was taken from a coordinator is used.
+ * The dump taken from a datanode does NOT contain any DISTRIBUTE BY
+ * clause. This fact is used here to make sure that when the
+ * DISTRIBUTE BY clause is missing in the statement the system
+ * should not try to find out the node list itself.
+ */
+#ifdef XCP
+ if ((IS_PGXC_COORDINATOR && stmt->distributeby) ||
+ (isRestoreMode && stmt->distributeby != NULL))
+#else
if (IS_PGXC_COORDINATOR && relkind == RELKIND_RELATION)
+#endif
{
AddRelationDistribution(relationId, stmt->distributeby,
stmt->subcluster, inheritOids, descriptor);
@@ -978,13 +999,8 @@ RangeVarCallbackForDropRelation(const RangeVar *rel, Oid relOid, Oid oldRelOid,
* internal to the group that's being truncated. Finally all the relations
* are truncated and reindexed.
*/
-#ifdef PGXC
-void
-ExecuteTruncate(TruncateStmt *stmt, const char *sql_statement)
-#else
void
ExecuteTruncate(TruncateStmt *stmt)
-#endif
{
List *rels = NIL;
List *relids = NIL;
@@ -995,6 +1011,14 @@ ExecuteTruncate(TruncateStmt *stmt)
SubTransactionId mySubid;
ListCell *cell;
+#ifdef PGXC
+ if (stmt->restart_seqs)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("PGXC does not support RESTART IDENTITY yet"),
+ errdetail("The feature is not supported currently")));
+#endif
+
/*
* Open, exclusive-lock, and check all the explicitly-specified relations
*/
@@ -1246,42 +1270,6 @@ ExecuteTruncate(TruncateStmt *stmt)
resultRelInfo++;
}
-#ifdef PGXC
- /*
- * In Postgres-XC, TRUNCATE needs to be launched to remote nodes before the
- * AFTER triggers are launched. This insures that the triggers are being fired
- * by correct events.
- */
- if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
- {
- bool is_temp = false;
- RemoteQuery *step = makeNode(RemoteQuery);
-
- foreach(cell, stmt->relations)
- {
- Oid relid;
- RangeVar *rel = (RangeVar *) lfirst(cell);
-
- relid = RangeVarGetRelid(rel, NoLock, false);
- if (IsTempTable(relid))
- {
- is_temp = true;
- break;
- }
- }
-
- step->combine_type = COMBINE_TYPE_SAME;
- step->exec_nodes = NULL;
- step->sql_statement = pstrdup(sql_statement);
- step->force_autocommit = false;
- step->exec_type = EXEC_ON_DATANODES;
- step->is_temp = is_temp;
- ExecRemoteUtility(step);
- pfree(step->sql_statement);
- pfree(step);
- }
-#endif
-
/* Handle queued AFTER triggers */
AfterTriggerEndQuery(estate);
@@ -6363,29 +6351,6 @@ ATAddForeignKeyConstraint(AlteredTableInfo *tab, Relation rel,
ffeqoperators[i] = ffeqop;
}
-#ifdef PGXC
- /* Check the shippability of this foreign key */
- if (IS_PGXC_COORDINATOR)
- {
- List *childRefs = NIL, *parentRefs = NIL;
-
- /* Prepare call for shippability check */
- for (i = 0; i < numfks; i++)
- childRefs = lappend_int(childRefs, fkattnum[i]);
- for (i = 0; i < numpks; i++)
- parentRefs = lappend_int(parentRefs, pkattnum[i]);
-
- /* Now check shippability for this foreign key */
- if (!pgxc_check_fk_shippability(GetRelationLocInfo(RelationGetRelid(pkrel)),
- GetRelationLocInfo(RelationGetRelid(rel)),
- parentRefs,
- childRefs))
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("Cannot create foreign key whose evaluation cannot be enforced to remote nodes")));
- }
-#endif
-
/*
* Record the FK constraint in pg_constraint.
*/
@@ -6470,6 +6435,15 @@ ATExecValidateConstraint(Relation rel, char *constrName, bool recurse,
Form_pg_constraint con = NULL;
bool found = false;
+#ifdef XCP
+ /*
+ * Do not validate distributed relations on Coordinator, let Datanode do
+ * that when executing the ALTER TABLE statement.
+ */
+ if (IS_PGXC_COORDINATOR && rel->rd_locator_info)
+ return;
+#endif
+
conrel = heap_open(ConstraintRelationId, RowExclusiveLock);
/*
@@ -10277,16 +10251,12 @@ ATCheckCmd(Relation rel, AlterTableCmd *cmd)
switch (cmd->subtype)
{
case AT_DropColumn:
- {
- AttrNumber attnum = get_attnum(RelationGetRelid(rel),
- cmd->name);
- /* Distribution column cannot be dropped */
- if (IsDistribColumn(RelationGetRelid(rel), attnum))
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ /* Distribution column cannot be dropped */
+ if (IsDistColumnForRelId(RelationGetRelid(rel), cmd->name))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Distribution column cannot be dropped")));
- break;
- }
+ break;
default:
break;
@@ -10308,6 +10278,9 @@ BuildRedistribCommands(Oid relid, List *subCmds)
Oid *new_oid_array; /* Modified list of Oids */
int new_num, i; /* Modified number of Oids */
ListCell *item;
+#ifdef XCP
+ char node_type = PGXC_NODE_DATANODE;
+#endif
/* Get necessary information about relation */
rel = relation_open(redistribState->relid, NoLock);
@@ -10373,49 +10346,18 @@ BuildRedistribCommands(Oid relid, List *subCmds)
/* Build relation node list for new locator info */
for (i = 0; i < new_num; i++)
+#ifdef XCP
+ newLocInfo->nodeList = lappend_int(newLocInfo->nodeList,
+ PGXCNodeGetNodeId(new_oid_array[i],
+ &node_type));
+#else
newLocInfo->nodeList = lappend_int(newLocInfo->nodeList,
PGXCNodeGetNodeId(new_oid_array[i],
PGXC_NODE_DATANODE));
-
+#endif
/* Build the command tree for table redistribution */
PGXCRedistribCreateCommandList(redistribState, newLocInfo);
- /*
- * Using the new locator info already available, check if constraints on
- * relation are compatible with the new distribution.
- */
- foreach(item, RelationGetIndexList(rel))
- {
- Oid indid = lfirst_oid(item);
- Relation indexRel = index_open(indid, AccessShareLock);
- List *indexColNums = NIL;
- int2vector colIds = indexRel->rd_index->indkey;
-
- /*
- * Prepare call to shippability check. Attributes set to 0 correspond
- * to index expressions and are evaluated internally, so they are not
- * appended in given list.
- */
- for (i = 0; i < colIds.dim1; i++)
- {
- if (colIds.values[i] > 0)
- indexColNums = lappend_int(indexColNums, colIds.values[i]);
- }
-
- if (!pgxc_check_index_shippability(newLocInfo,
- indexRel->rd_index->indisprimary,
- indexRel->rd_index->indisunique,
- indexRel->rd_index->indisexclusion,
- indexColNums,
- indexRel->rd_indexprs))
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("Cannot alter table to distribution incompatible "
- "with existing constraints")));
-
- index_close(indexRel, AccessShareLock);
- }
-
/* Clean up */
FreeRelationLocInfo(newLocInfo);
pfree(new_oid_array);
@@ -10619,10 +10561,10 @@ AlterTableNamespace(AlterObjectSchemaStmt *stmt)
if (IS_PGXC_COORDINATOR &&
!IsConnFromCoord() &&
rel->rd_rel->relkind == RELKIND_SEQUENCE &&
- !IsTempSequence(relid))
+ !IsTempSequence(RelationGetRelid(rel)))
{
char *seqname = GetGlobalSeqName(rel, NULL, NULL);
- char *newseqname = GetGlobalSeqName(rel, NULL, stmt->newschema);
+ char *newseqname = GetGlobalSeqName(rel, NULL, get_namespace_name(nspOid));
/* We also need to rename it on the GTM */
if (RenameSequenceGTM(seqname, newseqname) < 0)
@@ -10638,8 +10580,6 @@ AlterTableNamespace(AlterObjectSchemaStmt *stmt)
}
#endif
- /* close rel, but keep lock until commit */
- relation_close(rel, NoLock);
}
/*
@@ -10796,7 +10736,7 @@ AlterSeqNamespaces(Relation classRel, Relation rel,
!IsTempSequence(RelationGetRelid(seqRel)))
{
char *seqname = GetGlobalSeqName(seqRel, NULL, NULL);
- char *newseqname = GetGlobalSeqName(seqRel, NULL, newNspName);
+ char *newseqname = GetGlobalSeqName(seqRel, NULL, get_namespace_name(newNspOid));
/* We also need to rename it on the GTM */
if (RenameSequenceGTM(seqname, newseqname) < 0)
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index 04e2cc1acb..b540ac07cd 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -3,6 +3,11 @@
* trigger.c
* PostgreSQL TRIGGERs support code.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -56,7 +61,6 @@
#include "utils/tqual.h"
#ifdef PGXC
#include "pgxc/pgxc.h"
-#include "optimizer/pgxcship.h"
#endif
@@ -90,10 +94,6 @@ static void AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
int event, bool row_trigger,
HeapTuple oldtup, HeapTuple newtup,
List *recheckIndexes, Bitmapset *modifiedCols);
-#ifdef PGXC
-static bool pgxc_is_trigger_shippable(Trigger *trigger);
-static bool pgxc_is_trigger_firable(Trigger *trigger);
-#endif
/*
@@ -1932,12 +1932,6 @@ ExecBSInsertTriggers(EState *estate, ResultRelInfo *relinfo)
NULL, NULL, NULL))
continue;
-#ifdef PGXC
- /* Fire the trigger if authorized */
- if (!pgxc_is_trigger_firable(trigger))
- continue;
-#endif
-
LocTriggerData.tg_trigger = trigger;
newtuple = ExecCallTriggerFunc(&LocTriggerData,
i,
@@ -1993,12 +1987,6 @@ ExecBRInsertTriggers(EState *estate, ResultRelInfo *relinfo,
NULL, NULL, newtuple))
continue;
-#ifdef PGXC
- /* Fire the trigger if authorized */
- if (!pgxc_is_trigger_firable(trigger))
- continue;
-#endif
-
LocTriggerData.tg_trigtuple = oldtuple = newtuple;
LocTriggerData.tg_trigtuplebuf = InvalidBuffer;
LocTriggerData.tg_trigger = trigger;
@@ -2074,12 +2062,6 @@ ExecIRInsertTriggers(EState *estate, ResultRelInfo *relinfo,
NULL, NULL, newtuple))
continue;
-#ifdef PGXC
- /* Fire the trigger if authorized */
- if (!pgxc_is_trigger_firable(trigger))
- continue;
-#endif
-
LocTriggerData.tg_trigtuple = oldtuple = newtuple;
LocTriggerData.tg_trigtuplebuf = InvalidBuffer;
LocTriggerData.tg_trigger = trigger;
@@ -2149,12 +2131,6 @@ ExecBSDeleteTriggers(EState *estate, ResultRelInfo *relinfo)
NULL, NULL, NULL))
continue;
-#ifdef PGXC
- /* Fire the trigger if authorized */
- if (!pgxc_is_trigger_firable(trigger))
- continue;
-#endif
-
LocTriggerData.tg_trigger = trigger;
newtuple = ExecCallTriggerFunc(&LocTriggerData,
i,
@@ -2217,12 +2193,6 @@ ExecBRDeleteTriggers(EState *estate, EPQState *epqstate,
NULL, trigtuple, NULL))
continue;
-#ifdef PGXC
- /* Fire the trigger if authorized */
- if (!pgxc_is_trigger_firable(trigger))
- continue;
-#endif
-
LocTriggerData.tg_trigtuple = trigtuple;
LocTriggerData.tg_trigtuplebuf = InvalidBuffer;
LocTriggerData.tg_trigger = trigger;
@@ -2290,12 +2260,6 @@ ExecIRDeleteTriggers(EState *estate, ResultRelInfo *relinfo,
NULL, trigtuple, NULL))
continue;
-#ifdef PGXC
- /* Fire the trigger if authorized */
- if (!pgxc_is_trigger_firable(trigger))
- continue;
-#endif
-
LocTriggerData.tg_trigtuple = trigtuple;
LocTriggerData.tg_trigtuplebuf = InvalidBuffer;
LocTriggerData.tg_trigger = trigger;
@@ -2351,12 +2315,6 @@ ExecBSUpdateTriggers(EState *estate, ResultRelInfo *relinfo)
modifiedCols, NULL, NULL))
continue;
-#ifdef PGXC
- /* Fire the trigger if authorized */
- if (!pgxc_is_trigger_firable(trigger))
- continue;
-#endif
-
LocTriggerData.tg_trigger = trigger;
newtuple = ExecCallTriggerFunc(&LocTriggerData,
i,
@@ -2441,12 +2399,6 @@ ExecBRUpdateTriggers(EState *estate, EPQState *epqstate,
modifiedCols, trigtuple, newtuple))
continue;
-#ifdef PGXC
- /* Fire the trigger if authorized */
- if (!pgxc_is_trigger_firable(trigger))
- continue;
-#endif
-
LocTriggerData.tg_trigtuple = trigtuple;
LocTriggerData.tg_newtuple = oldtuple = newtuple;
LocTriggerData.tg_trigtuplebuf = InvalidBuffer;
@@ -2534,12 +2486,6 @@ ExecIRUpdateTriggers(EState *estate, ResultRelInfo *relinfo,
NULL, trigtuple, newtuple))
continue;
-#ifdef PGXC
- /* Fire the trigger if authorized */
- if (!pgxc_is_trigger_firable(trigger))
- continue;
-#endif
-
LocTriggerData.tg_trigtuple = trigtuple;
LocTriggerData.tg_newtuple = oldtuple = newtuple;
LocTriggerData.tg_trigtuplebuf = InvalidBuffer;
@@ -2611,12 +2557,6 @@ ExecBSTruncateTriggers(EState *estate, ResultRelInfo *relinfo)
NULL, NULL, NULL))
continue;
-#ifdef PGXC
- /* Fire the trigger if authorized */
- if (!pgxc_is_trigger_firable(trigger))
- continue;
-#endif
-
LocTriggerData.tg_trigger = trigger;
newtuple = ExecCallTriggerFunc(&LocTriggerData,
i,
@@ -4354,7 +4294,15 @@ AfterTriggerSetState(ConstraintsSetStmt *stmt)
/*
* Not found ?
*/
+#ifdef XCP
+ /*
+	 * Constraints exist wherever their table exists, so it's OK if the
+	 * constraint is not found on a data node. Silently ignore that.
+ */
+ if (!found && !IS_PGXC_DATANODE)
+#else
if (!found)
+#endif
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_OBJECT),
errmsg("constraint \"%s\" does not exist",
@@ -4683,12 +4631,6 @@ AfterTriggerSaveEvent(EState *estate, ResultRelInfo *relinfo,
modifiedCols, oldtup, newtup))
continue;
-#ifdef PGXC
- /* Fire the trigger if authorized */
- if (!pgxc_is_trigger_firable(trigger))
- continue;
-#endif
-
/*
* If this is an UPDATE of a PK table or FK table that does not change
* the PK or FK respectively, we can skip queuing the event: there is
@@ -4766,197 +4708,3 @@ pg_trigger_depth(PG_FUNCTION_ARGS)
{
PG_RETURN_INT32(MyTriggerDepth);
}
-
-
-#ifdef PGXC
-/*
- * pgxc_check_triggers_shippability
- * Check if given relation can be shipped entirely based on its potential
- * triggers actions. If at least one trigger is not shippable then the given
- * relation cannot be shipped completely to remote nodes for given command
- * type.
- */
-bool
-pgxc_check_triggers_shippability(Oid relid, CmdType commandType)
-{
- Relation rel = relation_open(relid, AccessShareLock);
- bool res = true;
- int i;
- TriggerDesc *trigdesc;
-
- /* Relation has no triggers, can safely return */
- if (!rel->rd_rel->relhastriggers)
- goto finish;
-
- /* Rebuild trigger list if necessary */
- if (rel->rd_rel->relhastriggers && rel->trigdesc == NULL)
- RelationBuildTriggers(rel);
-
- /* Definitely no triggers for this relation */
- if (rel->trigdesc == NULL)
- goto finish;
-
- trigdesc = rel->trigdesc;
-
- /*
- * Check if there are any triggers related to given command
- * If there are any, we need to scan the triggers to be sure
- * that they are safe.
- */
- switch (commandType)
- {
- case CMD_INSERT:
- if (!trigdesc->trig_insert_before_row &&
- !trigdesc->trig_insert_after_row &&
- !trigdesc->trig_insert_instead_row &&
- !trigdesc->trig_insert_before_statement &&
- !trigdesc->trig_insert_after_statement)
- goto finish;
- break;
- case CMD_UPDATE:
- if (!trigdesc->trig_update_before_row &&
- !trigdesc->trig_update_after_row &&
- !trigdesc->trig_update_instead_row &&
- !trigdesc->trig_update_before_statement &&
- !trigdesc->trig_update_after_statement)
- goto finish;
- break;
- case CMD_DELETE:
- if (!trigdesc->trig_delete_before_row &&
- !trigdesc->trig_delete_after_row &&
- !trigdesc->trig_delete_instead_row &&
- !trigdesc->trig_delete_before_statement &&
- !trigdesc->trig_delete_after_statement)
- goto finish;
- break;
- case CMD_UTILITY:
- /* Trigger might be based on an event */
- if (!trigdesc->trig_truncate_before_statement &&
- !trigdesc->trig_truncate_after_statement)
- goto finish;
- break;
- case CMD_SELECT:
- default:
- Assert(0); /* Shouldn't come here */
- }
-
- /*
- * By being here, it is sure that there are triggers on this relation
- * that are based on events based on the command type invocated.
- * So let's scan each potential trigger and be such that it is shippable.
- */
- for (i = 0; i < trigdesc->numtriggers; i++)
- {
- Trigger *trigger = &trigdesc->triggers[i];
- int16 tgtype = trigger->tgtype;
-
- switch (commandType)
- {
- case CMD_INSERT:
- /* Don't mind if trigger is not involved in INSERT */
- if (!TRIGGER_FOR_INSERT(tgtype))
- continue;
- break;
- case CMD_UPDATE:
- /* Don't mind if trigger is not involved in UPDATE */
- if (!TRIGGER_FOR_UPDATE(tgtype))
- continue;
- break;
- case CMD_DELETE:
- /* Don't mind if trigger is not involved in UPDATE */
- if (!TRIGGER_FOR_DELETE(tgtype))
- continue;
- break;
- /* Trigger might be on a truncate */
- case CMD_UTILITY:
- /* Don't mind if trigger is not involved in TRUNCATE */
- if (!TRIGGER_FOR_TRUNCATE(tgtype))
- continue;
- break;
- case CMD_SELECT:
- default:
- Assert(0); /* Shouldn't come here */
- continue;
- }
-
- /* Check trigger shippability */
- res = pgxc_is_trigger_shippable(trigger);
-
- /* Leave if trigger is not shippable */
- if (!res)
- goto finish;
- }
-
-finish:
- relation_close(rel, AccessShareLock);
- return res;
-}
-
-
-/*
- * pgxc_is_trigger_shippable
- * Depending on the node type where this trigger is evaluated and
- * its shippability, determine if the trigger can be fired or not.
- */
-static bool
-pgxc_is_trigger_firable(Trigger *trigger)
-{
- bool is_shippable = pgxc_is_trigger_shippable(trigger);
-
- /*
- * If trigger is based on a constraint or is internal, enforce its launch
- * whatever the node type where we are for the time being.
- * PGXCTODO: we need to remove this condition once constraints and triggers
- * are better implemented within Postgres-XC as a constraint can be locally
- * evaluated on remote nodes depending on the distribution type of the table
- * on which it is defined or on its parent/child distribution types.
- */
- if (trigger->tgisinternal)
- return true;
-
- /* A non-shippable trigger can be fired safely on a local Coordinator */
- if (!is_shippable && IS_PGXC_COORDINATOR && !IsConnFromCoord())
- return true;
-
- /* A shippable trigger can be fired safely on a remote node */
- if (is_shippable && IsConnFromCoord())
- return true;
-
- return false;
-}
-
-
-/*
- * pgxc_is_trigger_shippable
- * Check if trigger is shippable to a remote node
- */
-static bool
-pgxc_is_trigger_shippable(Trigger *trigger)
-{
- bool res = true;
-
- /*
- * If trigger is based on a constraint or is internal, enforce its launch
- * whatever the node type where we are for the time being.
- * PGXCTODO: we need to remove this condition once constraints and triggers
- * are better implemented within Postgres-XC as a constraint can be locally
- * evaluated on remote nodes depending on the distribution type of the table
- * on which it is defined or on its parent/child distribution types.
- */
- if (trigger->tgisinternal)
- return true;
-
- /*
- * INSTEAD OF triggers can only be defined on views, which are defined
- * only on Coordinators, so they cannot be shipped.
- */
- if (TRIGGER_FOR_INSTEAD(trigger->tgtype))
- res = false;
-
- /* Finally check if function called is shippable */
- if (!pgxc_is_func_shippable(trigger->tgfoid))
- res = false;
-
- return res;
-}
-#endif
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index f96e0700f8..e638d2898b 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -9,6 +9,11 @@
* in cluster.c.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -51,6 +56,13 @@
#ifdef PGXC
#include "pgxc/pgxc.h"
#endif
+#ifdef XCP
+#include "executor/executor.h"
+#include "nodes/makefuncs.h"
+#include "pgxc/execRemote.h"
+#include "pgxc/planner.h"
+#include "utils/lsyscache.h"
+#endif /* XCP */
/*
* GUC parameters
@@ -70,7 +82,6 @@ static void vac_truncate_clog(TransactionId frozenXID);
static bool vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast,
bool for_wraparound);
-
/*
* Primary entry point for VACUUM and ANALYZE commands.
*
@@ -1091,6 +1102,17 @@ vacuum_rel(Oid relid, VacuumStmt *vacstmt, bool do_toast, bool for_wraparound)
save_sec_context | SECURITY_RESTRICTED_OPERATION);
save_nestlevel = NewGUCNestLevel();
+#ifdef XCP
+ /*
+ * If we are on coordinator and target relation is distributed, read
+ * the statistics from the data node instead of vacuuming local relation.
+ */
+ if (IS_PGXC_COORDINATOR && onerel->rd_locator_info)
+ {
+ vacuum_rel_coordinator(onerel);
+ }
+ else
+#endif
/*
* Do the actual work --- either FULL or "lazy" vacuum
*/
@@ -1231,3 +1253,281 @@ vacuum_delay_point(void)
CHECK_FOR_INTERRUPTS();
}
}
+
+#ifdef XCP
+/*
+ * For the data node query, make up a TargetEntry representing the specified
+ * column of the pg_class catalog table.
+ */
+TargetEntry *
+make_relation_tle(Oid reloid, const char *relname, const char *column)
+{
+ HeapTuple tuple;
+ Var *var;
+ Form_pg_attribute att_tup;
+ TargetEntry *tle;
+
+ tuple = SearchSysCacheAttName(reloid, column);
+ if (!HeapTupleIsValid(tuple))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_COLUMN),
+ errmsg("column \"%s\" of relation \"%s\" does not exist",
+ column, relname)));
+ att_tup = (Form_pg_attribute) GETSTRUCT(tuple);
+
+ var = makeVar(1,
+ att_tup->attnum,
+ att_tup->atttypid,
+ att_tup->atttypmod,
+ InvalidOid,
+ 0);
+
+ tle = makeTargetEntry((Expr *) var, att_tup->attnum, NULL, false);
+ ReleaseSysCache(tuple);
+ return tle;
+}
+
+
+/*
+ * Get relation statistics from remote data nodes
+ * Returns number of nodes that returned correct statistics.
+ */
+static int
+get_remote_relstat(char *nspname, char *relname, bool replicated,
+ int32 *pages, float4 *tuples, TransactionId *frozenXid)
+{
+ StringInfoData query;
+ EState *estate;
+ MemoryContext oldcontext;
+ RemoteQuery *step;
+ RemoteQueryState *node;
+ TupleTableSlot *result;
+ int validpages,
+ validtuples,
+ validfrozenxids;
+
+ /* Make up query string */
+ initStringInfo(&query);
+ appendStringInfo(&query, "SELECT c.relpages, "
+ "c.reltuples, "
+ "c.relfrozenxid "
+ "FROM pg_class c JOIN pg_namespace n "
+ "ON c.relnamespace = n.oid "
+ "WHERE n.nspname = '%s' "
+ "AND c.relname = '%s'",
+ nspname, relname);
+
+ /* Build up RemoteQuery */
+ step = makeNode(RemoteQuery);
+
+ step->combine_type = COMBINE_TYPE_NONE;
+ step->exec_nodes = NULL;
+ step->sql_statement = query.data;
+ step->force_autocommit = true;
+ step->exec_type = EXEC_ON_DATANODES;
+
+ /* Add targetlist entries */
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(RelationRelationId,
+ "pg_class",
+ "relpages"));
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(RelationRelationId,
+ "pg_class",
+ "reltuples"));
+ step->scan.plan.targetlist = lappend(step->scan.plan.targetlist,
+ make_relation_tle(RelationRelationId,
+ "pg_class",
+ "relfrozenxid"));
+
+ /* Execute query on the data nodes */
+ estate = CreateExecutorState();
+
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+
+ estate->es_snapshot = GetActiveSnapshot();
+
+ node = ExecInitRemoteQuery(step, estate, 0);
+ MemoryContextSwitchTo(oldcontext);
+ /* get ready to combine results */
+ *pages = 0;
+ *tuples = 0.0;
+ *frozenXid = InvalidTransactionId;
+ validpages = 0;
+ validtuples = 0;
+ validfrozenxids = 0;
+ result = ExecRemoteQuery(node);
+ while (result != NULL && !TupIsNull(result))
+ {
+ Datum value;
+ bool isnull;
+ /* Process statistics from the data node */
+ value = slot_getattr(result, 1, &isnull); /* relpages */
+ if (!isnull)
+ {
+ validpages++;
+ *pages += DatumGetInt32(value);
+ }
+ value = slot_getattr(result, 2, &isnull); /* reltuples */
+ if (!isnull)
+ {
+ validtuples++;
+ *tuples += DatumGetFloat4(value);
+ }
+ value = slot_getattr(result, 3, &isnull); /* relfrozenxid */
+ if (!isnull)
+ {
+ /*
+ * relfrozenxid on coordinator should be the lowest one from the
+ * datanodes.
+ */
+ TransactionId xid = DatumGetTransactionId(value);
+ if (TransactionIdIsValid(xid))
+ {
+ validfrozenxids++;
+ if (!TransactionIdIsValid(*frozenXid) ||
+ TransactionIdPrecedes(xid, *frozenXid))
+ {
+ *frozenXid = xid;
+ }
+ }
+ }
+ /* fetch next */
+ result = ExecRemoteQuery(node);
+ }
+ ExecEndRemoteQuery(node);
+
+ if (replicated)
+ {
+ /*
+ * Normally numbers should be the same on the nodes, but relations
+		 * are autovacuum'ed independently, so they may differ.
+ * Average is good enough approximation in this case.
+ */
+ if (validpages > 0)
+ *pages /= validpages;
+
+ if (validtuples > 0)
+ *tuples /= validtuples;
+ }
+
+ if (validfrozenxids < validpages || validfrozenxids < validtuples)
+ {
+ /*
+ * If some node returned invalid value for frozenxid we can not set
+ * it on coordinator. There are other cases when returned value of
+ * frozenXid should be ignored, these cases are checked by caller.
+ * Basically, to be sure, there should be one value from each node,
+ * where the table is partitioned.
+ */
+ *frozenXid = InvalidTransactionId;
+ return Max(validpages, validtuples);
+ }
+ else
+ {
+ return validfrozenxids;
+ }
+}
+
+
+/*
+ * Coordinator does not contain any data, so we never need to vacuum relations.
+ * This function only updates optimizer statistics based on info from the
+ * data nodes.
+ */
+void
+vacuum_rel_coordinator(Relation onerel)
+{
+ char *nspname;
+ char *relname;
+ /* fields to combine relation statistics */
+ int32 num_pages;
+ float4 num_tuples;
+ TransactionId min_frozenxid;
+ bool hasindex;
+ bool replicated;
+ int rel_nodes;
+
+ /* Get the relation identifier */
+ relname = RelationGetRelationName(onerel);
+ nspname = get_namespace_name(RelationGetNamespace(onerel));
+
+ elog(LOG, "Getting relation statistics for %s.%s", nspname, relname);
+
+ replicated = IsLocatorReplicated(RelationGetLocatorType(onerel));
+ /*
+ * Get stats from the remote nodes. Function returns the number of nodes
+ * returning correct stats.
+ */
+ rel_nodes = get_remote_relstat(nspname, relname, replicated,
+ &num_pages, &num_tuples, &min_frozenxid);
+ if (rel_nodes > 0)
+ {
+ int nindexes;
+ Relation *Irel;
+ int nodes = list_length(RelationGetLocInfo(onerel)->nodeList);
+
+ vac_open_indexes(onerel, ShareUpdateExclusiveLock, &nindexes, &Irel);
+ hasindex = (nindexes > 0);
+
+ if (hasindex)
+ {
+ int i;
+
+ /* Fetch index stats */
+ for (i = 0; i < nindexes; i++)
+ {
+ int32 idx_pages;
+ float4 idx_tuples;
+ TransactionId idx_frozenxid;
+ int idx_nodes;
+
+ /* Get the index identifier */
+ relname = RelationGetRelationName(Irel[i]);
+ nspname = get_namespace_name(RelationGetNamespace(Irel[i]));
+ /* Index is replicated if parent relation is replicated */
+ idx_nodes = get_remote_relstat(nspname, relname, replicated,
+ &idx_pages, &idx_tuples, &idx_frozenxid);
+ if (idx_nodes > 0)
+ {
+ /*
+ * Do not update the frozenxid if information was not from
+ * all the expected nodes.
+ */
+ if (idx_nodes < nodes)
+ {
+ idx_frozenxid = InvalidTransactionId;
+ }
+ /* save changes */
+ vac_update_relstats(Irel[i],
+ (BlockNumber) idx_pages,
+ (double) idx_tuples,
+ 0,
+ false,
+ idx_frozenxid);
+ }
+ }
+ }
+
+ /* Done with indexes */
+ vac_close_indexes(nindexes, Irel, NoLock);
+
+ /*
+ * Do not update the frozenxid if information was not from all
+ * the expected nodes.
+ */
+ if (rel_nodes < nodes)
+ {
+ min_frozenxid = InvalidTransactionId;
+ }
+
+ /* save changes */
+ vac_update_relstats(onerel,
+ (BlockNumber) num_pages,
+ (double) num_tuples,
+ visibilitymap_count(onerel),
+ hasindex,
+ min_frozenxid);
+ }
+}
+#endif
diff --git a/src/backend/commands/variable.c b/src/backend/commands/variable.c
index 112703819e..ea2211a27b 100644
--- a/src/backend/commands/variable.c
+++ b/src/backend/commands/variable.c
@@ -4,6 +4,11 @@
* Routines for handling specialized SET variables.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -20,6 +25,9 @@
#include "access/xact.h"
#include "catalog/pg_authid.h"
+#ifdef XCP
+#include "catalog/pgxc_node.h"
+#endif
#include "commands/variable.h"
#include "miscadmin.h"
#include "utils/acl.h"
@@ -890,6 +898,126 @@ assign_session_authorization(const char *newval, void *extra)
}
+#ifdef XCP
+
+/*
+ * SET GLOBAL SESSION
+ */
+
+typedef struct
+{
+ /* This is the "extra" state for GLOBAL SESSION */
+ Oid coordid;
+ int coordpid;
+} global_session_extra;
+
+
+bool
+check_global_session(char **newval, void **extra, GucSource source)
+{
+ HeapTuple coordTup;
+ Oid coordid;
+ char *separatorPos;
+ int coordpid;
+ global_session_extra *myextra;
+
+ /* Do nothing for the boot_val default of NULL */
+ if (*newval == NULL)
+ return true;
+
+ if (strcmp(*newval, "none") == 0)
+ {
+ /* hardwired translation */
+ coordid = InvalidOid;
+ coordpid = 0;
+ }
+ else
+ {
+ if (!IsTransactionState())
+ {
+ /*
+ * Can't do catalog lookups, so fail. The result of this is that
+ * global_session cannot be set in postgresql.conf, which seems
+ * like a good thing anyway, so we don't work hard to avoid it.
+ */
+ return false;
+ }
+
+ /*
+		 * Get a pointer to the '_' character separating coordinator name from pid in the
+ * global session identifier
+ */
+ separatorPos = strrchr(*newval, '_');
+ if (separatorPos == NULL)
+ {
+ GUC_check_errmsg("malformed Global Session identifier: \"%s\"", *newval);
+ return false;
+ }
+
+ /*
+ * The pid is written immediately after the separator
+ */
+ coordpid = atoi(separatorPos + 1);
+ if (coordpid <= 0)
+ {
+ GUC_check_errmsg("malformed Global Session identifier: \"%s\"", *newval);
+ return false;
+ }
+
+
+ /*
+		 * Temporarily truncate the Global Session identifier to extract the session name
+ */
+ *separatorPos = '\0';
+ /* Look up the nodename */
+ coordTup = SearchSysCache1(PGXCNODENAME, PointerGetDatum(*newval));
+ if (!HeapTupleIsValid(coordTup))
+ {
+ *separatorPos = '_';
+ GUC_check_errmsg("node \"%s\" does not exist", *newval);
+ return false;
+ }
+
+ if (((Form_pgxc_node) GETSTRUCT(coordTup))->node_type != PGXC_NODE_COORDINATOR)
+ {
+ ReleaseSysCache(coordTup);
+ *separatorPos = '_';
+ GUC_check_errmsg("node \"%s\" is not a coordinator", *newval);
+ return false;
+ }
+
+ coordid = HeapTupleGetOid(coordTup);
+
+ *separatorPos = '_';
+ ReleaseSysCache(coordTup);
+ }
+
+	/* Set up "extra" struct for assign_global_session to use */
+ myextra = (global_session_extra *) malloc(sizeof(global_session_extra));
+ if (!myextra)
+ return false;
+ myextra->coordid = coordid;
+ myextra->coordpid = coordpid;
+ *extra = (void *) myextra;
+
+ return true;
+}
+
+
+void
+assign_global_session(const char *newval, void *extra)
+{
+ global_session_extra *myextra = (global_session_extra *) extra;
+
+ /* Do nothing for the boot_val default of NULL */
+ if (!myextra)
+ return;
+
+ SetGlobalSession(myextra->coordid, myextra->coordpid);
+}
+#endif
+
+
/*
* SET ROLE
*
diff --git a/src/backend/commands/view.c b/src/backend/commands/view.c
index 6ab0ce8345..cdd7c64870 100644
--- a/src/backend/commands/view.c
+++ b/src/backend/commands/view.c
@@ -3,6 +3,11 @@
* view.c
* use rewrite rules to construct views
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -511,6 +516,14 @@ DefineView(ViewStmt *stmt, const char *queryString)
if (view->relpersistence == RELPERSISTENCE_PERMANENT
&& isViewOnTempTable(viewParse))
{
+ view = copyObject(view); /* don't corrupt original command */
+#ifdef XCP
+ /*
+ * Change original command as well - we do not want to create that view
+ * on other coordinators where temp table does not exist
+ */
+ stmt->view->relpersistence = RELPERSISTENCE_TEMP;
+#endif
view->relpersistence = RELPERSISTENCE_TEMP;
ereport(NOTICE,
(errmsg("view \"%s\" will be a temporary view",
@@ -518,10 +531,12 @@ DefineView(ViewStmt *stmt, const char *queryString)
}
#ifdef PGXC
+#ifndef XCP
/* In case view is temporary, be sure not to use 2PC on such relations */
if (view->relpersistence == RELPERSISTENCE_TEMP)
ExecSetTempObjectIncluded();
#endif
+#endif
/*
* Create the view relation
diff --git a/src/backend/executor/Makefile b/src/backend/executor/Makefile
index 6081b56c08..ffe97a90b2 100644
--- a/src/backend/executor/Makefile
+++ b/src/backend/executor/Makefile
@@ -24,6 +24,6 @@ OBJS = execAmi.o execCurrent.o execGrouping.o execJunk.o execMain.o \
nodeSeqscan.o nodeSetOp.o nodeSort.o nodeUnique.o \
nodeValuesscan.o nodeCtescan.o nodeWorktablescan.o \
nodeGroup.o nodeSubplan.o nodeSubqueryscan.o nodeTidscan.o \
- nodeForeignscan.o nodeWindowAgg.o tstoreReceiver.o spi.o
+ nodeForeignscan.o nodeWindowAgg.o producerReceiver.o tstoreReceiver.o spi.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c
index 462d137e29..4008f39582 100644
--- a/src/backend/executor/execAmi.c
+++ b/src/backend/executor/execAmi.c
@@ -3,6 +3,11 @@
* execAmi.c
* miscellaneous executor access method routines
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -200,10 +205,16 @@ ExecReScan(PlanState *node)
break;
#ifdef PGXC
+#ifdef XCP
+ case T_RemoteSubplanState:
+ ExecReScanRemoteSubplan((RemoteSubplanState *) node);
+ break;
+#else
case T_RemoteQueryState:
ExecRemoteQueryReScan((RemoteQueryState *) node, node->ps_ExprContext);
break;
#endif
+#endif
case T_NestLoopState:
ExecReScanNestLoop((NestLoopState *) node);
break;
diff --git a/src/backend/executor/execCurrent.c b/src/backend/executor/execCurrent.c
index 9eaa4710d5..e1a140b7dd 100644
--- a/src/backend/executor/execCurrent.c
+++ b/src/backend/executor/execCurrent.c
@@ -3,6 +3,11 @@
* execCurrent.c
* executor support for WHERE CURRENT OF cursor
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -268,6 +273,7 @@ search_plan_tree(PlanState *node, Oid table_oid)
switch (nodeTag(node))
{
#ifdef PGXC
+#ifndef XCP
case T_RemoteQueryState:
{
RemoteQueryState *rqs = (RemoteQueryState *) node;
@@ -275,6 +281,7 @@ search_plan_tree(PlanState *node, Oid table_oid)
return sstate;
}
#endif
+#endif
/*
* scan nodes can all be treated alike
*/
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 3a95dd109b..1283b39e89 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -26,6 +26,11 @@
* before ExecutorEnd. This can be omitted only in case of EXPLAIN,
* which should also omit ExecutorRun.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -59,6 +64,11 @@
#include "pgxc/pgxc.h"
#include "commands/copy.h"
#endif
+#ifdef XCP
+#include "access/gtm.h"
+#include "pgxc/execRemote.h"
+#include "pgxc/poolmgr.h"
+#endif
/* Hooks for plugins to get control in ExecutorStart/Run/Finish/End */
ExecutorStart_hook_type ExecutorStart_hook = NULL;
@@ -153,8 +163,39 @@ standard_ExecutorStart(QueryDesc *queryDesc, int eflags)
estate->es_param_list_info = queryDesc->params;
if (queryDesc->plannedstmt->nParamExec > 0)
+#ifdef XCP
+ {
+ estate->es_param_exec_vals = (ParamExecData *)
+ palloc0(queryDesc->plannedstmt->nParamExec * sizeof(ParamExecData));
+ if (queryDesc->plannedstmt->nParamRemote > 0)
+ {
+ ParamListInfo extparams = estate->es_param_list_info;
+ int i = queryDesc->plannedstmt->nParamRemote;
+ while (--i >= 0 &&
+ queryDesc->plannedstmt->remoteparams[i].paramkind == PARAM_EXEC)
+ {
+ int paramno = queryDesc->plannedstmt->remoteparams[i].paramid;
+ ParamExecData *prmdata;
+
+ Assert(paramno >= 0 &&
+ paramno < queryDesc->plannedstmt->nParamExec);
+ prmdata = &(estate->es_param_exec_vals[paramno]);
+ prmdata->value = extparams->params[i].value;
+ prmdata->isnull = extparams->params[i].isnull;
+ prmdata->ptype = extparams->params[i].ptype;
+ }
+ /*
+ * Truncate exec parameters from the list of received parameters
+ * to avoid sending down duplicates if there are multiple levels
+ * of RemoteSubplan statements
+ */
+ extparams->numParams = i + 1;
+ }
+ }
+#else
estate->es_param_exec_vals = (ParamExecData *)
palloc0(queryDesc->plannedstmt->nParamExec * sizeof(ParamExecData));
+#endif
/*
* If non-read-only query, set the command ID to mark output tuples with
@@ -766,8 +807,10 @@ InitPlan(QueryDesc *queryDesc, int eflags)
/* es_result_relation_info is NULL except when within ModifyTable */
estate->es_result_relation_info = NULL;
#ifdef PGXC
+#ifndef XCP
estate->es_result_remoterel = NULL;
#endif
+#endif
}
else
{
@@ -778,7 +821,9 @@ InitPlan(QueryDesc *queryDesc, int eflags)
estate->es_num_result_relations = 0;
estate->es_result_relation_info = NULL;
#ifdef PGXC
- estate->es_result_remoterel = NULL;
+#ifndef XCP
+	estate->es_result_remoterel = NULL;
+#endif
#endif
}
@@ -869,6 +914,16 @@ InitPlan(QueryDesc *queryDesc, int eflags)
sp_eflags = eflags & EXEC_FLAG_EXPLAIN_ONLY;
if (bms_is_member(i, plannedstmt->rewindPlanIDs))
sp_eflags |= EXEC_FLAG_REWIND;
+#ifdef XCP
+ /*
+	 * The distributed executor may never execute this plan locally because the
+	 * referencing subplan runs on a remote node, so we can save some resources.
+ * At the moment only RemoteSubplan is aware of this flag, it is
+ * skipping sending down subplan.
+ * ExecInitSubPlan takes care about finishing initialization.
+ */
+ sp_eflags |= EXEC_FLAG_SUBPLAN;
+#endif
subplanstate = ExecInitNode(subplan, estate, sp_eflags);
@@ -894,7 +949,15 @@ InitPlan(QueryDesc *queryDesc, int eflags)
* Initialize the junk filter if needed. SELECT queries need a filter if
* there are any junk attrs in the top-level tlist.
*/
+#ifdef XCP
+ /*
+ * We need to keep junk attrs in intermediate results, they may be needed
+ * in upper level plans on the receiving side
+ */
+ if (!IS_PGXC_DATANODE && operation == CMD_SELECT)
+#else
if (operation == CMD_SELECT)
+#endif
{
bool junk_filter_needed = false;
ListCell *tlist;
@@ -2244,9 +2307,11 @@ EvalPlanQualStart(EPQState *epqstate, EState *parentestate, Plan *planTree)
estate->es_result_relation_info = parentestate->es_result_relation_info;
#ifdef PGXC
+#ifndef XCP
/* XXX Check if this is OK */
estate->es_result_remoterel = parentestate->es_result_remoterel;
#endif
+#endif
/* es_trig_target_relations must NOT be copied */
estate->es_rowMarks = parentestate->es_rowMarks;
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index b7dc1f311d..7685152285 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -7,6 +7,11 @@
* ExecProcNode, or ExecEndNode on its subnodes and do the appropriate
* processing.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -322,6 +327,12 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
estate, eflags);
break;
#endif
+#ifdef XCP
+ case T_RemoteSubplan:
+ result = (PlanState *) ExecInitRemoteSubplan((RemoteSubplan *) node,
+ estate, eflags);
+ break;
+#endif /* XCP */
default:
elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node));
@@ -334,6 +345,15 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
* a separate list for us.
*/
subps = NIL;
+#ifdef XCP
+ /*
+	 * If the plan is being initialized as part of a distributed subplan we skip initPlan here.
+ * In case the plan is actually referenced on this step of the distributed
+ * plan it will be done in ExecFinishInitProcNode
+ */
+ if (!(eflags & EXEC_FLAG_SUBPLAN))
+ {
+#endif
foreach(l, node->initPlan)
{
SubPlan *subplan = (SubPlan *) lfirst(l);
@@ -343,6 +363,9 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
sstate = ExecInitSubPlan(subplan, result);
subps = lappend(subps, sstate);
}
+#ifdef XCP
+ }
+#endif
result->initPlan = subps;
/* Set up instrumentation for this node if requested */
@@ -353,6 +376,67 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
}
+#ifdef XCP
+/*
+ * The subplan is referenced on local node, finish initialization
+ */
+void
+ExecFinishInitProcNode(PlanState *node)
+{
+ List *subps;
+ ListCell *l;
+
+ /* Exit if we reached leaf of the tree */
+ if (node == NULL)
+ return;
+
+ /* Special cases */
+ switch (nodeTag(node))
+ {
+ case T_RemoteSubplanState:
+ ExecFinishInitRemoteSubplan((RemoteSubplanState *) node);
+ break;
+
+ case T_AppendState:
+ {
+			AppendState *append = (AppendState *) node;
+ int i;
+
+ for (i = 0; i < append->as_nplans; i++)
+ ExecFinishInitProcNode(append->appendplans[i]);
+
+ break;
+ }
+
+ case T_SubqueryScanState:
+ ExecFinishInitProcNode(((SubqueryScanState *) node)->subplan);
+ break;
+
+ default:
+ break;
+ }
+
+ /*
+ * Common case, recurse the tree
+ */
+ ExecFinishInitProcNode(node->lefttree);
+ ExecFinishInitProcNode(node->righttree);
+
+ subps = NIL;
+ foreach(l, node->plan->initPlan)
+ {
+ SubPlan *subplan = (SubPlan *) lfirst(l);
+ SubPlanState *sstate;
+
+ Assert(IsA(subplan, SubPlan));
+ sstate = ExecInitSubPlan(subplan, node);
+ subps = lappend(subps, sstate);
+ }
+ node->initPlan = subps;
+}
+#endif
+
+
/* ----------------------------------------------------------------
* ExecProcNode
*
@@ -513,6 +597,11 @@ ExecProcNode(PlanState *node)
result = ExecRemoteQuery((RemoteQueryState *) node);
break;
#endif
+#ifdef XCP
+ case T_RemoteSubplanState:
+ result = ExecRemoteSubplan((RemoteSubplanState *) node);
+ break;
+#endif /* XCP */
default:
elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node));
@@ -755,6 +844,11 @@ ExecEndNode(PlanState *node)
ExecEndRemoteQuery((RemoteQueryState *) node);
break;
#endif
+#ifdef XCP
+ case T_RemoteSubplanState:
+ ExecEndRemoteSubplan((RemoteSubplanState *) node);
+ break;
+#endif /* XCP */
default:
elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node));
diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c
index ad7c569f93..0652e9d34b 100644
--- a/src/backend/executor/execTuples.c
+++ b/src/backend/executor/execTuples.c
@@ -12,6 +12,11 @@
* This information is needed by routines manipulating tuples
* (getattribute, formtuple, etc.).
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -95,7 +100,9 @@
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/typcache.h"
-
+#ifdef XCP
+#include "pgxc/pgxc.h"
+#endif
static TupleDesc ExecTypeFromTLInternal(List *targetList,
bool hasoid, bool skipjunk);
@@ -124,8 +131,13 @@ MakeTupleTableSlot(void)
slot->tts_tupleDescriptor = NULL;
#ifdef PGXC
slot->tts_shouldFreeRow = false;
+#ifdef XCP
+ slot->tts_datarow = NULL;
+ slot->tts_drowcxt = NULL;
+#else
slot->tts_dataRow = NULL;
slot->tts_dataLen = -1;
+#endif
slot->tts_attinmeta = NULL;
#endif
slot->tts_mcxt = CurrentMemoryContext;
@@ -359,13 +371,26 @@ ExecStoreTuple(HeapTuple tuple,
if (slot->tts_shouldFreeMin)
heap_free_minimal_tuple(slot->tts_mintuple);
#ifdef PGXC
+#ifdef XCP
+ if (slot->tts_shouldFreeRow)
+ {
+ pfree(slot->tts_datarow);
+ if (slot->tts_drowcxt)
+ MemoryContextReset(slot->tts_drowcxt);
+ }
+#else
if (slot->tts_shouldFreeRow)
pfree(slot->tts_dataRow);
+#endif
slot->tts_shouldFreeRow = false;
+#ifdef XCP
+ slot->tts_datarow = NULL;
+#else
slot->tts_dataRow = NULL;
slot->tts_dataLen = -1;
#endif
+#endif
/*
* Store the new tuple into the specified slot.
@@ -428,13 +453,26 @@ ExecStoreMinimalTuple(MinimalTuple mtup,
if (slot->tts_shouldFreeMin)
heap_free_minimal_tuple(slot->tts_mintuple);
#ifdef PGXC
+#ifdef XCP
+ if (slot->tts_shouldFreeRow)
+ {
+ pfree(slot->tts_datarow);
+ if (slot->tts_drowcxt)
+ MemoryContextReset(slot->tts_drowcxt);
+ }
+#else
if (slot->tts_shouldFreeRow)
pfree(slot->tts_dataRow);
+#endif
slot->tts_shouldFreeRow = false;
+#ifdef XCP
+ slot->tts_datarow = NULL;
+#else
slot->tts_dataRow = NULL;
slot->tts_dataLen = -1;
#endif
+#endif
/*
* Drop the pin on the referenced buffer, if there is one.
@@ -487,13 +525,22 @@ ExecClearTuple(TupleTableSlot *slot) /* slot in which to store tuple */
if (slot->tts_shouldFreeMin)
heap_free_minimal_tuple(slot->tts_mintuple);
#ifdef PGXC
+#ifdef XCP
+ if (slot->tts_shouldFreeRow)
+ pfree(slot->tts_datarow);
+#else
if (slot->tts_shouldFreeRow)
pfree(slot->tts_dataRow);
+#endif
slot->tts_shouldFreeRow = false;
+#ifdef XCP
+ slot->tts_datarow = NULL;
+#else
slot->tts_dataRow = NULL;
slot->tts_dataLen = -1;
#endif
+#endif
slot->tts_tuple = NULL;
slot->tts_mintuple = NULL;
@@ -605,9 +652,14 @@ ExecCopySlotTuple(TupleTableSlot *slot)
/*
* Ensure values are extracted from data row to the Datum array
*/
+#ifdef XCP
+ if (slot->tts_datarow)
+ slot_getallattrs(slot);
+#else
if (slot->tts_dataRow)
slot_getallattrs(slot);
#endif
+#endif
/*
* Otherwise we need to build a tuple from the Datum array.
*/
@@ -644,9 +696,14 @@ ExecCopySlotMinimalTuple(TupleTableSlot *slot)
/*
* Ensure values are extracted from data row to the Datum array
*/
+#ifdef XCP
+ if (slot->tts_datarow)
+ slot_getallattrs(slot);
+#else
if (slot->tts_dataRow)
slot_getallattrs(slot);
#endif
+#endif
/*
* Otherwise we need to build a tuple from the Datum array.
*/
@@ -655,6 +712,191 @@ ExecCopySlotMinimalTuple(TupleTableSlot *slot)
slot->tts_isnull);
}
+#ifdef PGXC
+#ifdef XCP
+/* --------------------------------
+ * ExecCopySlotDatarow
+ * Obtain a copy of a slot's data row. The copy is
+ * palloc'd in the current memory context.
+ * The slot itself is undisturbed
+ * --------------------------------
+ */
+RemoteDataRow
+ExecCopySlotDatarow(TupleTableSlot *slot, MemoryContext tmpcxt)
+{
+ RemoteDataRow datarow;
+ if (slot->tts_datarow)
+ {
+ int len = slot->tts_datarow->msglen;
+ /* if we already have datarow make a copy */
+ datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + len);
+ datarow->msgnode = slot->tts_datarow->msgnode;
+ datarow->msglen = len;
+ memcpy(datarow->msg, slot->tts_datarow->msg, len);
+ return datarow;
+ }
+ else
+ {
+ TupleDesc tdesc = slot->tts_tupleDescriptor;
+ MemoryContext savecxt = NULL;
+ StringInfoData buf;
+ uint16 n16;
+ int i;
+
+ /* ensure we have all values */
+ slot_getallattrs(slot);
+
+ /* if temporary memory context is specified reset it */
+ if (tmpcxt)
+ {
+ MemoryContextReset(tmpcxt);
+ savecxt = MemoryContextSwitchTo(tmpcxt);
+ }
+
+ initStringInfo(&buf);
+		/* Number of attributes (columns) in the data row */
+ n16 = htons(tdesc->natts);
+ appendBinaryStringInfo(&buf, (char *) &n16, 2);
+
+ for (i = 0; i < tdesc->natts; i++)
+ {
+ uint32 n32;
+
+ if (slot->tts_isnull[i])
+ {
+ n32 = htonl(-1);
+ appendBinaryStringInfo(&buf, (char *) &n32, 4);
+ }
+ else
+ {
+ Form_pg_attribute attr = tdesc->attrs[i];
+ Oid typOutput;
+ bool typIsVarlena;
+ Datum pval;
+ char *pstring;
+ int len;
+
+ /* Get info needed to output the value */
+ getTypeOutputInfo(attr->atttypid, &typOutput, &typIsVarlena);
+ /*
+ * If we have a toasted datum, forcibly detoast it here to avoid
+ * memory leakage inside the type's output routine.
+ */
+ if (typIsVarlena)
+ pval = PointerGetDatum(PG_DETOAST_DATUM(slot->tts_values[i]));
+ else
+ pval = slot->tts_values[i];
+
+ /* Convert Datum to string */
+ pstring = OidOutputFunctionCall(typOutput, pval);
+
+ /* copy data to the buffer */
+ len = strlen(pstring);
+ n32 = htonl(len);
+ appendBinaryStringInfo(&buf, (char *) &n32, 4);
+ appendBinaryStringInfo(&buf, pstring, len);
+ }
+ }
+
+ /* restore memory context to allocate result */
+ if (savecxt)
+ {
+ MemoryContextSwitchTo(savecxt);
+ }
+
+ /* copy data to the buffer */
+ datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + buf.len);
+ datarow->msgnode = InvalidOid;
+ datarow->msglen = buf.len;
+ memcpy(datarow->msg, buf.data, buf.len);
+ pfree(buf.data);
+ return datarow;
+ }
+}
+#else
+/* --------------------------------
+ * ExecCopySlotDatarow
+ * Obtain a copy of a slot's data row. The copy is
+ * palloc'd in the current memory context.
+ * Pointer to the datarow is returned as a var parameter, function
+ * returns the length of the data row
+ * The slot itself is undisturbed
+ * --------------------------------
+ */
+int
+ExecCopySlotDatarow(TupleTableSlot *slot, char **datarow)
+{
+ Assert(datarow);
+
+ if (slot->tts_dataRow)
+ {
+ /* if we already have datarow make a copy */
+ *datarow = (char *)palloc(slot->tts_dataLen);
+ memcpy(*datarow, slot->tts_dataRow, slot->tts_dataLen);
+ return slot->tts_dataLen;
+ }
+ else
+ {
+ TupleDesc tdesc = slot->tts_tupleDescriptor;
+ StringInfoData buf;
+ uint16 n16;
+ int i;
+
+ initStringInfo(&buf);
+		/* Number of attributes (columns) in the data row */
+ n16 = htons(tdesc->natts);
+ appendBinaryStringInfo(&buf, (char *) &n16, 2);
+
+ /* ensure we have all values */
+ slot_getallattrs(slot);
+ for (i = 0; i < tdesc->natts; i++)
+ {
+ uint32 n32;
+
+ if (slot->tts_isnull[i])
+ {
+ n32 = htonl(-1);
+ appendBinaryStringInfo(&buf, (char *) &n32, 4);
+ }
+ else
+ {
+ Form_pg_attribute attr = tdesc->attrs[i];
+ Oid typOutput;
+ bool typIsVarlena;
+ Datum pval;
+ char *pstring;
+ int len;
+
+ /* Get info needed to output the value */
+ getTypeOutputInfo(attr->atttypid, &typOutput, &typIsVarlena);
+ /*
+ * If we have a toasted datum, forcibly detoast it here to avoid
+ * memory leakage inside the type's output routine.
+ */
+ if (typIsVarlena)
+ pval = PointerGetDatum(PG_DETOAST_DATUM(slot->tts_values[i]));
+ else
+ pval = slot->tts_values[i];
+
+ /* Convert Datum to string */
+ pstring = OidOutputFunctionCall(typOutput, pval);
+
+ /* copy data to the buffer */
+ len = strlen(pstring);
+ n32 = htonl(len);
+ appendBinaryStringInfo(&buf, (char *) &n32, 4);
+ appendBinaryStringInfo(&buf, pstring, len);
+ }
+ }
+ /* copy data to the buffer */
+ *datarow = palloc(buf.len);
+ memcpy(*datarow, buf.data, buf.len);
+ pfree(buf.data);
+ return buf.len;
+ }
+}
+#endif
+#endif
/* --------------------------------
* ExecFetchSlotTuple
@@ -844,8 +1086,12 @@ ExecMaterializeSlot(TupleTableSlot *slot)
#ifdef PGXC
if (!slot->tts_shouldFreeRow)
{
+#ifdef XCP
+ slot->tts_datarow = NULL;
+#else
slot->tts_dataRow = NULL;
slot->tts_dataLen = -1;
+#endif
}
#endif
@@ -1288,6 +1534,58 @@ end_tup_output(TupOutputState *tstate)
*
* --------------------------------
*/
+#ifdef XCP
+TupleTableSlot *
+ExecStoreDataRowTuple(RemoteDataRow datarow,
+ TupleTableSlot *slot,
+ bool shouldFree)
+{
+ /*
+ * sanity checks
+ */
+ Assert(datarow != NULL);
+ Assert(slot != NULL);
+ Assert(slot->tts_tupleDescriptor != NULL);
+
+ /*
+ * Free any old physical tuple belonging to the slot.
+ */
+ if (slot->tts_shouldFree)
+ heap_freetuple(slot->tts_tuple);
+ if (slot->tts_shouldFreeMin)
+ heap_free_minimal_tuple(slot->tts_mintuple);
+ if (slot->tts_shouldFreeRow)
+ {
+ pfree(slot->tts_datarow);
+ if (slot->tts_drowcxt)
+ MemoryContextReset(slot->tts_drowcxt);
+ }
+
+ /*
+ * Drop the pin on the referenced buffer, if there is one.
+ */
+ if (BufferIsValid(slot->tts_buffer))
+ ReleaseBuffer(slot->tts_buffer);
+
+ slot->tts_buffer = InvalidBuffer;
+
+ /*
+ * Store the new tuple into the specified slot.
+ */
+ slot->tts_isempty = false;
+ slot->tts_shouldFree = false;
+ slot->tts_shouldFreeMin = false;
+ slot->tts_shouldFreeRow = shouldFree;
+ slot->tts_tuple = NULL;
+ slot->tts_mintuple = NULL;
+ slot->tts_datarow = datarow;
+
+ /* Mark extracted state invalid */
+ slot->tts_nvalid = 0;
+
+ return slot;
+}
+#else
TupleTableSlot *
ExecStoreDataRowTuple(char *msg, size_t len, TupleTableSlot *slot,
bool shouldFree)
@@ -1343,3 +1641,4 @@ ExecStoreDataRowTuple(char *msg, size_t len, TupleTableSlot *slot,
return slot;
}
#endif
+#endif
diff --git a/src/backend/executor/execUtils.c b/src/backend/executor/execUtils.c
index ac39b1fbeb..f8fadb26d9 100644
--- a/src/backend/executor/execUtils.c
+++ b/src/backend/executor/execUtils.c
@@ -3,6 +3,11 @@
* execUtils.c
* miscellaneous executor utility routines
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -117,9 +122,11 @@ CreateExecutorState(void)
estate->es_result_relations = NULL;
estate->es_num_result_relations = 0;
estate->es_result_relation_info = NULL;
-#ifdef PGXC
+#ifdef PGXC
+#ifndef XCP
estate->es_result_remoterel = NULL;
#endif
+#endif
estate->es_trig_target_relations = NIL;
estate->es_trig_tuple_slot = NULL;
diff --git a/src/backend/executor/functions.c b/src/backend/executor/functions.c
index e6b57539b4..6af20e10dd 100644
--- a/src/backend/executor/functions.c
+++ b/src/backend/executor/functions.c
@@ -491,6 +491,24 @@ init_execution_state(List *queryTree_list,
errmsg("%s is not allowed in a non-volatile function",
CreateCommandTag(stmt))));
+#ifdef PGXC
+ if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
+ {
+ if (queryTree->commandType != CMD_UTILITY)
+ {
+ /*
+ * The parameterised queries in RemoteQuery nodes will be prepared
+ * on the Datanode, and need parameter types for the same. Set the
+ * parameter types and their number in all RemoteQuery nodes in the
+ * plan
+ */
+ SetRemoteStatementName(((PlannedStmt *)stmt)->planTree, NULL,
+ fcache->pinfo->nargs,
+ fcache->pinfo->argtypes, 0);
+ }
+ }
+#endif /* PGXC */
+
/* OK, build the execution_state for this query */
newes = (execution_state *) palloc(sizeof(execution_state));
if (preves)
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index f94d7452d1..dc16e88454 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -67,6 +67,11 @@
* but direct examination of the node is needed to use it before 9.0.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -179,9 +184,15 @@ typedef struct AggStatePerAggData
*/
int16 inputtypeLen,
resulttypeLen,
+#ifdef XCP
+ collecttypeLen,
+#endif
transtypeLen;
bool inputtypeByVal,
resulttypeByVal,
+#ifdef XCP
+ collecttypeByVal,
+#endif
transtypeByVal;
/*
@@ -520,6 +531,7 @@ advance_transition_function(AggState *aggstate,
}
#ifdef PGXC
+#ifndef XCP
/*
* Given new input value(s), advance the collection function of an aggregate.
*
@@ -624,6 +636,7 @@ advance_collection_function(AggState *aggstate,
MemoryContextSwitchTo(oldContext);
}
+#endif /* XCP */
#endif /* PGXC */
/*
@@ -696,6 +709,7 @@ advance_aggregates(AggState *aggstate, AggStatePerGroup pergroup)
}
#ifdef PGXC
+#ifndef XCP
if (aggstate->skip_trans)
{
Assert(IS_PGXC_COORDINATOR);
@@ -707,6 +721,7 @@ advance_aggregates(AggState *aggstate, AggStatePerGroup pergroup)
pergroupstate, &fcinfo);
}
else
+#endif /* XCP */
#endif /* PGXC */
advance_transition_function(aggstate, peraggstate, pergroupstate,
&fcinfo);
@@ -906,8 +921,52 @@ finalize_aggregate(AggState *aggstate,
Datum *resultVal, bool *resultIsNull)
{
MemoryContext oldContext;
+#ifdef XCP
+ Datum value;
+ bool isnull;
+#endif
oldContext = MemoryContextSwitchTo(aggstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory);
+#ifdef XCP
+ if (OidIsValid(peraggstate->collectfn_oid))
+ {
+ FunctionCallInfoData fcinfo;
+ InitFunctionCallInfoData(fcinfo, &(peraggstate->collectfn), 2,
+ peraggstate->aggCollation,
+ (void *) aggstate, NULL);
+ fcinfo.arg[1] = pergroupstate->transValue;
+ fcinfo.argnull[1] = pergroupstate->transValueIsNull;
+ if (fcinfo.flinfo->fn_strict &&
+ (peraggstate->initCollectValueIsNull || pergroupstate->transValueIsNull))
+ {
+ /*
+ * We have already checked the collection and transition types are
+ * binary compatible, so we can just copy the value.
+ */
+ value = pergroupstate->transValue;
+ isnull = pergroupstate->transValueIsNull;
+ }
+ else
+ {
+ /*
+ * copy the initial datum since it might get changed inside the
+ * collection function
+ */
+ fcinfo.arg[0] = datumCopy(peraggstate->initCollectValue,
+ peraggstate->collecttypeByVal,
+ peraggstate->collecttypeLen);
+ fcinfo.argnull[0] = peraggstate->initCollectValueIsNull;
+ value = FunctionCallInvoke(&fcinfo);
+ isnull = fcinfo.isnull;
+ }
+ }
+ else
+ {
+ /* No collect function, just use transition values to finalize */
+ value = pergroupstate->transValue;
+ isnull = pergroupstate->transValueIsNull;
+ }
+#else
#ifdef PGXC
/*
* if we skipped the transition phase, we have the collection result in the
@@ -919,6 +978,7 @@ finalize_aggregate(AggState *aggstate,
pergroupstate->transValueIsNull = pergroupstate->collectValueIsNull;
}
#endif /* PGXC */
+#endif /* XCP */
/*
* Apply the agg's finalfn if one is provided, else return transValue.
@@ -930,9 +990,15 @@ finalize_aggregate(AggState *aggstate,
InitFunctionCallInfoData(fcinfo, &(peraggstate->finalfn), 1,
peraggstate->aggCollation,
(void *) aggstate, NULL);
+#ifdef XCP
+ fcinfo.arg[0] = value;
+ fcinfo.argnull[0] = isnull;
+ if (fcinfo.flinfo->fn_strict && isnull)
+#else
fcinfo.arg[0] = pergroupstate->transValue;
fcinfo.argnull[0] = pergroupstate->transValueIsNull;
if (fcinfo.flinfo->fn_strict && pergroupstate->transValueIsNull)
+#endif /* XCP */
{
/* don't call a strict function with NULL inputs */
*resultVal = (Datum) 0;
@@ -946,8 +1012,13 @@ finalize_aggregate(AggState *aggstate,
}
else
{
+#ifdef XCP
+ *resultVal = value;
+ *resultIsNull = isnull;
+#else
*resultVal = pergroupstate->transValue;
*resultIsNull = pergroupstate->transValueIsNull;
+#endif /* XCP */
}
/*
@@ -1549,6 +1620,7 @@ agg_retrieve_hash_table(AggState *aggstate)
return NULL;
}
+
/* -----------------
* ExecInitAgg
*
@@ -1586,7 +1658,9 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
aggstate->pergroup = NULL;
aggstate->grp_firstTuple = NULL;
aggstate->hashtable = NULL;
+#ifndef XCP
aggstate->skip_trans = node->skip_trans;
+#endif
/*
* Create expression contexts. We need two, one for per-input-tuple
@@ -1743,6 +1817,9 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
HeapTuple aggTuple;
Form_pg_aggregate aggform;
Oid aggtranstype;
+#ifdef XCP
+ Oid aggcollecttype;
+#endif /* XCP */
AclResult aclresult;
Oid transfn_oid,
finalfn_oid;
@@ -1819,6 +1896,26 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
peraggstate->finalfn_oid = finalfn_oid = aggform->aggfinalfn;
#ifdef PGXC
peraggstate->collectfn_oid = collectfn_oid = aggform->aggcollectfn;
+#ifdef XCP
+ /*
+ * If preparing PHASE1 skip finalization step and return transmission
+ * value to be collected and finalized on master node.
+ * If preparing PHASE2 move collection function into transition slot,
+	 * so that the master node collects transition values and finalizes them.
+ * Otherwise (one-node aggregation) do all steps locally, the collection
+ * function will just convert transient value for finalization function.
+ */
+ if (node->aggdistribution == AGG_SLAVE)
+ {
+ peraggstate->collectfn_oid = collectfn_oid = InvalidOid;
+ peraggstate->finalfn_oid = finalfn_oid = InvalidOid;
+ }
+ else if (node->aggdistribution == AGG_MASTER)
+ {
+ peraggstate->transfn_oid = transfn_oid = collectfn_oid;
+ peraggstate->collectfn_oid = collectfn_oid = InvalidOid;
+ }
+#else
/*
* For PGXC final and collection functions are used to combine results at Coordinator,
* disable those for Datanode
@@ -1828,6 +1925,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
peraggstate->finalfn_oid = finalfn_oid = InvalidOid;
peraggstate->collectfn_oid = collectfn_oid = InvalidOid;
}
+#endif /* XCP */
#endif /* PGXC */
/* Check that aggregate owner has permission to call component fns */
{
@@ -1869,6 +1967,15 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
}
/* resolve actual type of transition state, if polymorphic */
+#ifdef XCP
+ /*
+ * We substitute function for PHASE2 and should take collection type
+ * as transient
+ */
+ if (node->aggdistribution == AGG_MASTER)
+ aggtranstype = aggform->aggcollecttype;
+ else
+#endif /* XCP */
aggtranstype = aggform->aggtranstype;
if (IsPolymorphicType(aggtranstype))
{
@@ -1886,18 +1993,34 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
false);
pfree(declaredArgTypes);
}
-
+#ifdef XCP
+ /* get type of collection state, if defined */
+ if (OidIsValid(collectfn_oid))
+ aggcollecttype = aggform->aggcollecttype;
+ else
+ aggcollecttype = InvalidOid;
+#endif
/* build expression trees using actual argument & result types */
build_aggregate_fnexprs(inputTypes,
numArguments,
aggtranstype,
+#ifdef XCP
+ aggcollecttype,
+#endif
aggref->aggtype,
aggref->inputcollid,
transfn_oid,
+#ifdef XCP
+ collectfn_oid,
+#endif
finalfn_oid,
&transfnexpr,
+#ifdef XCP
+ &collectfnexpr,
+#endif
&finalfnexpr);
#ifdef PGXC
+#ifndef XCP
if (OidIsValid(collectfn_oid))
{
/* we expect final function expression to be NULL in call to
@@ -1923,6 +2046,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
&dummyexpr);
Assert(!dummyexpr);
}
+#endif /* XCP */
#endif /* PGXC */
fmgr_info(transfn_oid, &peraggstate->transfn);
@@ -1949,11 +2073,27 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
get_typlenbyval(aggtranstype,
&peraggstate->transtypeLen,
&peraggstate->transtypeByVal);
+#ifdef XCP
+ if (OidIsValid(aggcollecttype))
+ get_typlenbyval(aggcollecttype,
+ &peraggstate->collecttypeLen,
+ &peraggstate->collecttypeByVal);
+#endif /* XCP */
/*
* initval is potentially null, so don't try to access it as a struct
* field. Must do it the hard way with SysCacheGetAttr.
*/
+#ifdef XCP
+ /*
+ * If this is Phase2 get collect initial value instead
+ */
+ if (node->aggdistribution == AGG_MASTER)
+ textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple,
+ Anum_pg_aggregate_agginitcollect,
+ &peraggstate->initValueIsNull);
+ else
+#endif /* XCP */
textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple,
Anum_pg_aggregate_agginitval,
&peraggstate->initValueIsNull);
@@ -1970,6 +2110,34 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
* access it as a struct field. Must do it the hard way with
* SysCacheGetAttr.
*/
+#ifdef XCP
+ if (OidIsValid(aggcollecttype))
+ {
+ textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple,
+ Anum_pg_aggregate_agginitcollect,
+ &peraggstate->initCollectValueIsNull);
+ if (peraggstate->initCollectValueIsNull)
+ peraggstate->initCollectValue = (Datum) 0;
+ else
+ peraggstate->initCollectValue = GetAggInitVal(textInitVal,
+ aggcollecttype);
+ /*
+ * If the collectfn is strict and the initval is NULL, make sure
+ * transtype and collecttype are the same (or at least
+ * binary-compatible), so that it's OK to use the transition value
+ * as the initial collectValue. This should have been checked at agg
+ * definition time, but just in case...
+ */
+			if (peraggstate->collectfn.fn_strict && peraggstate->initCollectValueIsNull)
+ {
+ if (!IsBinaryCoercible(aggtranstype, aggcollecttype))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION),
+ errmsg("aggregate %u needs to have compatible transition type and collection type",
+ aggref->aggfnoid)));
+ }
+ }
+#else
textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple,
Anum_pg_aggregate_agginitcollect,
&peraggstate->initCollectValueIsNull);
@@ -1979,6 +2147,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
else
peraggstate->initCollectValue = GetAggInitVal(textInitVal,
aggtranstype);
+#endif /* XCP */
#endif /* PGXC */
/*
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index 0026364376..b59c847752 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -3,6 +3,11 @@
* nodeModifyTable.c
* routines to handle ModifyTable nodes.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -44,9 +49,11 @@
#include "miscadmin.h"
#include "nodes/nodeFuncs.h"
#ifdef PGXC
+#ifndef XCP
#include "pgxc/execRemote.h"
#include "pgxc/pgxc.h"
#endif
+#endif
#include "storage/bufmgr.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
@@ -173,9 +180,11 @@ ExecInsert(TupleTableSlot *slot,
Relation resultRelationDesc;
Oid newId;
List *recheckIndexes = NIL;
-#ifdef PGXC
+#ifdef PGXC
+#ifndef XCP
PlanState *resultRemoteRel = NULL;
#endif
+#endif
/*
* get the heap tuple out of the tuple table slot, making sure we have a
@@ -188,9 +197,11 @@ ExecInsert(TupleTableSlot *slot,
*/
resultRelInfo = estate->es_result_relation_info;
resultRelationDesc = resultRelInfo->ri_RelationDesc;
-#ifdef PGXC
+#ifdef PGXC
+#ifndef XCP
resultRemoteRel = estate->es_result_remoterel;
#endif
+#endif
/*
* If the result relation has OIDs, force the tuple's OID to zero so that
* heap_insert will assign a fresh OID. Usually the OID already will be
@@ -242,9 +253,11 @@ ExecInsert(TupleTableSlot *slot,
ExecConstraints(resultRelInfo, slot, estate);
#ifdef PGXC
+#ifndef XCP
if (IS_PGXC_COORDINATOR && resultRemoteRel)
{
- slot = ExecProcNodeDMLInXC((RemoteQueryState *)resultRemoteRel, slot);
+ ExecRemoteQueryStandard(resultRelationDesc, (RemoteQueryState *)resultRemoteRel, slot);
+
/*
* PGXCTODO: If target table uses WITH OIDS, this should be set to the Oid inserted
* but Oids are not consistent among nodes in Postgres-XC, so this is set to the
@@ -254,7 +267,8 @@ ExecInsert(TupleTableSlot *slot,
newId = InvalidOid;
}
else
-#endif
+#endif
+#endif
{
/*
* insert the tuple
@@ -276,55 +290,6 @@ ExecInsert(TupleTableSlot *slot,
if (canSetTag)
{
-#ifdef PGXC
- if (IS_PGXC_COORDINATOR && resultRelInfo->ri_projectReturning)
- {
- /*
- * Consider this example
- *
- * CREATE TABLE bar(c3 int, c4 int);
- * INSERT INTO bar VALUES(123,456);
- * INSERT INTO bar VALUES(123,789);
- *
- * CREATE TABLE foo (c1 int, c2 int);
- * INSERT INTO foo VALUES (1,2), (3,4);
- * Consider this join query
- * select f.ctid, b.ctid, * from foo f, bar b where f.c1+122=b.c3;
- * Note it returned TWO rows
- * ctid | ctid | c1 | c2 | c3 | c4
- * -------+-------+----+----+-----+-----
- * (0,1) | (0,1) | 1 | 2 | 123 | 456
- * (0,1) | (0,2) | 1 | 2 | 123 | 789
- * (2 rows)
- *
- * Now consider the update with the same join condition
- *
- * update foo set c2=c2*2 from bar b
- * WHERE foo.c1+122 = b.c3 RETURNING *, foo.ctid;
- *
- * The update would run twice since we got two rows from the join.
- * When the first update runs it will change the ctid of the row
- * to be updated and would return the updated row with ctid say (0,3).
- * The second update would not update any row since the row with
- * ctid (0,1) would no more exist in foo, it would therefore return
- * an empty slot.
- *
- * update foo set c2=c2*2 from bar b
- * WHERE foo.c1+122 = b.c3 RETURNING *, foo.ctid;
- * f1 | f2 | f3 | q1 | q2 | ctid
- * ----+------+----+-----+------------------+-------
- * 1 | test | 84 | 123 | 4567890123456789 | (0,3)
- * (1 row)
- *
- * It is therefore possible in ExecInsert/Update/Delete
- * to receive an empty slot, and we have to add checks
- * before we can update the processed tuple count.
- */
- if (!TupIsNull(slot))
- (estate->es_processed)++;
- }
- else
-#endif
(estate->es_processed)++;
estate->es_lastoid = newId;
setLastTid(&(tuple->t_self));
@@ -337,16 +302,8 @@ ExecInsert(TupleTableSlot *slot,
/* Process RETURNING if present */
if (resultRelInfo->ri_projectReturning)
-#ifdef PGXC
- {
- if (TupIsNull(slot))
- return NULL;
-#endif
return ExecProcessReturning(resultRelInfo->ri_projectReturning,
slot, planSlot);
-#ifdef PGXC
- }
-#endif
return NULL;
}
@@ -379,8 +336,9 @@ ExecDelete(ItemPointer tupleid,
ItemPointerData update_ctid;
TransactionId update_xmax;
#ifdef PGXC
+#ifndef XCP
PlanState *resultRemoteRel = NULL;
- TupleTableSlot *slot;
+#endif
#endif
/*
@@ -389,8 +347,10 @@ ExecDelete(ItemPointer tupleid,
resultRelInfo = estate->es_result_relation_info;
resultRelationDesc = resultRelInfo->ri_RelationDesc;
#ifdef PGXC
+#ifndef XCP
resultRemoteRel = estate->es_result_remoterel;
#endif
+#endif
/* BEFORE ROW DELETE Triggers */
if (resultRelInfo->ri_TrigDesc &&
@@ -439,13 +399,15 @@ ExecDelete(ItemPointer tupleid,
*/
ldelete:;
#ifdef PGXC
+#ifndef XCP
if (IS_PGXC_COORDINATOR && resultRemoteRel)
{
- slot = ExecProcNodeDMLInXC((RemoteQueryState *)resultRemoteRel, planSlot);
+ ExecRemoteQueryStandard(resultRelationDesc, (RemoteQueryState *)resultRemoteRel, planSlot);
}
else
{
#endif
+#endif
result = heap_delete(resultRelationDesc, tupleid,
&update_ctid, &update_xmax,
estate->es_output_cid,
@@ -499,48 +461,27 @@ ldelete:;
*/
#ifdef PGXC
+#ifndef XCP
}
#endif
+#endif
}
if (canSetTag)
-#ifdef PGXC
- {
- if (IS_PGXC_COORDINATOR && resultRelInfo->ri_projectReturning)
- {
- /* For reason see comments in ExecInsert */
- if (!TupIsNull(slot))
- (estate->es_processed)++;
- }
- else
-#endif
(estate->es_processed)++;
-#ifdef PGXC
- }
-#endif
#ifdef PGXC
+#ifndef XCP
/*
* Do not fire triggers on remote relation, it would not find old tuple
*/
if (resultRemoteRel == NULL)
#endif
+#endif
/* AFTER ROW DELETE Triggers */
ExecARDeleteTriggers(estate, resultRelInfo, tupleid);
/* Process RETURNING if present */
-#ifdef PGXC
- if (resultRelInfo->ri_projectReturning && resultRemoteRel != NULL &&
- IS_PGXC_COORDINATOR && !IsConnFromCoord())
- {
- if (TupIsNull(slot))
- return NULL;
-
- return ExecProcessReturning(resultRelInfo->ri_projectReturning,
- slot, planSlot);
- }
- else
-#endif
if (resultRelInfo->ri_projectReturning)
{
/*
@@ -623,8 +564,10 @@ ExecUpdate(ItemPointer tupleid,
TransactionId update_xmax;
List *recheckIndexes = NIL;
#ifdef PGXC
+#ifndef XCP
PlanState *resultRemoteRel = NULL;
#endif
+#endif
/*
* abort the operation if not running transactions
@@ -644,8 +587,10 @@ ExecUpdate(ItemPointer tupleid,
resultRelInfo = estate->es_result_relation_info;
resultRelationDesc = resultRelInfo->ri_RelationDesc;
#ifdef PGXC
+#ifndef XCP
resultRemoteRel = estate->es_result_remoterel;
#endif
+#endif
/* BEFORE ROW UPDATE Triggers */
if (resultRelInfo->ri_TrigDesc &&
@@ -701,13 +646,15 @@ lreplace:;
ExecConstraints(resultRelInfo, slot, estate);
#ifdef PGXC
+#ifndef XCP
if (IS_PGXC_COORDINATOR && resultRemoteRel)
{
- slot = ExecProcNodeDMLInXC((RemoteQueryState *)resultRemoteRel, planSlot);
+ ExecRemoteQueryStandard(resultRelationDesc, (RemoteQueryState *)resultRemoteRel, planSlot);
}
else
{
#endif
+#endif
/*
* replace the heap tuple
*
@@ -782,32 +729,23 @@ lreplace:;
recheckIndexes = ExecInsertIndexTuples(slot, &(tuple->t_self),
estate);
#ifdef PGXC
+#ifndef XCP
}
#endif
+#endif
}
if (canSetTag)
-#ifdef PGXC
- {
- if (IS_PGXC_COORDINATOR && resultRelInfo->ri_projectReturning)
- {
- /* For reason see comments in ExecInsert */
- if (!TupIsNull(slot))
- (estate->es_processed)++;
- }
- else
-#endif
(estate->es_processed)++;
-#ifdef PGXC
- }
-#endif
#ifdef PGXC
+#ifndef XCP
/*
* Do not fire triggers on remote relation, it would not find old tuple
*/
if (resultRemoteRel == NULL)
#endif
+#endif
/* AFTER ROW UPDATE Triggers */
ExecARUpdateTriggers(estate, resultRelInfo, tupleid, tuple,
recheckIndexes);
@@ -816,16 +754,8 @@ lreplace:;
/* Process RETURNING if present */
if (resultRelInfo->ri_projectReturning)
-#ifdef PGXC
- {
- if (TupIsNull(slot))
- return NULL;
-#endif
return ExecProcessReturning(resultRelInfo->ri_projectReturning,
slot, planSlot);
-#ifdef PGXC
- }
-#endif
return NULL;
}
@@ -893,10 +823,12 @@ ExecModifyTable(ModifyTableState *node)
ResultRelInfo *saved_resultRelInfo;
ResultRelInfo *resultRelInfo;
PlanState *subplanstate;
-#ifdef PGXC
+#ifdef PGXC
+#ifndef XCP
PlanState *remoterelstate;
PlanState *saved_resultRemoteRel;
-#endif
+#endif
+#endif
JunkFilter *junkfilter;
TupleTableSlot *slot;
TupleTableSlot *planSlot;
@@ -938,9 +870,10 @@ ExecModifyTable(ModifyTableState *node)
resultRelInfo = node->resultRelInfo + node->mt_whichplan;
subplanstate = node->mt_plans[node->mt_whichplan];
#ifdef PGXC
- /* Initialize remote plan state */
+#ifndef XCP
remoterelstate = node->mt_remoterels[node->mt_whichplan];
#endif
+#endif
junkfilter = resultRelInfo->ri_junkFilter;
/*
@@ -952,13 +885,17 @@ ExecModifyTable(ModifyTableState *node)
*/
saved_resultRelInfo = estate->es_result_relation_info;
#ifdef PGXC
+#ifndef XCP
saved_resultRemoteRel = estate->es_result_remoterel;
#endif
+#endif
estate->es_result_relation_info = resultRelInfo;
#ifdef PGXC
+#ifndef XCP
estate->es_result_remoterel = remoterelstate;
#endif
+#endif
/*
* Fetch rows from subplan(s), and execute the required table modification
@@ -985,9 +922,11 @@ ExecModifyTable(ModifyTableState *node)
resultRelInfo++;
subplanstate = node->mt_plans[node->mt_whichplan];
#ifdef PGXC
+#ifndef XCP
/* Move to next remote plan */
estate->es_result_remoterel = node->mt_remoterels[node->mt_whichplan];
- remoterelstate = node->mt_remoterels[node->mt_whichplan];
+ remoterelstate = node->mt_plans[node->mt_whichplan];
+#endif
#endif
junkfilter = resultRelInfo->ri_junkFilter;
estate->es_result_relation_info = resultRelInfo;
@@ -1045,9 +984,7 @@ ExecModifyTable(ModifyTableState *node)
if (operation != CMD_DELETE)
slot = ExecFilterJunk(junkfilter, slot);
}
-#ifdef PGXC
- estate->es_result_remoterel = remoterelstate;
-#endif
+
switch (operation)
{
case CMD_INSERT:
@@ -1073,18 +1010,17 @@ ExecModifyTable(ModifyTableState *node)
if (slot)
{
estate->es_result_relation_info = saved_resultRelInfo;
-#ifdef PGXC
- estate->es_result_remoterel = saved_resultRemoteRel;
-#endif
return slot;
}
}
/* Restore es_result_relation_info before exiting */
estate->es_result_relation_info = saved_resultRelInfo;
-#ifdef PGXC
+#ifdef PGXC
+#ifndef XCP
estate->es_result_remoterel = saved_resultRemoteRel;
-#endif
+#endif
+#endif
/*
* We're done, but fire AFTER STATEMENT triggers before exiting.
@@ -1112,9 +1048,6 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
Plan *subplan;
ListCell *l;
int i;
-#ifdef PGXC
- PlanState *saved_remoteRelInfo;
-#endif
/* check for unsupported flags */
Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
@@ -1132,9 +1065,11 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
mtstate->mt_done = false;
mtstate->mt_plans = (PlanState **) palloc0(sizeof(PlanState *) * nplans);
-#ifdef PGXC
+#ifdef PGXC
+#ifndef XCP
mtstate->mt_remoterels = (PlanState **) palloc0(sizeof(PlanState *) * nplans);
-#endif
+#endif
+#endif
mtstate->resultRelInfo = estate->es_result_relations + node->resultRelIndex;
mtstate->mt_arowmarks = (List **) palloc0(sizeof(List *) * nplans);
mtstate->mt_nplans = nplans;
@@ -1152,24 +1087,25 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
* sub-plan; ExecContextForcesOids depends on that!
*/
saved_resultRelInfo = estate->es_result_relation_info;
-#ifdef PGXC
- saved_remoteRelInfo = estate->es_result_remoterel;
-#endif
resultRelInfo = mtstate->resultRelInfo;
i = 0;
foreach(l, node->plans)
{
-#ifdef PGXC
+#ifdef PGXC
+#ifndef XCP
Plan *remoteplan = NULL;
-#endif
+#endif
+#endif
subplan = (Plan *) lfirst(l);
-#ifdef PGXC
+#ifdef PGXC
+#ifndef XCP
if (node->remote_plans)
remoteplan = list_nth(node->remote_plans, i);
-#endif
+#endif
+#endif
/*
* Verify result relation is a valid target for the current operation
@@ -1195,24 +1131,23 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
mtstate->mt_plans[i] = ExecInitNode(subplan, estate, eflags);
#ifdef PGXC
+#ifndef XCP
if (remoteplan)
{
- /*
+ /*
* Init the plan for the remote execution for this result rel. This is
* used to execute data modification queries on the remote nodes
*/
mtstate->mt_remoterels[i] = ExecInitNode(remoteplan, estate, eflags);
}
-#endif
+#endif
+#endif
resultRelInfo++;
i++;
}
estate->es_result_relation_info = saved_resultRelInfo;
-#ifdef PGXC
- estate->es_result_remoterel = saved_remoteRelInfo;
-#endif
/*
* Initialize RETURNING projections if needed.
diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c
index 9072581c05..bc27d861ad 100644
--- a/src/backend/executor/nodeSort.c
+++ b/src/backend/executor/nodeSort.c
@@ -82,40 +82,6 @@ ExecSort(SortState *node)
outerNode = outerPlanState(node);
tupDesc = ExecGetResultType(outerNode);
-#ifdef PGXC
- if (plannode->srt_start_merge &&
- IsA(node->ss.ps.lefttree, RemoteQueryState))
- {
- RemoteQueryState *rqs = (RemoteQueryState *)node->ss.ps.lefttree;
-
- rqs->rqs_for_sort = true;
- /*
- * Start the queries on all the nodes. That way we get the number of
- * connections and connection handlers set in RemoteQueryState.
- * Those will be used to merge the data from the datanodes.
- */
- if (!rqs->query_Done)
- {
- do_query(rqs);
- rqs->query_Done = true;
- }
-
- /*
- * PGXCTODO: We don't handle bounded in this case, but see if it can
- * be used.
- */
- tuplesortstate = tuplesort_begin_merge(tupDesc,
- plannode->numCols,
- plannode->sortColIdx,
- plannode->sortOperators,
- plannode->collations,
- plannode->nullsFirst,
- rqs, work_mem);
-
- }
- else
- {
-#endif /* PGXC */
tuplesortstate = tuplesort_begin_heap(tupDesc,
plannode->numCols,
plannode->sortColIdx,
@@ -126,15 +92,8 @@ ExecSort(SortState *node)
node->randomAccess);
if (node->bounded)
tuplesort_set_bound(tuplesortstate, node->bound);
-#ifdef PGXC
- }
-#endif /* PGXC */
node->tuplesortstate = (void *) tuplesortstate;
-#ifdef PGXC
- if (!plannode->srt_start_merge)
- {
-#endif /* PGXC */
/*
* Scan the subplan and feed all the tuples to tuplesort.
*/
@@ -153,11 +112,6 @@ ExecSort(SortState *node)
* Complete the sort.
*/
tuplesort_performsort(tuplesortstate);
-#ifdef PGXC
- }
- else
- Assert(IsA(node->ss.ps.lefttree, RemoteQueryState));
-#endif /* PGXC */
/*
* restore to user specified direction
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c
index da31820e2d..f6fb1c955f 100644
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -3,6 +3,11 @@
* nodeSubplan.c
* routines to support subselects
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -660,6 +665,11 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent)
sstate->planstate = (PlanState *) list_nth(estate->es_subplanstates,
subplan->plan_id - 1);
+#ifdef XCP
+ /* subplan is referenced on local node, finish initialization */
+ ExecFinishInitProcNode(sstate->planstate);
+#endif
+
/* Initialize subexpressions */
sstate->testexpr = ExecInitExpr((Expr *) subplan->testexpr, parent);
sstate->args = (List *) ExecInitExpr((Expr *) subplan->args, parent);
diff --git a/src/backend/executor/nodeWindowAgg.c b/src/backend/executor/nodeWindowAgg.c
index ce89ff2ebf..43d4581cd1 100644
--- a/src/backend/executor/nodeWindowAgg.c
+++ b/src/backend/executor/nodeWindowAgg.c
@@ -23,6 +23,11 @@
* aggregate function over all rows in the current row's window frame.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -1717,10 +1722,19 @@ initialize_peragg(WindowAggState *winstate, WindowFunc *wfunc,
HeapTuple aggTuple;
Form_pg_aggregate aggform;
Oid aggtranstype;
+#ifdef XCP
+ Oid aggcollecttype;
+#endif
AclResult aclresult;
Oid transfn_oid,
+#ifdef XCP
+ collectfn_oid,
+#endif
finalfn_oid;
Expr *transfnexpr,
+#ifdef XCP
+ *collectfnexpr,
+#endif
*finalfnexpr;
Datum textInitVal;
int i;
@@ -1746,6 +1760,9 @@ initialize_peragg(WindowAggState *winstate, WindowFunc *wfunc,
*/
peraggstate->transfn_oid = transfn_oid = aggform->aggtransfn;
+#ifdef XCP
+ collectfn_oid = aggform->aggcollectfn;
+#endif
peraggstate->finalfn_oid = finalfn_oid = aggform->aggfinalfn;
/* Check that aggregate owner has permission to call component fns */
@@ -1794,16 +1811,28 @@ initialize_peragg(WindowAggState *winstate, WindowFunc *wfunc,
false);
pfree(declaredArgTypes);
}
+#ifdef XCP
+ aggcollecttype = aggform->aggcollecttype;
+#endif
/* build expression trees using actual argument & result types */
build_aggregate_fnexprs(inputTypes,
numArguments,
aggtranstype,
+#ifdef XCP
+ aggcollecttype,
+#endif
wfunc->wintype,
wfunc->inputcollid,
transfn_oid,
+#ifdef XCP
+ collectfn_oid,
+#endif
finalfn_oid,
&transfnexpr,
+#ifdef XCP
+ &collectfnexpr,
+#endif
&finalfnexpr);
fmgr_info(transfn_oid, &peraggstate->transfn);
diff --git a/src/backend/executor/producerReceiver.c b/src/backend/executor/producerReceiver.c
new file mode 100644
index 0000000000..b7339f16c6
--- /dev/null
+++ b/src/backend/executor/producerReceiver.c
@@ -0,0 +1,290 @@
+/*-------------------------------------------------------------------------
+ *
+ * producerReceiver.c
+ * An implementation of DestReceiver that distributes the result tuples to
+ * multiple customers via a SharedQueue.
+ *
+ *
+ * Copyright (c) 2012-2014, TransLattice, Inc.
+ *
+ * IDENTIFICATION
+ * src/backend/executor/producerReceiver.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "executor/producerReceiver.h"
+#include "pgxc/nodemgr.h"
+#include "tcop/pquery.h"
+#include "utils/tuplestore.h"
+
+typedef struct
+{
+ DestReceiver pub;
+ /* parameters: */
+ DestReceiver *consumer; /* where to put the tuples for self */
+ AttrNumber distKey; /* distribution key attribute in the tuple */
+ Locator *locator; /* locator is determining destination nodes */
+ int *distNodes; /* array where to get locator results */
+ int *consMap; /* map of consumers: consMap[node-1] indicates
+ * the target consumer */
+ SharedQueue squeue; /* a SharedQueue for result distribution */
+ MemoryContext tmpcxt; /* holds temporary data */
+ Tuplestorestate **tstores; /* storage to buffer data if destination queue
+ * is full */
+ TupleDesc typeinfo; /* description of received tuples */
+ long tcount;
+ long selfcount;
+ long othercount;
+} ProducerState;
+
+
+/*
+ * Prepare to receive tuples from executor.
+ */
+static void
+producerStartupReceiver(DestReceiver *self, int operation, TupleDesc typeinfo)
+{
+ ProducerState *myState = (ProducerState *) self;
+
+ if (ActivePortal)
+ {
+ /* Normally ExecutorContext is current here. However we should better
+ * create local producer storage in the Portal's context: producer
+ * may keep pushing records to consumers after executor is destroyed.
+ */
+ MemoryContext savecontext;
+ savecontext = MemoryContextSwitchTo(PortalGetHeapMemory(ActivePortal));
+ myState->typeinfo = CreateTupleDescCopy(typeinfo);
+ MemoryContextSwitchTo(savecontext);
+ }
+ else
+ myState->typeinfo = typeinfo;
+
+ if (myState->consumer)
+ (*myState->consumer->rStartup) (myState->consumer, operation, typeinfo);
+}
+
+/*
+ * Receive a tuple from the executor and dispatch it to the proper consumer
+ */
+static void
+producerReceiveSlot(TupleTableSlot *slot, DestReceiver *self)
+{
+ ProducerState *myState = (ProducerState *) self;
+ Datum value;
+ bool isnull;
+ int ncount, i;
+
+ if (myState->distKey == InvalidAttrNumber)
+ {
+ value = (Datum) 0;
+ isnull = true;
+ }
+ else
+ value = slot_getattr(slot, myState->distKey, &isnull);
+ ncount = GET_NODES(myState->locator, value, isnull, NULL);
+
+ myState->tcount++;
+ /* Dispatch the tuple */
+ for (i = 0; i < ncount; i++)
+ {
+ int consumerIdx = myState->distNodes[i];
+
+ if (consumerIdx == SQ_CONS_NONE)
+ {
+ continue;
+ }
+ else if (consumerIdx == SQ_CONS_SELF)
+ {
+ Assert(myState->consumer);
+ (*myState->consumer->receiveSlot) (slot, myState->consumer);
+ myState->selfcount++;
+ }
+ else if (myState->squeue)
+ {
+ /*
+ * If the tuple will not fit to the consumer queue it will be stored
+ * in the local tuplestore. The tuplestore should be in the portal
+ * context, because ExecutorContext may be destroyed when tuples
+ * are not yet pushed to the consumer queue.
+ */
+ MemoryContext savecontext;
+ Assert(ActivePortal);
+ savecontext = MemoryContextSwitchTo(PortalGetHeapMemory(ActivePortal));
+ SharedQueueWrite(myState->squeue, consumerIdx, slot,
+ &myState->tstores[consumerIdx], myState->tmpcxt);
+ MemoryContextSwitchTo(savecontext);
+ myState->othercount++;
+ }
+ }
+}
+
+
+/*
+ * Clean up at end of an executor run
+ */
+static void
+producerShutdownReceiver(DestReceiver *self)
+{
+ ProducerState *myState = (ProducerState *) self;
+
+ if (myState->consumer)
+ (*myState->consumer->rShutdown) (myState->consumer);
+}
+
+
+/*
+ * Destroy receiver when done with it
+ */
+static void
+producerDestroyReceiver(DestReceiver *self)
+{
+ ProducerState *myState = (ProducerState *) self;
+
+ elog(LOG, "Producer stats: total %ld tuples, %ld tuples to self, %ld to other nodes",
+ myState->tcount, myState->selfcount, myState->othercount);
+
+ if (myState->consumer)
+ (*myState->consumer->rDestroy) (myState->consumer);
+
+ /* Make sure all data are in the squeue */
+ while (myState->tstores)
+ {
+ if (SharedQueueFinish(myState->squeue, myState->typeinfo,
+ myState->tstores) == 0)
+ {
+ pfree(myState->tstores);
+ myState->tstores = NULL;
+ }
+ else
+ {
+ pg_usleep(10000l);
+ /*
+ * Do not wait for consumers that was not even connected after 10
+ * seconds after start waiting for their disconnection.
+ * That should help to break the loop which would otherwise endless.
+ * The error will be emitted later in SharedQueueUnBind
+ */
+ SharedQueueResetNotConnected(myState->squeue);
+ }
+ }
+
+ /* wait while consumer are finishing and release shared resources */
+ if (myState->squeue)
+ SharedQueueUnBind(myState->squeue);
+ myState->squeue = NULL;
+
+ /* Release workspace if any */
+ if (myState->locator)
+ freeLocator(myState->locator);
+ pfree(myState);
+}
+
+
+/*
+ * Initially create a DestReceiver object.
+ */
+DestReceiver *
+CreateProducerDestReceiver(void)
+{
+ ProducerState *self = (ProducerState *) palloc0(sizeof(ProducerState));
+
+ self->pub.receiveSlot = producerReceiveSlot;
+ self->pub.rStartup = producerStartupReceiver;
+ self->pub.rShutdown = producerShutdownReceiver;
+ self->pub.rDestroy = producerDestroyReceiver;
+ self->pub.mydest = DestProducer;
+
+ /* private fields will be set by SetTuplestoreDestReceiverParams */
+ self->tcount = 0;
+ self->selfcount = 0;
+ self->othercount = 0;
+
+ return (DestReceiver *) self;
+}
+
+
+/*
+ * Set parameters for a ProducerDestReceiver
+ */
+void
+SetProducerDestReceiverParams(DestReceiver *self,
+ AttrNumber distKey,
+ Locator *locator,
+ SharedQueue squeue)
+{
+ ProducerState *myState = (ProducerState *) self;
+
+ Assert(myState->pub.mydest == DestProducer);
+ myState->distKey = distKey;
+ myState->locator = locator;
+ myState->squeue = squeue;
+ myState->typeinfo = NULL;
+ myState->tmpcxt = NULL;
+ /* Create workspace */
+ myState->distNodes = (int *) getLocatorResults(locator);
+ if (squeue)
+ myState->tstores = (Tuplestorestate **)
+ palloc0(NumDataNodes * sizeof(Tuplestorestate *));
+}
+
+
+/*
+ * Set a DestReceiver to receive tuples targeted to "self".
+ * Returns old value of the self consumer
+ */
+DestReceiver *
+SetSelfConsumerDestReceiver(DestReceiver *self,
+ DestReceiver *consumer)
+{
+ ProducerState *myState = (ProducerState *) self;
+ DestReceiver *oldconsumer;
+
+ Assert(myState->pub.mydest == DestProducer);
+ oldconsumer = myState->consumer;
+ myState->consumer = consumer;
+ return oldconsumer;
+}
+
+
+/*
+ * Set a memory context to hold temporary data
+ */
+void
+SetProducerTempMemory(DestReceiver *self, MemoryContext tmpcxt)
+{
+ ProducerState *myState = (ProducerState *) self;
+ DestReceiver *oldconsumer;
+
+ Assert(myState->pub.mydest == DestProducer);
+ myState->tmpcxt = tmpcxt;
+}
+
+
+/*
+ * Push data from the local tuplestores to the shared memory so consumers can
+ * read them. Returns true if all data are pushed, false if something remains
+ * in the tuplestores yet.
+ */
+bool
+ProducerReceiverPushBuffers(DestReceiver *self)
+{
+ ProducerState *myState = (ProducerState *) self;
+
+ Assert(myState->pub.mydest == DestProducer);
+ if (myState->tstores)
+ {
+ if (SharedQueueFinish(myState->squeue, myState->typeinfo,
+ myState->tstores) == 0)
+ {
+ pfree(myState->tstores);
+ myState->tstores = NULL;
+ }
+ else
+ return false;
+ }
+ return true;
+}
diff --git a/src/backend/libpq/be-fsstubs.c b/src/backend/libpq/be-fsstubs.c
index 0bc90cb59a..dbdb8be2b6 100644
--- a/src/backend/libpq/be-fsstubs.c
+++ b/src/backend/libpq/be-fsstubs.c
@@ -101,11 +101,18 @@ lo_open(PG_FUNCTION_ARGS)
int fd;
#ifdef PGXC
+#ifdef XCP
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Postgres-XC does not support large object yet"),
errdetail("The feature is not currently supported")));
#endif
+#endif
#if FSDB
elog(DEBUG4, "lo_open(%u,%d)", lobjId, mode);
@@ -134,11 +141,18 @@ lo_close(PG_FUNCTION_ARGS)
int32 fd = PG_GETARG_INT32(0);
#ifdef PGXC
+#ifdef XCP
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Postgres-XC does not support large object yet"),
errdetail("The feature is not currently supported")));
#endif
+#endif
if (fd < 0 || fd >= cookies_size || cookies[fd] == NULL)
ereport(ERROR,
@@ -171,11 +185,18 @@ lo_read(int fd, char *buf, int len)
int status;
#ifdef PGXC
+#ifdef XCP
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Postgres-XC does not support large object yet"),
errdetail("The feature is not currently supported")));
#endif
+#endif
if (fd < 0 || fd >= cookies_size || cookies[fd] == NULL)
ereport(ERROR,
@@ -204,11 +225,18 @@ lo_write(int fd, const char *buf, int len)
int status;
#ifdef PGXC
+#ifdef XCP
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Postgres-XC does not support large object yet"),
errdetail("The feature is not currently supported")));
#endif
+#endif
if (fd < 0 || fd >= cookies_size || cookies[fd] == NULL)
ereport(ERROR,
@@ -247,11 +275,18 @@ lo_lseek(PG_FUNCTION_ARGS)
int status;
#ifdef PGXC
+#ifdef XCP
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Postgres-XC does not support large object yet"),
errdetail("The feature is not currently supported")));
#endif
+#endif
if (fd < 0 || fd >= cookies_size || cookies[fd] == NULL)
ereport(ERROR,
@@ -269,11 +304,18 @@ lo_creat(PG_FUNCTION_ARGS)
Oid lobjId;
#ifdef PGXC
+#ifdef XCP
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Postgres-XC does not support large object yet"),
errdetail("The feature is not currently supported")));
#endif
+#endif
/*
* We don't actually need to store into fscxt, but create it anyway to
@@ -292,11 +334,18 @@ lo_create(PG_FUNCTION_ARGS)
Oid lobjId = PG_GETARG_OID(0);
#ifdef PGXC
+#ifdef XCP
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Postgres-XC does not support large object yet"),
errdetail("The feature is not currently supported")));
#endif
+#endif
/*
* We don't actually need to store into fscxt, but create it anyway to
@@ -315,11 +364,18 @@ lo_tell(PG_FUNCTION_ARGS)
int32 fd = PG_GETARG_INT32(0);
#ifdef PGXC
+#ifdef XCP
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Postgres-XC does not support large object yet"),
errdetail("The feature is not currently supported")));
#endif
+#endif
if (fd < 0 || fd >= cookies_size || cookies[fd] == NULL)
ereport(ERROR,
@@ -335,11 +391,18 @@ lo_unlink(PG_FUNCTION_ARGS)
Oid lobjId = PG_GETARG_OID(0);
#ifdef PGXC
+#ifdef XCP
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Postgres-XC does not support large object yet"),
errdetail("The feature is not currently supported")));
#endif
+#endif
/* Must be owner of the largeobject */
if (!lo_compat_privileges &&
@@ -385,11 +448,18 @@ loread(PG_FUNCTION_ARGS)
int totalread;
#ifdef PGXC
+#ifdef XCP
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Postgres-XC does not support large object yet"),
errdetail("The feature is not currently supported")));
#endif
+#endif
if (len < 0)
len = 0;
@@ -410,11 +480,18 @@ lowrite(PG_FUNCTION_ARGS)
int totalwritten;
#ifdef PGXC
+#ifdef XCP
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Postgres-XC does not support large object yet"),
errdetail("The feature is not currently supported")));
#endif
+#endif
bytestowrite = VARSIZE(wbuf) - VARHDRSZ;
totalwritten = lo_write(fd, VARDATA(wbuf), bytestowrite);
@@ -435,11 +512,18 @@ lo_import(PG_FUNCTION_ARGS)
text *filename = PG_GETARG_TEXT_PP(0);
#ifdef PGXC
+#ifdef XCP
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Postgres-XC does not support large object yet"),
errdetail("The feature is not currently supported")));
#endif
+#endif
PG_RETURN_OID(lo_import_internal(filename, InvalidOid));
}
@@ -455,11 +539,18 @@ lo_import_with_oid(PG_FUNCTION_ARGS)
Oid oid = PG_GETARG_OID(1);
#ifdef PGXC
+#ifdef XCP
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Postgres-XC does not support large object yet"),
errdetail("The feature is not currently supported")));
#endif
+#endif
PG_RETURN_OID(lo_import_internal(filename, oid));
}
@@ -542,11 +633,18 @@ lo_export(PG_FUNCTION_ARGS)
mode_t oumask;
#ifdef PGXC
+#ifdef XCP
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Postgres-XC does not support large object yet"),
errdetail("The feature is not currently supported")));
#endif
+#endif
#ifndef ALLOW_DANGEROUS_LO_FUNCTIONS
if (!superuser())
@@ -611,11 +709,18 @@ lo_truncate(PG_FUNCTION_ARGS)
int32 len = PG_GETARG_INT32(1);
#ifdef PGXC
+#ifdef XCP
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Postgres-XL does not yet support large objects"),
+ errdetail("The feature is not currently supported")));
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("Postgres-XC does not support large object yet"),
errdetail("The feature is not currently supported")));
#endif
+#endif
if (fd < 0 || fd >= cookies_size || cookies[fd] == NULL)
ereport(ERROR,
diff --git a/src/backend/libpq/hba.c b/src/backend/libpq/hba.c
index 828f6dcc8e..7a84dc24d5 100644
--- a/src/backend/libpq/hba.c
+++ b/src/backend/libpq/hba.c
@@ -5,6 +5,11 @@
* wherein you authenticate a user by seeing what IP address the system
* says he comes from and choosing authentication method based on it).
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -2092,3 +2097,91 @@ hba_getauthmethod(hbaPort *port)
{
check_hba(port);
}
+
+#ifdef XCP
+/*
+ * NB the only way to free allocated lines is to reset or delete current memory
+ * context, so caller is responsible for setting it up properly to avoid leak.
+ * However, if function fails it would release working memory.
+ * Basically the function does the same as load_hba(), but it does not set
+ * the static variables.
+ */
+List* get_parsed_hba(void) {
+ FILE *file;
+ List *hba_lines = NIL;
+ List *hba_line_nums = NIL;
+ ListCell *line,
+ *line_num;
+ List *new_parsed_lines = NIL;
+ bool ok = true;
+ MemoryContext linecxt;
+ MemoryContext oldcxt;
+ MemoryContext hbacxt;
+
+ file = AllocateFile(HbaFileName, "r");
+ if (file == NULL)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open configuration file \"%s\": %m",
+ HbaFileName)));
+
+ /*
+ * Caller will take care of making this a FATAL error in case this is
+ * the initial startup. Hand back an empty list on failure; this
+ * function returns List *, not bool.
+ */
+ return NIL;
+ }
+
+ linecxt = tokenize_file(HbaFileName, file, &hba_lines, &hba_line_nums);
+ FreeFile(file);
+
+ /* Now parse all the lines */
+ hbacxt = AllocSetContextCreate(CurrentMemoryContext,
+ "hba parser context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ oldcxt = MemoryContextSwitchTo(hbacxt);
+ forboth(line, hba_lines, line_num, hba_line_nums)
+ {
+ HbaLine *newline;
+
+ if ((newline = parse_hba_line(lfirst(line), lfirst_int(line_num))) == NULL)
+ {
+ /*
+ * Parse error in the file, so indicate there's a problem. NB: a
+ * problem in a line will free the memory for all previous lines as
+ * well!
+ */
+ MemoryContextReset(hbacxt);
+ new_parsed_lines = NIL;
+ ok = false;
+
+ /*
+ * Keep parsing the rest of the file so we can report errors on
+ * more than the first row. Error has already been reported in the
+ * parsing function, so no need to log it here.
+ */
+ continue;
+ }
+
+ new_parsed_lines = lappend(new_parsed_lines, newline);
+ }
+
+ /* Free tokenizer memory */
+ MemoryContextDelete(linecxt);
+ MemoryContextSwitchTo(oldcxt);
+
+ if (!ok)
+ {
+ /* Parsing failed at one or more rows, so bail out */
+ MemoryContextDelete(hbacxt);
+ return NIL;
+ }
+
+ /* Return the freshly parsed list, NOT the parsed_hba_lines static */
+ return new_parsed_lines;
+}
+#endif
diff --git a/src/backend/main/main.c b/src/backend/main/main.c
index 483a956434..65091479a4 100644
--- a/src/backend/main/main.c
+++ b/src/backend/main/main.c
@@ -328,6 +328,7 @@ help(const char *progname)
printf(_("\nNode options:\n"));
printf(_(" --coordinator start as a Coordinator\n"));
printf(_(" --datanode start as a Datanode\n"));
+ printf(_(" --restoremode start to restore existing schema on the new node to be added\n"));
#endif
printf(_("\nPlease read the documentation for the complete list of run-time\n"
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 7eeb5b9af8..46c9940bd7 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -11,6 +11,11 @@
* be handled easily in a simple depth-first traversal.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -28,7 +33,10 @@
#include "nodes/relation.h"
#ifdef PGXC
#include "pgxc/locator.h"
-#include "optimizer/pgxcplan.h"
+#include "pgxc/planner.h"
+#endif
+#ifdef XCP
+#include "pgxc/execRemote.h"
#endif
#include "utils/datum.h"
@@ -98,7 +106,16 @@ _copyPlannedStmt(const PlannedStmt *from)
COPY_NODE_FIELD(relationOids);
COPY_NODE_FIELD(invalItems);
COPY_SCALAR_FIELD(nParamExec);
-
+#ifdef XCP
+ COPY_SCALAR_FIELD(nParamRemote);
+ COPY_POINTER_FIELD(remoteparams,
+ newnode->nParamRemote * sizeof(RemoteParam));
+ COPY_STRING_FIELD(pname);
+ COPY_SCALAR_FIELD(distributionType);
+ COPY_SCALAR_FIELD(distributionKey);
+ COPY_NODE_FIELD(distributionNodes);
+ COPY_NODE_FIELD(distributionRestrict);
+#endif
return newnode;
}
@@ -187,8 +204,10 @@ _copyModifyTable(const ModifyTable *from)
COPY_NODE_FIELD(rowMarks);
COPY_SCALAR_FIELD(epqParam);
#ifdef PGXC
+#ifndef XCP
COPY_NODE_FIELD(remote_plans);
#endif
+#endif
return newnode;
}
@@ -781,6 +800,9 @@ _copyAgg(const Agg *from)
CopyPlanFields((const Plan *) from, (Plan *) newnode);
COPY_SCALAR_FIELD(aggstrategy);
+#ifdef XCP
+ COPY_SCALAR_FIELD(aggdistribution);
+#endif
COPY_SCALAR_FIELD(numCols);
if (from->numCols > 0)
{
@@ -1023,27 +1045,32 @@ _copyRemoteQuery(const RemoteQuery *from)
COPY_STRING_FIELD(sql_statement);
COPY_NODE_FIELD(exec_nodes);
COPY_SCALAR_FIELD(combine_type);
+ COPY_NODE_FIELD(sort);
COPY_SCALAR_FIELD(read_only);
COPY_SCALAR_FIELD(force_autocommit);
COPY_STRING_FIELD(statement);
COPY_STRING_FIELD(cursor);
- COPY_SCALAR_FIELD(rq_num_params);
- if (from->rq_param_types)
- COPY_POINTER_FIELD(rq_param_types,
- sizeof(from->rq_param_types[0]) * from->rq_num_params);
- else
- newnode->rq_param_types = NULL;
+ COPY_SCALAR_FIELD(remote_num_params);
+ COPY_POINTER_FIELD(remote_param_types,
+ sizeof(from->remote_param_types[0]) * from->remote_num_params);
COPY_SCALAR_FIELD(exec_type);
+#ifndef XCP
COPY_SCALAR_FIELD(is_temp);
- COPY_SCALAR_FIELD(rq_finalise_aggs);
- COPY_SCALAR_FIELD(rq_sortgroup_colno);
- COPY_NODE_FIELD(remote_query);
+#endif
+
+ COPY_SCALAR_FIELD(reduce_level);
COPY_NODE_FIELD(base_tlist);
- COPY_NODE_FIELD(coord_var_tlist);
- COPY_NODE_FIELD(query_var_tlist);
+ COPY_STRING_FIELD(outer_alias);
+ COPY_STRING_FIELD(inner_alias);
+ COPY_SCALAR_FIELD(outer_reduce_level);
+ COPY_SCALAR_FIELD(inner_reduce_level);
+ COPY_BITMAPSET_FIELD(outer_relids);
+ COPY_BITMAPSET_FIELD(inner_relids);
+ COPY_STRING_FIELD(inner_statement);
+ COPY_STRING_FIELD(outer_statement);
+ COPY_STRING_FIELD(join_condition);
COPY_SCALAR_FIELD(has_row_marks);
COPY_SCALAR_FIELD(has_ins_child_sel_parent);
- COPY_SCALAR_FIELD(rq_params_internal);
return newnode;
}
@@ -1079,6 +1106,7 @@ _copySimpleSort(const SimpleSort *from)
{
COPY_POINTER_FIELD(sortColIdx, from->numCols * sizeof(AttrNumber));
COPY_POINTER_FIELD(sortOperators, from->numCols * sizeof(Oid));
+ COPY_POINTER_FIELD(sortCollations, from->numCols * sizeof(Oid));
COPY_POINTER_FIELD(nullsFirst, from->numCols * sizeof(bool));
}
@@ -1086,6 +1114,55 @@ _copySimpleSort(const SimpleSort *from)
}
#endif
+
+#ifdef XCP
+/*
+ * _copyRemoteSubplan
+ */
+static RemoteSubplan *
+_copyRemoteSubplan(const RemoteSubplan *from)
+{
+ RemoteSubplan *newnode = makeNode(RemoteSubplan);
+
+ /*
+ * copy node superclass fields
+ */
+ CopyScanFields((Scan *) from, (Scan *) newnode);
+
+ /*
+ * copy remainder of node
+ */
+ COPY_SCALAR_FIELD(distributionType);
+ COPY_SCALAR_FIELD(distributionKey);
+ COPY_NODE_FIELD(distributionNodes);
+ COPY_NODE_FIELD(distributionRestrict);
+ COPY_NODE_FIELD(nodeList);
+ COPY_SCALAR_FIELD(execOnAll);
+ COPY_NODE_FIELD(sort);
+ COPY_STRING_FIELD(cursor);
+ COPY_SCALAR_FIELD(unique);
+
+ return newnode;
+}
+
+/*
+ * _copyDistribution
+ */
+static Distribution *
+_copyDistribution(const Distribution *from)
+{
+ Distribution *newnode = makeNode(Distribution);
+
+ COPY_SCALAR_FIELD(distributionType);
+ COPY_NODE_FIELD(distributionExpr);
+ COPY_BITMAPSET_FIELD(nodes);
+ COPY_BITMAPSET_FIELD(restrictNodes);
+
+ return newnode;
+}
+#endif
+
+
/* ****************************************************************
* primnodes.h copy functions
* ****************************************************************
@@ -1241,8 +1318,10 @@ _copyAggref(const Aggref *from)
COPY_SCALAR_FIELD(aggfnoid);
COPY_SCALAR_FIELD(aggtype);
#ifdef PGXC
+#ifndef XCP
COPY_SCALAR_FIELD(aggtrantype);
COPY_SCALAR_FIELD(agghas_collectfn);
+#endif /* XCP */
#endif /* PGXC */
COPY_SCALAR_FIELD(aggcollid);
COPY_SCALAR_FIELD(inputcollid);
@@ -2069,8 +2148,10 @@ _copyRangeTblEntry(const RangeTblEntry *from)
COPY_SCALAR_FIELD(rtekind);
#ifdef PGXC
+#ifndef XCP
COPY_STRING_FIELD(relname);
#endif
+#endif
COPY_SCALAR_FIELD(relid);
COPY_SCALAR_FIELD(relkind);
@@ -2554,9 +2635,11 @@ _copyQuery(const Query *from)
COPY_NODE_FIELD(setOperations);
COPY_NODE_FIELD(constraintDeps);
#ifdef PGXC
+#ifndef XCP
COPY_STRING_FIELD(sql_statement);
COPY_SCALAR_FIELD(is_ins_child_sel_parent);
#endif
+#endif
return newnode;
}
@@ -3926,6 +4009,17 @@ _copyBarrierStmt(const BarrierStmt *from)
return newnode;
}
+#ifdef XCP
+static PauseClusterStmt *
+_copyPauseClusterStmt(const PauseClusterStmt *from)
+{
+ PauseClusterStmt *newnode = makeNode(PauseClusterStmt);
+
+ COPY_SCALAR_FIELD(pause);
+
+ return newnode;
+}
+#endif
/* ****************************************************************
* nodemgr.h copy functions
* ****************************************************************
@@ -4162,6 +4256,14 @@ copyObject(const void *from)
retval = _copySimpleSort(from);
break;
#endif
+#ifdef XCP
+ case T_RemoteSubplan:
+ retval = _copyRemoteSubplan(from);
+ break;
+ case T_Distribution:
+ retval = _copyDistribution(from);
+ break;
+#endif
/*
* PRIMITIVE NODES
*/
@@ -4609,6 +4711,11 @@ copyObject(const void *from)
case T_BarrierStmt:
retval = _copyBarrierStmt(from);
break;
+#ifdef XCP
+ case T_PauseClusterStmt:
+ retval = _copyPauseClusterStmt(from);
+ break;
+#endif
case T_AlterNodeStmt:
retval = _copyAlterNodeStmt(from);
break;
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c
index 34885297bf..6817fe73a1 100644
--- a/src/backend/nodes/equalfuncs.c
+++ b/src/backend/nodes/equalfuncs.c
@@ -18,6 +18,11 @@
* "x" to be considered equal() to another reference to "x" in the query.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -186,11 +191,17 @@ static bool
_equalAggref(const Aggref *a, const Aggref *b)
{
COMPARE_SCALAR_FIELD(aggfnoid);
+#ifndef XCP
+ /*
+ * In XCP ignore aggtype difference because Phase 1 of aggregate have
+ * aggtype set to aggtrantype
+ */
COMPARE_SCALAR_FIELD(aggtype);
#ifdef PGXC
COMPARE_SCALAR_FIELD(aggtrantype);
COMPARE_SCALAR_FIELD(agghas_collectfn);
#endif /* PGXC */
+#endif /* XCP */
COMPARE_SCALAR_FIELD(aggcollid);
COMPARE_SCALAR_FIELD(inputcollid);
COMPARE_NODE_FIELD(args);
@@ -931,8 +942,10 @@ _equalQuery(const Query *a, const Query *b)
COMPARE_NODE_FIELD(constraintDeps);
#ifdef PGXC
+#ifndef XCP
COMPARE_SCALAR_FIELD(is_ins_child_sel_parent);
#endif
+#endif
return true;
}
@@ -2366,6 +2379,18 @@ _equalXmlSerialize(const XmlSerialize *a, const XmlSerialize *b)
return true;
}
+#ifdef XCP
+static bool
+_equalDistribution(Distribution *a, Distribution *b)
+{
+ COMPARE_SCALAR_FIELD(distributionType);
+ COMPARE_NODE_FIELD(distributionExpr);
+ COMPARE_BITMAPSET_FIELD(nodes);
+ COMPARE_BITMAPSET_FIELD(restrictNodes); /* keep in sync with _copyDistribution */
+ return true;
+}
+#endif
+
/*
* Stuff from pg_list.h
*/
@@ -2467,6 +2492,17 @@ _equalBarrierStmt(const BarrierStmt *a, const BarrierStmt *b)
return true;
}
+#ifdef XCP
+/*
+ * Lock Cluster stuff
+ */
+static bool
+_equalPauseClusterStmt(PauseClusterStmt *a, PauseClusterStmt *b)
+{
+ COMPARE_SCALAR_FIELD(pause);
+ return true;
+}
+#endif
/*
* stuff from nodemgr.h
*/
@@ -2989,6 +3025,11 @@ equal(const void *a, const void *b)
case T_BarrierStmt:
retval = _equalBarrierStmt(a, b);
break;
+#ifdef XCP
+ case T_PauseClusterStmt:
+ retval = _equalPauseClusterStmt(a, b);
+ break;
+#endif
case T_AlterNodeStmt:
retval = _equalAlterNodeStmt(a, b);
break;
@@ -3135,6 +3176,11 @@ equal(const void *a, const void *b)
case T_XmlSerialize:
retval = _equalXmlSerialize(a, b);
break;
+#ifdef XCP
+ case T_Distribution:
+ retval = _equalDistribution(a, b);
+ break;
+#endif
default:
elog(ERROR, "unrecognized node type: %d",
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 080047c8e7..efb751176f 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -3,6 +3,11 @@
* outfuncs.c
* Output functions for Postgres tree nodes.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -26,9 +31,35 @@
#include "lib/stringinfo.h"
#include "nodes/plannodes.h"
#include "nodes/relation.h"
+#ifdef XCP
+#include "fmgr.h"
+#include "miscadmin.h"
+#include "catalog/namespace.h"
+#include "pgxc/execRemote.h"
+#include "utils/lsyscache.h"
+#endif
#include "utils/datum.h"
#ifdef PGXC
-#include "optimizer/pgxcplan.h"
+#include "pgxc/planner.h"
+#endif
+
+#ifdef XCP
+/*
+ * When we sending query plans between nodes we need to send OIDs of various
+ * objects - relations, data types, functions, etc.
+ * On different nodes OIDs of these objects may differ, so we need to send an
+ * identifier, depending on object type, allowing to lookup OID on target node.
+ * On the other hand we want to save space when storing rules, or in other cases
+ * when we need to encode and decode nodes on the same node.
+ * For now default format is not portable, as it is in original Postgres code.
+ * Later we may want to add extra parameter in nodeToString() function
+ */
+static bool portable_output = false;
+void
+set_portable_output(bool value)
+{
+ portable_output = value;
+}
#endif
@@ -51,9 +82,16 @@
#define WRITE_UINT_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname)
+#ifdef XCP
+/* Only allow output OIDs in not portable mode */
+#define WRITE_OID_FIELD(fldname) \
+ (AssertMacro(!portable_output), \
+ appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname))
+#else
/* Write an OID field (don't hard-wire assumption that OID is same as uint) */
#define WRITE_OID_FIELD(fldname) \
appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname)
+#endif
/* Write a long-integer field */
#define WRITE_LONG_FIELD(fldname) \
@@ -96,6 +134,98 @@
(appendStringInfo(str, " :" CppAsString(fldname) " "), \
_outBitmapset(str, node->fldname))
+#ifdef XCP
+#define NSP_NAME(oid) \
+ isTempNamespace(oid) ? "pg_temp" : get_namespace_name(oid)
+/*
+ * Macros to encode OIDs to send to other nodes. Objects on other nodes may have
+ * different OIDs, so send instead an unique identifier allowing to lookup
+ * the OID on target node. The identifier depends on object type.
+ */
+
+/* write an OID which is a relation OID */
+#define WRITE_RELID_FIELD(fldname) \
+ (appendStringInfo(str, " :" CppAsString(fldname) " "), \
+ _outToken(str, OidIsValid(node->fldname) ? NSP_NAME(get_rel_namespace(node->fldname)) : NULL), \
+ appendStringInfoChar(str, ' '), \
+ _outToken(str, OidIsValid(node->fldname) ? get_rel_name(node->fldname) : NULL))
+
+/* write an OID which is a data type OID */
+#define WRITE_TYPID_FIELD(fldname) \
+ (appendStringInfo(str, " :" CppAsString(fldname) " "), \
+ _outToken(str, OidIsValid(node->fldname) ? NSP_NAME(get_typ_namespace(node->fldname)) : NULL), \
+ appendStringInfoChar(str, ' '), \
+ _outToken(str, OidIsValid(node->fldname) ? get_typ_name(node->fldname) : NULL))
+
+/* write an OID which is a function OID */
+#define WRITE_FUNCID_FIELD(fldname) \
+ do { \
+ appendStringInfo(str, " :" CppAsString(fldname) " "); \
+ if (OidIsValid(node->fldname)) \
+ { \
+ Oid *argtypes; \
+ int i, nargs; \
+ _outToken(str, NSP_NAME(get_func_namespace(node->fldname))); \
+ appendStringInfoChar(str, ' '); \
+ _outToken(str, get_func_name(node->fldname)); \
+ appendStringInfoChar(str, ' '); \
+ get_func_signature(node->fldname, &argtypes, &nargs); \
+ appendStringInfo(str, "%d", nargs); \
+ for (i = 0; i < nargs; i++) \
+ { \
+ appendStringInfoChar(str, ' '); \
+ _outToken(str, NSP_NAME(get_typ_namespace(argtypes[i]))); \
+ appendStringInfoChar(str, ' '); \
+ _outToken(str, get_typ_name(argtypes[i])); \
+ } \
+ } \
+ else \
+ appendStringInfo(str, "<> <> 0"); \
+ } while (0)
+
+/* write an OID which is an operator OID */
+#define WRITE_OPERID_FIELD(fldname) \
+ do { \
+ appendStringInfo(str, " :" CppAsString(fldname) " "); \
+ if (OidIsValid(node->fldname)) \
+ { \
+ Oid oprleft, oprright; \
+ _outToken(str, NSP_NAME(get_opnamespace(node->fldname))); \
+ appendStringInfoChar(str, ' '); \
+ _outToken(str, get_opname(node->fldname)); \
+ appendStringInfoChar(str, ' '); \
+ op_input_types(node->fldname, &oprleft, &oprright); \
+ _outToken(str, OidIsValid(oprleft) ? \
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL); \
+ appendStringInfoChar(str, ' '); \
+ _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL); \
+ appendStringInfoChar(str, ' '); \
+ _outToken(str, OidIsValid(oprright) ? \
+ NSP_NAME(get_typ_namespace(oprright)) : NULL); \
+ appendStringInfoChar(str, ' '); \
+ _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL); \
+ appendStringInfoChar(str, ' '); \
+ } \
+ else \
+ appendStringInfo(str, "<> <> <> <> <> <>"); \
+ } while (0)
+
+/* write an OID which is a collation OID */
+#define WRITE_COLLID_FIELD(fldname) \
+ do { \
+ appendStringInfo(str, " :" CppAsString(fldname) " "); \
+ if (OidIsValid(node->fldname)) \
+ { \
+ _outToken(str, NSP_NAME(get_collation_namespace(node->fldname))); \
+ appendStringInfoChar(str, ' '); \
+ _outToken(str, get_collation_name(node->fldname)); \
+ appendStringInfo(str, " %d", get_collation_encoding(node->fldname)); \
+ } \
+ else \
+ appendStringInfo(str, "<> <> -1"); \
+ } while (0)
+
+#endif
#define booltostr(x) ((x) ? "true" : "false")
@@ -235,6 +365,48 @@ _outDatum(StringInfo str, Datum value, int typlen, bool typbyval)
}
+#ifdef XCP
+/*
+ * Output value in text format
+ */
+static void
+_printDatum(StringInfo str, Datum value, Oid typid)
+{
+ Oid typOutput;
+ bool typIsVarlena;
+ FmgrInfo finfo;
+ Datum tmpval;
+ char *textvalue;
+ int saveDateStyle;
+
+ /* Get output function for the type */
+ getTypeOutputInfo(typid, &typOutput, &typIsVarlena);
+ fmgr_info(typOutput, &finfo);
+
+ /* Detoast value if needed */
+ if (typIsVarlena)
+ tmpval = PointerGetDatum(PG_DETOAST_DATUM(value));
+ else
+ tmpval = value;
+
+ /*
+ * It was found that if configuration setting for date style is
+ * "postgres,ymd" the output dates have format DD-MM-YYYY and they can not
+ * be parsed correctly by receiving party. So force ISO format YYYY-MM-DD
+ * in internal cluster communications, these values are always parsed
+ * correctly.
+ */
+ saveDateStyle = DateStyle;
+ DateStyle = USE_ISO_DATES;
+
+ textvalue = DatumGetCString(FunctionCall1(&finfo, tmpval));
+ _outToken(str, textvalue);
+
+ DateStyle = saveDateStyle;
+}
+#endif
+
+
/*
* Stuff from plannodes.h
*/
@@ -339,8 +511,10 @@ _outModifyTable(StringInfo str, const ModifyTable *node)
WRITE_NODE_FIELD(rowMarks);
WRITE_INT_FIELD(epqParam);
#ifdef PGXC
+#ifndef XCP
WRITE_NODE_FIELD(remote_plans);
#endif
+#endif
}
static void
@@ -372,10 +546,52 @@ _outMergeAppend(StringInfo str, const MergeAppend *node)
appendStringInfo(str, " :sortOperators");
for (i = 0; i < node->numCols; i++)
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->sortOperators[i];
+ Oid oprleft, oprright;
+ /* Sort operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
+ _outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->sortOperators[i]);
appendStringInfo(str, " :collations");
for (i = 0; i < node->numCols; i++)
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid coll = node->collations[i];
+ if (OidIsValid(coll))
+ {
+ appendStringInfoChar(str, ' ');
+ _outToken(str, NSP_NAME(get_collation_namespace(coll)));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, get_collation_name(coll));
+ appendStringInfo(str, " %d", get_collation_encoding(coll));
+ }
+ else
+ appendStringInfo(str, " <> <> -1");
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->collations[i]);
appendStringInfo(str, " :nullsFirst");
@@ -401,6 +617,32 @@ _outRecursiveUnion(StringInfo str, const RecursiveUnion *node)
appendStringInfo(str, " :dupOperators");
for (i = 0; i < node->numCols; i++)
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->dupOperators[i];
+ Oid oprleft, oprright;
+ /* Unique operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
+ _outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->dupOperators[i]);
WRITE_LONG_FIELD(numGroups);
@@ -449,6 +691,11 @@ _outIndexScan(StringInfo str, const IndexScan *node)
_outScanInfo(str, (const Scan *) node);
+#ifdef XCP
+ if (portable_output)
+ WRITE_RELID_FIELD(indexid);
+ else
+#endif
WRITE_OID_FIELD(indexid);
WRITE_NODE_FIELD(indexqual);
WRITE_NODE_FIELD(indexqualorig);
@@ -475,22 +722,18 @@ _outRemoteQuery(StringInfo str, const RemoteQuery *node)
WRITE_BOOL_FIELD(force_autocommit);
WRITE_STRING_FIELD(statement);
WRITE_STRING_FIELD(cursor);
- WRITE_INT_FIELD(rq_num_params);
+ WRITE_INT_FIELD(remote_num_params);
- appendStringInfo(str, " :rq_param_types");
- for (i = 0; i < node->rq_num_params; i++)
- appendStringInfo(str, " %d", node->rq_param_types[i]);
+ appendStringInfo(str, " :remote_param_types");
+ for (i = 0; i < node->remote_num_params; i++)
+ appendStringInfo(str, " %d", node->remote_param_types[i]);
WRITE_ENUM_FIELD(exec_type, RemoteQueryExecType);
+#ifndef XCP
WRITE_BOOL_FIELD(is_temp);
+#endif
WRITE_BOOL_FIELD(has_row_marks);
- WRITE_BOOL_FIELD(rq_finalise_aggs);
- WRITE_BOOL_FIELD(rq_sortgroup_colno);
- WRITE_NODE_FIELD(remote_query);
- WRITE_NODE_FIELD(coord_var_tlist);
- WRITE_NODE_FIELD(query_var_tlist);
WRITE_BOOL_FIELD(has_ins_child_sel_parent);
- WRITE_BOOL_FIELD(rq_params_internal);
}
static void
@@ -514,6 +757,11 @@ _outIndexOnlyScan(StringInfo str, const IndexOnlyScan *node)
_outScanInfo(str, (const Scan *) node);
+#ifdef XCP
+ if (portable_output)
+ WRITE_RELID_FIELD(indexid);
+ else
+#endif
WRITE_OID_FIELD(indexid);
WRITE_NODE_FIELD(indexqual);
WRITE_NODE_FIELD(indexorderby);
@@ -528,6 +776,11 @@ _outBitmapIndexScan(StringInfo str, const BitmapIndexScan *node)
_outScanInfo(str, (const Scan *) node);
+#ifdef XCP
+ if (portable_output)
+ WRITE_RELID_FIELD(indexid);
+ else
+#endif
WRITE_OID_FIELD(indexid);
WRITE_NODE_FIELD(indexqual);
WRITE_NODE_FIELD(indexqualorig);
@@ -658,6 +911,23 @@ _outMergeJoin(StringInfo str, const MergeJoin *node)
appendStringInfo(str, " :mergeCollations");
for (i = 0; i < numCols; i++)
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid coll = node->mergeCollations[i];
+ if (OidIsValid(coll))
+ {
+ appendStringInfoChar(str, ' ');
+ _outToken(str, NSP_NAME(get_collation_namespace(coll)));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, get_collation_name(coll));
+ appendStringInfo(str, " %d", get_collation_encoding(coll));
+ }
+ else
+ appendStringInfo(str, " <> <> -1");
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->mergeCollations[i]);
appendStringInfo(str, " :mergeStrategies");
@@ -689,6 +959,9 @@ _outAgg(StringInfo str, const Agg *node)
_outPlanInfo(str, (const Plan *) node);
WRITE_ENUM_FIELD(aggstrategy, AggStrategy);
+#ifdef XCP
+ WRITE_ENUM_FIELD(aggdistribution, AggDistribution);
+#endif
WRITE_INT_FIELD(numCols);
appendStringInfo(str, " :grpColIdx");
@@ -697,6 +970,32 @@ _outAgg(StringInfo str, const Agg *node)
appendStringInfo(str, " :grpOperators");
for (i = 0; i < node->numCols; i++)
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->grpOperators[i];
+ Oid oprleft, oprright;
+ /* Group operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
+ _outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->grpOperators[i]);
WRITE_LONG_FIELD(numGroups);
@@ -720,6 +1019,32 @@ _outWindowAgg(StringInfo str, const WindowAgg *node)
appendStringInfo(str, " :partOperations");
for (i = 0; i < node->partNumCols; i++)
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->partOperators[i];
+ Oid oprleft, oprright;
+ /* The operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
+ _outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->partOperators[i]);
WRITE_INT_FIELD(ordNumCols);
@@ -730,6 +1055,32 @@ _outWindowAgg(StringInfo str, const WindowAgg *node)
appendStringInfo(str, " :ordOperations");
for (i = 0; i < node->ordNumCols; i++)
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->ordOperators[i];
+ Oid oprleft, oprright;
+ /* Group operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
+ _outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->ordOperators[i]);
WRITE_INT_FIELD(frameOptions);
@@ -754,6 +1105,32 @@ _outGroup(StringInfo str, const Group *node)
appendStringInfo(str, " :grpOperators");
for (i = 0; i < node->numCols; i++)
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->grpOperators[i];
+ Oid oprleft, oprright;
+ /* Group operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
+ _outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->grpOperators[i]);
}
@@ -782,10 +1159,52 @@ _outSort(StringInfo str, const Sort *node)
appendStringInfo(str, " :sortOperators");
for (i = 0; i < node->numCols; i++)
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->sortOperators[i];
+ Oid oprleft, oprright;
+ /* Sort operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
+ _outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->sortOperators[i]);
appendStringInfo(str, " :collations");
for (i = 0; i < node->numCols; i++)
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid coll = node->collations[i];
+ if (OidIsValid(coll))
+ {
+ appendStringInfoChar(str, ' ');
+ _outToken(str, NSP_NAME(get_collation_namespace(coll)));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, get_collation_name(coll));
+ appendStringInfo(str, " %d", get_collation_encoding(coll));
+ }
+ else
+ appendStringInfo(str, " <> <> -1");
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->collations[i]);
appendStringInfo(str, " :nullsFirst");
@@ -810,6 +1229,32 @@ _outUnique(StringInfo str, const Unique *node)
appendStringInfo(str, " :uniqOperators");
for (i = 0; i < node->numCols; i++)
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->uniqOperators[i];
+ Oid oprleft, oprright;
+ /* Unique operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
+ _outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->uniqOperators[i]);
}
@@ -820,9 +1265,19 @@ _outHash(StringInfo str, const Hash *node)
_outPlanInfo(str, (const Plan *) node);
+#ifdef XCP
+ if (portable_output)
+ WRITE_RELID_FIELD(skewTable);
+ else
+#endif
WRITE_OID_FIELD(skewTable);
WRITE_INT_FIELD(skewColumn);
WRITE_BOOL_FIELD(skewInherit);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(skewColType);
+ else
+#endif
WRITE_OID_FIELD(skewColType);
WRITE_INT_FIELD(skewColTypmod);
}
@@ -846,6 +1301,32 @@ _outSetOp(StringInfo str, const SetOp *node)
appendStringInfo(str, " :dupOperators");
for (i = 0; i < node->numCols; i++)
+#ifdef XCP
+ if (portable_output)
+ {
+ Oid oper = node->dupOperators[i];
+ Oid oprleft, oprright;
+		/* Duplicate-check operator is always valid */
+ Assert(OidIsValid(oper));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, NSP_NAME(get_opnamespace(oper)));
+ appendStringInfoChar(str, ' ');
+ _outToken(str, get_opname(oper));
+ appendStringInfoChar(str, ' ');
+ op_input_types(oper, &oprleft, &oprright);
+ _outToken(str, OidIsValid(oprleft) ?
+ NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ?
+ NSP_NAME(get_typ_namespace(oprright)) : NULL);
+ appendStringInfoChar(str, ' ');
+ _outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+ appendStringInfoChar(str, ' ');
+ }
+ else
+#endif
appendStringInfo(str, " %u", node->dupOperators[i]);
WRITE_INT_FIELD(flagColIdx);
@@ -875,6 +1356,135 @@ _outLimit(StringInfo str, const Limit *node)
WRITE_NODE_FIELD(limitCount);
}
+#ifdef XCP
+/*
+ * _outRemoteSubplan
+ *	Serialize a RemoteSubplan plan node (XCP distributed execution):
+ *	the Scan-derived plan fields plus the result-distribution description.
+ */
+static void
+_outRemoteSubplan(StringInfo str, const RemoteSubplan *node)
+{
+	WRITE_NODE_TYPE("REMOTESUBPLAN");
+
+	/* RemoteSubplan is a Scan derivative; emit the common Scan fields
+	 * (cast drops const to match _outScanInfo's signature) */
+	_outScanInfo(str, (Scan *) node);
+
+	/* how the subplan's result rows are distributed across nodes */
+	WRITE_CHAR_FIELD(distributionType);
+	WRITE_INT_FIELD(distributionKey);
+	WRITE_NODE_FIELD(distributionNodes);
+	WRITE_NODE_FIELD(distributionRestrict);
+	WRITE_NODE_FIELD(nodeList);
+	WRITE_BOOL_FIELD(execOnAll);
+	/* optional sort specification for merging per-node result streams */
+	WRITE_NODE_FIELD(sort);
+	WRITE_STRING_FIELD(cursor);
+	WRITE_INT_FIELD(unique);
+}
+
+/*
+ * _outRemoteStmt
+ *	Serialize a RemoteStmt node: a complete statement (plan tree plus
+ *	supporting lists and parameter descriptions) to be shipped to a
+ *	remote node for execution.
+ */
+static void
+_outRemoteStmt(StringInfo str, const RemoteStmt *node)
+{
+	int i;
+
+	WRITE_NODE_TYPE("REMOTESTMT");
+
+	WRITE_ENUM_FIELD(commandType, CmdType);
+	WRITE_BOOL_FIELD(hasReturning);
+	WRITE_NODE_FIELD(planTree);
+	WRITE_NODE_FIELD(rtable);
+	WRITE_NODE_FIELD(resultRelations);
+	WRITE_NODE_FIELD(subplans);
+	WRITE_INT_FIELD(nParamExec);
+	WRITE_INT_FIELD(nParamRemote);
+
+	/* remoteparams is a plain C array, not a List: write it element by
+	 * element with explicit field labels */
+	for (i = 0; i < node->nParamRemote; i++)
+	{
+		RemoteParam *rparam = &(node->remoteparams[i]);
+		appendStringInfo(str, " :paramkind");
+		appendStringInfo(str, " %d", (int) rparam->paramkind);
+
+		appendStringInfo(str, " :paramid");
+		appendStringInfo(str, " %d", rparam->paramid);
+
+		appendStringInfo(str, " :paramtype");
+		if (portable_output)
+		{
+			/* emit the type as a namespace-qualified name so the
+			 * receiving node can resolve its own, possibly different, OID */
+			Oid ptype = rparam->paramtype;
+			Assert(OidIsValid(ptype));
+			appendStringInfoChar(str, ' ');
+			_outToken(str, NSP_NAME(get_typ_namespace(ptype)));
+			appendStringInfoChar(str, ' ');
+			_outToken(str, get_typ_name(ptype));
+		}
+		else
+			appendStringInfo(str, " %u", rparam->paramtype);
+	}
+	WRITE_NODE_FIELD(rowMarks);
+	WRITE_CHAR_FIELD(distributionType);
+	WRITE_INT_FIELD(distributionKey);
+	WRITE_NODE_FIELD(distributionNodes);
+	WRITE_NODE_FIELD(distributionRestrict);
+}
+
+/*
+ * _outSimpleSort
+ *	Serialize a SimpleSort node: per-column sort column indexes, sort
+ *	operators, collations and nulls-first flags.  In portable mode
+ *	operators and collations are written as qualified names (plus operand
+ *	types / encoding) so the remote node can resolve its own OIDs;
+ *	otherwise raw numeric OIDs are written.
+ */
+static void
+_outSimpleSort(StringInfo str, const SimpleSort *node)
+{
+	int i;
+
+	WRITE_NODE_TYPE("SIMPLESORT");
+
+	WRITE_INT_FIELD(numCols);
+
+	appendStringInfo(str, " :sortColIdx");
+	for (i = 0; i < node->numCols; i++)
+		appendStringInfo(str, " %d", node->sortColIdx[i]);
+
+	appendStringInfo(str, " :sortOperators");
+	for (i = 0; i < node->numCols; i++)
+		if (portable_output)
+		{
+			/* write operator namespace, name, and both operand type
+			 * (namespace, name) pairs; a missing operand type is <> */
+			Oid oper = node->sortOperators[i];
+			Oid oprleft, oprright;
+			/* Sort operator is always valid */
+			Assert(OidIsValid(oper));
+			appendStringInfoChar(str, ' ');
+			_outToken(str, NSP_NAME(get_opnamespace(oper)));
+			appendStringInfoChar(str, ' ');
+			_outToken(str, get_opname(oper));
+			appendStringInfoChar(str, ' ');
+			op_input_types(oper, &oprleft, &oprright);
+			_outToken(str, OidIsValid(oprleft) ?
+							NSP_NAME(get_typ_namespace(oprleft)) : NULL);
+			appendStringInfoChar(str, ' ');
+			_outToken(str, OidIsValid(oprleft) ? get_typ_name(oprleft) : NULL);
+			appendStringInfoChar(str, ' ');
+			_outToken(str, OidIsValid(oprright) ?
+							NSP_NAME(get_typ_namespace(oprright)) : NULL);
+			appendStringInfoChar(str, ' ');
+			_outToken(str, OidIsValid(oprright) ? get_typ_name(oprright) : NULL);
+		}
+		else
+			appendStringInfo(str, " %u", node->sortOperators[i]);
+
+	appendStringInfo(str, " :sortCollations");
+	for (i = 0; i < node->numCols; i++)
+		if (portable_output)
+		{
+			Oid coll = node->sortCollations[i];
+			if (OidIsValid(coll))
+			{
+				appendStringInfoChar(str, ' ');
+				_outToken(str, NSP_NAME(get_collation_namespace(coll)));
+				appendStringInfoChar(str, ' ');
+				_outToken(str, get_collation_name(coll));
+				appendStringInfo(str, " %d", get_collation_encoding(coll));
+			}
+			else
+				/* invalid collation: two NULL tokens and encoding -1 */
+				appendStringInfo(str, " <> <> -1");
+		}
+		else
+			appendStringInfo(str, " %u", node->sortCollations[i]);
+
+	appendStringInfo(str, " :nullsFirst");
+	for (i = 0; i < node->numCols; i++)
+		appendStringInfo(str, " %s", booltostr(node->nullsFirst[i]));
+}
+#endif
+
static void
_outNestLoopParam(StringInfo str, const NestLoopParam *node)
{
@@ -958,8 +1568,18 @@ _outVar(StringInfo str, const Var *node)
WRITE_UINT_FIELD(varno);
WRITE_INT_FIELD(varattno);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(vartype);
+ else
+#endif
WRITE_OID_FIELD(vartype);
WRITE_INT_FIELD(vartypmod);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(varcollid);
+ else
+#endif
WRITE_OID_FIELD(varcollid);
WRITE_UINT_FIELD(varlevelsup);
WRITE_UINT_FIELD(varnoold);
@@ -972,8 +1592,18 @@ _outConst(StringInfo str, const Const *node)
{
WRITE_NODE_TYPE("CONST");
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(consttype);
+ else
+#endif
WRITE_OID_FIELD(consttype);
WRITE_INT_FIELD(consttypmod);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(constcollid);
+ else
+#endif
WRITE_OID_FIELD(constcollid);
WRITE_INT_FIELD(constlen);
WRITE_BOOL_FIELD(constbyval);
@@ -984,6 +1614,11 @@ _outConst(StringInfo str, const Const *node)
if (node->constisnull)
appendStringInfo(str, "<>");
else
+#ifdef XCP
+ if (portable_output)
+ _printDatum(str, node->constvalue, node->consttype);
+ else
+#endif
_outDatum(str, node->constvalue, node->constlen, node->constbyval);
}
@@ -994,8 +1629,18 @@ _outParam(StringInfo str, const Param *node)
WRITE_ENUM_FIELD(paramkind, ParamKind);
WRITE_INT_FIELD(paramid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(paramtype);
+ else
+#endif
WRITE_OID_FIELD(paramtype);
WRITE_INT_FIELD(paramtypmod);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(paramcollid);
+ else
+#endif
WRITE_OID_FIELD(paramcollid);
WRITE_LOCATION_FIELD(location);
}
@@ -1005,13 +1650,35 @@ _outAggref(StringInfo str, const Aggref *node)
{
WRITE_NODE_TYPE("AGGREF");
+#ifdef XCP
+ if (portable_output)
+ WRITE_FUNCID_FIELD(aggfnoid);
+ else
+#endif
WRITE_OID_FIELD(aggfnoid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(aggtype);
+ else
+#endif
WRITE_OID_FIELD(aggtype);
#ifdef PGXC
+#ifndef XCP
WRITE_OID_FIELD(aggtrantype);
WRITE_BOOL_FIELD(agghas_collectfn);
+#endif /* XCP */
#endif /* PGXC */
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(aggcollid);
+ else
+#endif
WRITE_OID_FIELD(aggcollid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(inputcollid);
+ else
+#endif
WRITE_OID_FIELD(inputcollid);
WRITE_NODE_FIELD(args);
WRITE_NODE_FIELD(aggorder);
@@ -1026,9 +1693,29 @@ _outWindowFunc(StringInfo str, const WindowFunc *node)
{
WRITE_NODE_TYPE("WINDOWFUNC");
+#ifdef XCP
+ if (portable_output)
+ WRITE_FUNCID_FIELD(winfnoid);
+ else
+#endif
WRITE_OID_FIELD(winfnoid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(wintype);
+ else
+#endif
WRITE_OID_FIELD(wintype);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(wincollid);
+ else
+#endif
WRITE_OID_FIELD(wincollid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(inputcollid);
+ else
+#endif
WRITE_OID_FIELD(inputcollid);
WRITE_NODE_FIELD(args);
WRITE_UINT_FIELD(winref);
@@ -1042,9 +1729,24 @@ _outArrayRef(StringInfo str, const ArrayRef *node)
{
WRITE_NODE_TYPE("ARRAYREF");
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(refarraytype);
+ else
+#endif
WRITE_OID_FIELD(refarraytype);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(refelemtype);
+ else
+#endif
WRITE_OID_FIELD(refelemtype);
WRITE_INT_FIELD(reftypmod);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(refcollid);
+ else
+#endif
WRITE_OID_FIELD(refcollid);
WRITE_NODE_FIELD(refupperindexpr);
WRITE_NODE_FIELD(reflowerindexpr);
@@ -1057,11 +1759,31 @@ _outFuncExpr(StringInfo str, const FuncExpr *node)
{
WRITE_NODE_TYPE("FUNCEXPR");
+#ifdef XCP
+ if (portable_output)
+ WRITE_FUNCID_FIELD(funcid);
+ else
+#endif
WRITE_OID_FIELD(funcid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(funcresulttype);
+ else
+#endif
WRITE_OID_FIELD(funcresulttype);
WRITE_BOOL_FIELD(funcretset);
WRITE_ENUM_FIELD(funcformat, CoercionForm);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(funccollid);
+ else
+#endif
WRITE_OID_FIELD(funccollid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(inputcollid);
+ else
+#endif
WRITE_OID_FIELD(inputcollid);
WRITE_NODE_FIELD(args);
WRITE_LOCATION_FIELD(location);
@@ -1083,11 +1805,36 @@ _outOpExpr(StringInfo str, const OpExpr *node)
{
WRITE_NODE_TYPE("OPEXPR");
+#ifdef XCP
+ if (portable_output)
+ WRITE_OPERID_FIELD(opno);
+ else
+#endif
WRITE_OID_FIELD(opno);
+#ifdef XCP
+ if (portable_output)
+ WRITE_FUNCID_FIELD(opfuncid);
+ else
+#endif
WRITE_OID_FIELD(opfuncid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(opresulttype);
+ else
+#endif
WRITE_OID_FIELD(opresulttype);
WRITE_BOOL_FIELD(opretset);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(opcollid);
+ else
+#endif
WRITE_OID_FIELD(opcollid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(inputcollid);
+ else
+#endif
WRITE_OID_FIELD(inputcollid);
WRITE_NODE_FIELD(args);
WRITE_LOCATION_FIELD(location);
@@ -1098,11 +1845,36 @@ _outDistinctExpr(StringInfo str, const DistinctExpr *node)
{
WRITE_NODE_TYPE("DISTINCTEXPR");
+#ifdef XCP
+ if (portable_output)
+ WRITE_OPERID_FIELD(opno);
+ else
+#endif
WRITE_OID_FIELD(opno);
+#ifdef XCP
+ if (portable_output)
+ WRITE_FUNCID_FIELD(opfuncid);
+ else
+#endif
WRITE_OID_FIELD(opfuncid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(opresulttype);
+ else
+#endif
WRITE_OID_FIELD(opresulttype);
WRITE_BOOL_FIELD(opretset);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(opcollid);
+ else
+#endif
WRITE_OID_FIELD(opcollid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(inputcollid);
+ else
+#endif
WRITE_OID_FIELD(inputcollid);
WRITE_NODE_FIELD(args);
WRITE_LOCATION_FIELD(location);
@@ -1113,11 +1885,36 @@ _outNullIfExpr(StringInfo str, const NullIfExpr *node)
{
WRITE_NODE_TYPE("NULLIFEXPR");
+#ifdef XCP
+ if (portable_output)
+ WRITE_OPERID_FIELD(opno);
+ else
+#endif
WRITE_OID_FIELD(opno);
+#ifdef XCP
+ if (portable_output)
+ WRITE_FUNCID_FIELD(opfuncid);
+ else
+#endif
WRITE_OID_FIELD(opfuncid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(opresulttype);
+ else
+#endif
WRITE_OID_FIELD(opresulttype);
WRITE_BOOL_FIELD(opretset);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(opcollid);
+ else
+#endif
WRITE_OID_FIELD(opcollid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(inputcollid);
+ else
+#endif
WRITE_OID_FIELD(inputcollid);
WRITE_NODE_FIELD(args);
WRITE_LOCATION_FIELD(location);
@@ -1128,9 +1925,24 @@ _outScalarArrayOpExpr(StringInfo str, const ScalarArrayOpExpr *node)
{
WRITE_NODE_TYPE("SCALARARRAYOPEXPR");
+#ifdef XCP
+ if (portable_output)
+ WRITE_OPERID_FIELD(opno);
+ else
+#endif
WRITE_OID_FIELD(opno);
+#ifdef XCP
+ if (portable_output)
+ WRITE_FUNCID_FIELD(opfuncid);
+ else
+#endif
WRITE_OID_FIELD(opfuncid);
WRITE_BOOL_FIELD(useOr);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(inputcollid);
+ else
+#endif
WRITE_OID_FIELD(inputcollid);
WRITE_NODE_FIELD(args);
WRITE_LOCATION_FIELD(location);
@@ -1185,8 +1997,18 @@ _outSubPlan(StringInfo str, const SubPlan *node)
WRITE_NODE_FIELD(paramIds);
WRITE_INT_FIELD(plan_id);
WRITE_STRING_FIELD(plan_name);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(firstColType);
+ else
+#endif
WRITE_OID_FIELD(firstColType);
WRITE_INT_FIELD(firstColTypmod);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(firstColCollation);
+ else
+#endif
WRITE_OID_FIELD(firstColCollation);
WRITE_BOOL_FIELD(useHashTable);
WRITE_BOOL_FIELD(unknownEqFalse);
@@ -1212,8 +2034,18 @@ _outFieldSelect(StringInfo str, const FieldSelect *node)
WRITE_NODE_FIELD(arg);
WRITE_INT_FIELD(fieldnum);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(resulttype);
+ else
+#endif
WRITE_OID_FIELD(resulttype);
WRITE_INT_FIELD(resulttypmod);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(resultcollid);
+ else
+#endif
WRITE_OID_FIELD(resultcollid);
}
@@ -1225,6 +2057,11 @@ _outFieldStore(StringInfo str, const FieldStore *node)
WRITE_NODE_FIELD(arg);
WRITE_NODE_FIELD(newvals);
WRITE_NODE_FIELD(fieldnums);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(resulttype);
+ else
+#endif
WRITE_OID_FIELD(resulttype);
}
@@ -1234,8 +2071,18 @@ _outRelabelType(StringInfo str, const RelabelType *node)
WRITE_NODE_TYPE("RELABELTYPE");
WRITE_NODE_FIELD(arg);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(resulttype);
+ else
+#endif
WRITE_OID_FIELD(resulttype);
WRITE_INT_FIELD(resulttypmod);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(resultcollid);
+ else
+#endif
WRITE_OID_FIELD(resultcollid);
WRITE_ENUM_FIELD(relabelformat, CoercionForm);
WRITE_LOCATION_FIELD(location);
@@ -1247,7 +2094,17 @@ _outCoerceViaIO(StringInfo str, const CoerceViaIO *node)
WRITE_NODE_TYPE("COERCEVIAIO");
WRITE_NODE_FIELD(arg);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(resulttype);
+ else
+#endif
WRITE_OID_FIELD(resulttype);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(resultcollid);
+ else
+#endif
WRITE_OID_FIELD(resultcollid);
WRITE_ENUM_FIELD(coerceformat, CoercionForm);
WRITE_LOCATION_FIELD(location);
@@ -1259,9 +2116,24 @@ _outArrayCoerceExpr(StringInfo str, const ArrayCoerceExpr *node)
WRITE_NODE_TYPE("ARRAYCOERCEEXPR");
WRITE_NODE_FIELD(arg);
+#ifdef XCP
+ if (portable_output)
+ WRITE_FUNCID_FIELD(elemfuncid);
+ else
+#endif
WRITE_OID_FIELD(elemfuncid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(resulttype);
+ else
+#endif
WRITE_OID_FIELD(resulttype);
WRITE_INT_FIELD(resulttypmod);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(resultcollid);
+ else
+#endif
WRITE_OID_FIELD(resultcollid);
WRITE_BOOL_FIELD(isExplicit);
WRITE_ENUM_FIELD(coerceformat, CoercionForm);
@@ -1274,6 +2146,11 @@ _outConvertRowtypeExpr(StringInfo str, const ConvertRowtypeExpr *node)
WRITE_NODE_TYPE("CONVERTROWTYPEEXPR");
WRITE_NODE_FIELD(arg);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(resulttype);
+ else
+#endif
WRITE_OID_FIELD(resulttype);
WRITE_ENUM_FIELD(convertformat, CoercionForm);
WRITE_LOCATION_FIELD(location);
@@ -1294,7 +2171,17 @@ _outCaseExpr(StringInfo str, const CaseExpr *node)
{
WRITE_NODE_TYPE("CASE");
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(casetype);
+ else
+#endif
WRITE_OID_FIELD(casetype);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(casecollid);
+ else
+#endif
WRITE_OID_FIELD(casecollid);
WRITE_NODE_FIELD(arg);
WRITE_NODE_FIELD(args);
@@ -1317,8 +2204,18 @@ _outCaseTestExpr(StringInfo str, const CaseTestExpr *node)
{
WRITE_NODE_TYPE("CASETESTEXPR");
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(typeId);
+ else
+#endif
WRITE_OID_FIELD(typeId);
WRITE_INT_FIELD(typeMod);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(collation);
+ else
+#endif
WRITE_OID_FIELD(collation);
}
@@ -1327,8 +2224,23 @@ _outArrayExpr(StringInfo str, const ArrayExpr *node)
{
WRITE_NODE_TYPE("ARRAY");
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(array_typeid);
+ else
+#endif
WRITE_OID_FIELD(array_typeid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(array_collid);
+ else
+#endif
WRITE_OID_FIELD(array_collid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(element_typeid);
+ else
+#endif
WRITE_OID_FIELD(element_typeid);
WRITE_NODE_FIELD(elements);
WRITE_BOOL_FIELD(multidims);
@@ -1341,6 +2253,11 @@ _outRowExpr(StringInfo str, const RowExpr *node)
WRITE_NODE_TYPE("ROW");
WRITE_NODE_FIELD(args);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(row_typeid);
+ else
+#endif
WRITE_OID_FIELD(row_typeid);
WRITE_ENUM_FIELD(row_format, CoercionForm);
WRITE_NODE_FIELD(colnames);
@@ -1365,7 +2282,17 @@ _outCoalesceExpr(StringInfo str, const CoalesceExpr *node)
{
WRITE_NODE_TYPE("COALESCE");
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(coalescetype);
+ else
+#endif
WRITE_OID_FIELD(coalescetype);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(coalescecollid);
+ else
+#endif
WRITE_OID_FIELD(coalescecollid);
WRITE_NODE_FIELD(args);
WRITE_LOCATION_FIELD(location);
@@ -1376,8 +2303,23 @@ _outMinMaxExpr(StringInfo str, const MinMaxExpr *node)
{
WRITE_NODE_TYPE("MINMAX");
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(minmaxtype);
+ else
+#endif
WRITE_OID_FIELD(minmaxtype);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(minmaxcollid);
+ else
+#endif
WRITE_OID_FIELD(minmaxcollid);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(inputcollid);
+ else
+#endif
WRITE_OID_FIELD(inputcollid);
WRITE_ENUM_FIELD(op, MinMaxOp);
WRITE_NODE_FIELD(args);
@@ -1395,6 +2337,11 @@ _outXmlExpr(StringInfo str, const XmlExpr *node)
WRITE_NODE_FIELD(arg_names);
WRITE_NODE_FIELD(args);
WRITE_ENUM_FIELD(xmloption, XmlOptionType);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(type);
+ else
+#endif
WRITE_OID_FIELD(type);
WRITE_INT_FIELD(typmod);
WRITE_LOCATION_FIELD(location);
@@ -1425,8 +2372,18 @@ _outCoerceToDomain(StringInfo str, const CoerceToDomain *node)
WRITE_NODE_TYPE("COERCETODOMAIN");
WRITE_NODE_FIELD(arg);
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(resulttype);
+ else
+#endif
WRITE_OID_FIELD(resulttype);
WRITE_INT_FIELD(resulttypmod);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(resultcollid);
+ else
+#endif
WRITE_OID_FIELD(resultcollid);
WRITE_ENUM_FIELD(coercionformat, CoercionForm);
WRITE_LOCATION_FIELD(location);
@@ -1437,8 +2394,18 @@ _outCoerceToDomainValue(StringInfo str, const CoerceToDomainValue *node)
{
WRITE_NODE_TYPE("COERCETODOMAINVALUE");
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(typeId);
+ else
+#endif
WRITE_OID_FIELD(typeId);
WRITE_INT_FIELD(typeMod);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(collation);
+ else
+#endif
WRITE_OID_FIELD(collation);
WRITE_LOCATION_FIELD(location);
}
@@ -1448,8 +2415,18 @@ _outSetToDefault(StringInfo str, const SetToDefault *node)
{
WRITE_NODE_TYPE("SETTODEFAULT");
+#ifdef XCP
+ if (portable_output)
+ WRITE_TYPID_FIELD(typeId);
+ else
+#endif
WRITE_OID_FIELD(typeId);
WRITE_INT_FIELD(typeMod);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(collation);
+ else
+#endif
WRITE_OID_FIELD(collation);
WRITE_LOCATION_FIELD(location);
}
@@ -1473,6 +2450,11 @@ _outTargetEntry(StringInfo str, const TargetEntry *node)
WRITE_INT_FIELD(resno);
WRITE_STRING_FIELD(resname);
WRITE_UINT_FIELD(ressortgroupref);
+#ifdef XCP
+ if (portable_output)
+ WRITE_RELID_FIELD(resorigtbl);
+ else
+#endif
WRITE_OID_FIELD(resorigtbl);
WRITE_INT_FIELD(resorigcol);
WRITE_BOOL_FIELD(resjunk);
@@ -1777,9 +2759,11 @@ _outPlannerInfo(StringInfo str, const PlannerInfo *node)
WRITE_BOOL_FIELD(hasPseudoConstantQuals);
WRITE_BOOL_FIELD(hasRecursion);
#ifdef PGXC
+#ifndef XCP
WRITE_INT_FIELD(rs_alias_index);
WRITE_NODE_FIELD(xc_rowMarks);
-#endif
+#endif /* XCP */
+#endif /* PGXC */
WRITE_INT_FIELD(wt_param_id);
WRITE_BITMAPSET_FIELD(curOuterRels);
WRITE_NODE_FIELD(curOuterParams);
@@ -1853,6 +2837,11 @@ _outEquivalenceClass(StringInfo str, const EquivalenceClass *node)
WRITE_NODE_TYPE("EQUIVALENCECLASS");
WRITE_NODE_FIELD(ec_opfamilies);
+#ifdef XCP
+ if (portable_output)
+ WRITE_COLLID_FIELD(ec_collation);
+ else
+#endif
WRITE_OID_FIELD(ec_collation);
WRITE_NODE_FIELD(ec_members);
WRITE_NODE_FIELD(ec_sources);
@@ -1964,6 +2953,11 @@ _outAppendRelInfo(StringInfo str, const AppendRelInfo *node)
WRITE_OID_FIELD(parent_reltype);
WRITE_OID_FIELD(child_reltype);
WRITE_NODE_FIELD(translated_vars);
+#ifdef XCP
+ if (portable_output)
+ WRITE_RELID_FIELD(parent_reloid);
+ else
+#endif
WRITE_OID_FIELD(parent_reloid);
}
@@ -2303,7 +3297,17 @@ _outSortGroupClause(StringInfo str, const SortGroupClause *node)
WRITE_NODE_TYPE("SORTGROUPCLAUSE");
WRITE_UINT_FIELD(tleSortGroupRef);
+#ifdef XCP
+ if (portable_output)
+ WRITE_OPERID_FIELD(eqop);
+ else
+#endif
WRITE_OID_FIELD(eqop);
+#ifdef XCP
+ if (portable_output)
+ WRITE_OPERID_FIELD(sortop);
+ else
+#endif
WRITE_OID_FIELD(sortop);
WRITE_BOOL_FIELD(nulls_first);
WRITE_BOOL_FIELD(hashable);
@@ -2388,12 +3392,19 @@ _outRangeTblEntry(StringInfo str, const RangeTblEntry *node)
WRITE_NODE_FIELD(eref);
WRITE_ENUM_FIELD(rtekind, RTEKind);
#ifdef PGXC
+#ifndef XCP
WRITE_STRING_FIELD(relname);
#endif
+#endif
switch (node->rtekind)
{
case RTE_RELATION:
+#ifdef XCP
+ if (portable_output)
+ WRITE_RELID_FIELD(relid);
+ else
+#endif
WRITE_OID_FIELD(relid);
WRITE_CHAR_FIELD(relkind);
break;
@@ -2436,6 +3447,12 @@ _outRangeTblEntry(StringInfo str, const RangeTblEntry *node)
WRITE_BOOL_FIELD(inh);
WRITE_BOOL_FIELD(inFromCl);
WRITE_UINT_FIELD(requiredPerms);
+#ifdef XCP
+	/* no permission check on the data node; it is considered trusted */
+ if (portable_output)
+ appendStringInfo(str, " :checkAsUser %u", InvalidOid);
+ else
+#endif
WRITE_OID_FIELD(checkAsUser);
WRITE_BITMAPSET_FIELD(selectedCols);
WRITE_BITMAPSET_FIELD(modifiedCols);
@@ -2889,6 +3906,17 @@ _outNode(StringInfo str, const void *obj)
case T_NestLoopParam:
_outNestLoopParam(str, obj);
break;
+#ifdef XCP
+ case T_RemoteSubplan:
+ _outRemoteSubplan(str, obj);
+ break;
+ case T_RemoteStmt:
+ _outRemoteStmt(str, obj);
+ break;
+ case T_SimpleSort:
+ _outSimpleSort(str, obj);
+ break;
+#endif
case T_PlanRowMark:
_outPlanRowMark(str, obj);
break;
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 91b2bf26b1..ef59ee1c4a 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -3,6 +3,11 @@
* readfuncs.c
* Reader functions for Postgres tree nodes.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -34,6 +39,32 @@
#ifdef PGXC
#include "access/htup.h"
#endif
+#ifdef XCP
+#include "fmgr.h"
+#include "catalog/namespace.h"
+#include "nodes/plannodes.h"
+#include "pgxc/execRemote.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+
+
+/*
+ * When we send query plans between nodes we need to send OIDs of various
+ * objects - relations, data types, functions, etc.
+ * On different nodes the OIDs of these objects may differ, so we send an
+ * identifier which, depending on the object type, allows the OID to be
+ * looked up on the target node.
+ * On the other hand we want to save space when storing rules, or in other
+ * cases when we need to encode and decode nodes on the same node.
+ * For now the default format is not portable, as in the original Postgres
+ * code.  Later we may want to add an extra parameter to stringToNode().
+ */
+/*
+ * When true, OID-carrying node fields are read back as portable object
+ * identifiers (qualified names) and resolved to local OIDs; when false,
+ * the plain numeric-OID format of stock Postgres is expected.
+ */
+static bool portable_input = false;
+
+/*
+ * set_portable_input
+ *	Select the input format for subsequent node-string parsing.
+ */
+void
+set_portable_input(bool value)
+{
+	portable_input = value;
+}
+#endif /* XCP */
/*
* Macros to simplify reading of different kinds of fields. Use these
@@ -71,11 +102,27 @@
token = pg_strtok(&length); /* get field value */ \
local_node->fldname = atoui(token)
+#ifdef XCP
+/* Read a long integer field (anything written as ":fldname %ld") */
+#define READ_LONG_FIELD(fldname) \
+ token = pg_strtok(&length); /* skip :fldname */ \
+ token = pg_strtok(&length); /* get field value */ \
+ local_node->fldname = atol(token)
+#endif
+
/* Read an OID field (don't hard-wire assumption that OID is same as uint) */
+#ifdef XCP
+#define READ_OID_FIELD(fldname) \
+ (AssertMacro(!portable_input), /* only allow to read OIDs within a node */ \
+ token = pg_strtok(&length), /* skip :fldname */ \
+ token = pg_strtok(&length), /* get field value */ \
+ local_node->fldname = atooid(token))
+#else
#define READ_OID_FIELD(fldname) \
token = pg_strtok(&length); /* skip :fldname */ \
token = pg_strtok(&length); /* get field value */ \
local_node->fldname = atooid(token)
+#endif
/* Read a char field (ie, one ascii character) */
#define READ_CHAR_FIELD(fldname) \
@@ -123,6 +170,203 @@
token = pg_strtok(&length); /* skip :fldname */ \
local_node->fldname = _readBitmapset()
+#ifdef XCP
+/*
+ * Read the fields common to every Plan node into an already-allocated
+ * local_node; declares plan_node as a Plan-typed alias of local_node so
+ * the caller's subsequent READ_* macros can still use local_node.
+ * Field order must match the corresponding writer in outfuncs.c.
+ */
+#define READ_PLAN_FIELDS(nodeTypeName) \
+	Plan *plan_node; \
+	READ_LOCALS(nodeTypeName); \
+	plan_node = (Plan *) local_node; \
+	token = pg_strtok(&length);		/* skip :startup_cost */ \
+	token = pg_strtok(&length);		/* get field value */ \
+	plan_node->startup_cost = atof(token); \
+	token = pg_strtok(&length);		/* skip :total_cost */ \
+	token = pg_strtok(&length);		/* get field value */ \
+	plan_node->total_cost = atof(token); \
+	token = pg_strtok(&length);		/* skip :plan_rows */ \
+	token = pg_strtok(&length);		/* get field value */ \
+	plan_node->plan_rows = atof(token); \
+	token = pg_strtok(&length);		/* skip :plan_width */ \
+	token = pg_strtok(&length);		/* get field value */ \
+	plan_node->plan_width = atoi(token); \
+	token = pg_strtok(&length);		/* skip :targetlist */ \
+	plan_node->targetlist = nodeRead(NULL, 0); \
+	token = pg_strtok(&length);		/* skip :qual */ \
+	plan_node->qual = nodeRead(NULL, 0); \
+	token = pg_strtok(&length);		/* skip :lefttree */ \
+	plan_node->lefttree = nodeRead(NULL, 0); \
+	token = pg_strtok(&length);		/* skip :righttree */ \
+	plan_node->righttree = nodeRead(NULL, 0); \
+	token = pg_strtok(&length);		/* skip :initPlan */ \
+	plan_node->initPlan = nodeRead(NULL, 0); \
+	token = pg_strtok(&length);		/* skip :extParam */ \
+	plan_node->extParam = _readBitmapset(); \
+	token = pg_strtok(&length);		/* skip :allParam */ \
+	plan_node->allParam = _readBitmapset()
+
+/* Read the Plan fields plus the Scan-specific scanrelid */
+#define READ_SCAN_FIELDS(nodeTypeName) \
+	Scan *scan_node; \
+	READ_PLAN_FIELDS(nodeTypeName); \
+	scan_node = (Scan *) local_node; \
+	token = pg_strtok(&length);		/* skip :scanrelid */ \
+	token = pg_strtok(&length);		/* get field value */ \
+	scan_node->scanrelid = atoi(token)
+
+/* Read the Plan fields plus the Join-specific jointype and joinqual */
+#define READ_JOIN_FIELDS(nodeTypeName) \
+	Join *join_node; \
+	READ_PLAN_FIELDS(nodeTypeName); \
+	join_node = (Join *) local_node; \
+	token = pg_strtok(&length);		/* skip :jointype */ \
+	token = pg_strtok(&length);		/* get field value */ \
+	join_node->jointype = (JoinType) atoi(token); \
+	token = pg_strtok(&length);		/* skip :joinqual */ \
+	join_node->joinqual = nodeRead(NULL, 0)
+
+/*
+ * Macros to read a portable object identifier and look up the local OID.
+ * The identifier format depends on the object type.  A NULL name token
+ * (written as <>) maps to InvalidOid.
+ */
+#define NSP_OID(nspname) LookupNamespaceNoError(nspname)
+
+/* Read a (namespace, relation name) pair and resolve it to a relation OID */
+#define READ_RELID_FIELD(fldname) \
+	do { \
+		char	   *nspname;	/* namespace name */ \
+		char	   *relname;	/* relation name */ \
+		token = pg_strtok(&length);		/* skip :fldname */ \
+		token = pg_strtok(&length);		/* get nspname */ \
+		nspname = nullable_string(token, length); \
+		token = pg_strtok(&length);		/* get relname */ \
+		relname = nullable_string(token, length); \
+		if (relname) \
+			local_node->fldname = get_relname_relid(relname, \
+													NSP_OID(nspname)); \
+		else \
+			local_node->fldname = InvalidOid; \
+	} while (0)
+
+/* Read a (namespace, type name) pair and resolve it to a type OID */
+#define READ_TYPID_FIELD(fldname) \
+	do { \
+		char	   *nspname;	/* namespace name */ \
+		char	   *typname;	/* data type name */ \
+		token = pg_strtok(&length);		/* skip :fldname */ \
+		token = pg_strtok(&length);		/* get nspname */ \
+		nspname = nullable_string(token, length); \
+		token = pg_strtok(&length);		/* get typname */ \
+		typname = nullable_string(token, length); \
+		if (typname) \
+			local_node->fldname = get_typname_typid(typname, \
+													NSP_OID(nspname)); \
+		else \
+			local_node->fldname = InvalidOid; \
+	} while (0)
+
+/*
+ * Read a function identifier — namespace, name, argument count, then one
+ * (namespace, name) pair per argument type — resolve the argument types,
+ * and look up the function OID by name and signature.
+ */
+#define READ_FUNCID_FIELD(fldname) \
+	do { \
+		char	   *nspname;	/* namespace name */ \
+		char	   *funcname;	/* function name */ \
+		int			nargs;		/* number of arguments */ \
+		Oid		   *argtypes;	/* argument types */ \
+		token = pg_strtok(&length);		/* skip :fldname */ \
+		token = pg_strtok(&length);		/* get nspname */ \
+		nspname = nullable_string(token, length); \
+		token = pg_strtok(&length);		/* get funcname */ \
+		funcname = nullable_string(token, length); \
+		token = pg_strtok(&length);		/* get nargs */ \
+		nargs = atoi(token); \
+		if (funcname) \
+		{ \
+			int		i; \
+			argtypes = palloc(nargs * sizeof(Oid)); \
+			for (i = 0; i < nargs; i++) \
+			{ \
+				char *typnspname;	/* argument type namespace */ \
+				char *typname;		/* argument type name */ \
+				token = pg_strtok(&length);	/* get type nspname */ \
+				typnspname = nullable_string(token, length); \
+				token = pg_strtok(&length); /* get type name */ \
+				typname = nullable_string(token, length); \
+				argtypes[i] = get_typname_typid(typname, \
+												NSP_OID(typnspname)); \
+			} \
+			local_node->fldname = get_funcid(funcname, \
+											 buildoidvector(argtypes, nargs), \
+											 NSP_OID(nspname)); \
+		} \
+		else \
+			local_node->fldname = InvalidOid; \
+	} while (0)
+
+/*
+ * Read an operator identifier — namespace, name, then (namespace, name)
+ * pairs for the left and right operand types — and resolve the operator
+ * OID.  Operand-type tokens are always present; <> marks a missing side
+ * (e.g. a unary operator).
+ */
+#define READ_OPERID_FIELD(fldname) \
+	do { \
+		char	   *nspname;	/* namespace name */ \
+		char	   *oprname;	/* operator name */ \
+		char	   *leftnspname; /* left type namespace */ \
+		char	   *leftname;	/* left type name */ \
+		Oid			oprleft;	/* left type */ \
+		char	   *rightnspname; /* right type namespace */ \
+		char	   *rightname;	/* right type name */ \
+		Oid			oprright;	/* right type */ \
+		token = pg_strtok(&length);		/* skip :fldname */ \
+		token = pg_strtok(&length);		/* get nspname */ \
+		nspname = nullable_string(token, length); \
+		token = pg_strtok(&length);		/* get operator name */ \
+		oprname = nullable_string(token, length); \
+		token = pg_strtok(&length);		/* left type namespace */ \
+		leftnspname = nullable_string(token, length); \
+		token = pg_strtok(&length);		/* left type name */ \
+		leftname = nullable_string(token, length); \
+		token = pg_strtok(&length);		/* right type namespace */ \
+		rightnspname = nullable_string(token, length); \
+		token = pg_strtok(&length);		/* right type name */ \
+		rightname = nullable_string(token, length); \
+		if (oprname) \
+		{ \
+			if (leftname) \
+				oprleft = get_typname_typid(leftname, \
+											NSP_OID(leftnspname)); \
+			else \
+				oprleft = InvalidOid; \
+			if (rightname) \
+				oprright = get_typname_typid(rightname, \
+											 NSP_OID(rightnspname)); \
+			else \
+				oprright = InvalidOid; \
+			local_node->fldname = get_operid(oprname, \
+											 oprleft, \
+											 oprright, \
+											 NSP_OID(nspname)); \
+		} \
+		else \
+			local_node->fldname = InvalidOid; \
+	} while (0)
+
+/*
+ * Read a collation identifier — namespace, name and encoding — and
+ * resolve the collation OID (encoding disambiguates same-named
+ * collations; -1 accompanies the <> NULL-name case).
+ */
+#define READ_COLLID_FIELD(fldname) \
+	do { \
+		char	   *nspname;	/* namespace name */ \
+		char	   *collname;	/* collation name */ \
+		int			collencoding; /* collation encoding */ \
+		token = pg_strtok(&length);		/* skip :fldname */ \
+		token = pg_strtok(&length);		/* get nspname */ \
+		nspname = nullable_string(token, length); \
+		token = pg_strtok(&length);		/* get collname */ \
+		collname = nullable_string(token, length); \
+		token = pg_strtok(&length);		/* get collencoding */ \
+		collencoding = atoi(token); \
+		if (collname) \
+			local_node->fldname = get_collid(collname, \
+											 collencoding, \
+											 NSP_OID(nspname)); \
+		else \
+			local_node->fldname = InvalidOid; \
+	} while (0)
+#endif
+
/* Routine exit */
#define READ_DONE() \
return local_node
@@ -145,6 +389,9 @@
static Datum readDatum(bool typbyval);
+#ifdef XCP
+static Datum scanDatum(Oid typid, int typmod);
+#endif
/*
* _readBitmapset
@@ -266,7 +513,17 @@ _readSortGroupClause(void)
READ_LOCALS(SortGroupClause);
READ_UINT_FIELD(tleSortGroupRef);
+#ifdef XCP
+ if (portable_input)
+ READ_OPERID_FIELD(eqop);
+ else
+#endif
READ_OID_FIELD(eqop);
+#ifdef XCP
+ if (portable_input)
+ READ_OPERID_FIELD(sortop);
+ else
+#endif
READ_OID_FIELD(sortop);
READ_BOOL_FIELD(nulls_first);
READ_BOOL_FIELD(hashable);
@@ -412,8 +669,18 @@ _readVar(void)
READ_UINT_FIELD(varno);
READ_INT_FIELD(varattno);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(vartype);
+ else
+#endif
READ_OID_FIELD(vartype);
READ_INT_FIELD(vartypmod);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(varcollid);
+ else
+#endif
READ_OID_FIELD(varcollid);
READ_UINT_FIELD(varlevelsup);
READ_UINT_FIELD(varnoold);
@@ -431,8 +698,18 @@ _readConst(void)
{
READ_LOCALS(Const);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(consttype);
+ else
+#endif
READ_OID_FIELD(consttype);
READ_INT_FIELD(consttypmod);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(constcollid);
+ else
+#endif
READ_OID_FIELD(constcollid);
READ_INT_FIELD(constlen);
READ_BOOL_FIELD(constbyval);
@@ -443,6 +720,12 @@ _readConst(void)
if (local_node->constisnull)
token = pg_strtok(&length); /* skip "<>" */
else
+#ifdef XCP
+ if (portable_input)
+ local_node->constvalue = scanDatum(local_node->consttype,
+ local_node->consttypmod);
+ else
+#endif
local_node->constvalue = readDatum(local_node->constbyval);
READ_DONE();
@@ -458,8 +741,18 @@ _readParam(void)
READ_ENUM_FIELD(paramkind, ParamKind);
READ_INT_FIELD(paramid);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(paramtype);
+ else
+#endif
READ_OID_FIELD(paramtype);
READ_INT_FIELD(paramtypmod);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(paramcollid);
+ else
+#endif
READ_OID_FIELD(paramcollid);
READ_LOCATION_FIELD(location);
@@ -474,13 +767,35 @@ _readAggref(void)
{
READ_LOCALS(Aggref);
+#ifdef XCP
+ if (portable_input)
+ READ_FUNCID_FIELD(aggfnoid);
+ else
+#endif
READ_OID_FIELD(aggfnoid);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(aggtype);
+ else
+#endif
READ_OID_FIELD(aggtype);
#ifdef PGXC
+#ifndef XCP
READ_OID_FIELD(aggtrantype);
READ_BOOL_FIELD(agghas_collectfn);
+#endif /* XCP */
#endif /* PGXC */
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(aggcollid);
+ else
+#endif
READ_OID_FIELD(aggcollid);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(inputcollid);
+ else
+#endif
READ_OID_FIELD(inputcollid);
READ_NODE_FIELD(args);
READ_NODE_FIELD(aggorder);
@@ -500,9 +815,29 @@ _readWindowFunc(void)
{
READ_LOCALS(WindowFunc);
+#ifdef XCP
+ if (portable_input)
+ READ_FUNCID_FIELD(winfnoid);
+ else
+#endif
READ_OID_FIELD(winfnoid);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(wintype);
+ else
+#endif
READ_OID_FIELD(wintype);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(wincollid);
+ else
+#endif
READ_OID_FIELD(wincollid);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(inputcollid);
+ else
+#endif
READ_OID_FIELD(inputcollid);
READ_NODE_FIELD(args);
READ_UINT_FIELD(winref);
@@ -521,9 +856,24 @@ _readArrayRef(void)
{
READ_LOCALS(ArrayRef);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(refarraytype);
+ else
+#endif
READ_OID_FIELD(refarraytype);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(refelemtype);
+ else
+#endif
READ_OID_FIELD(refelemtype);
READ_INT_FIELD(reftypmod);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(refcollid);
+ else
+#endif
READ_OID_FIELD(refcollid);
READ_NODE_FIELD(refupperindexpr);
READ_NODE_FIELD(reflowerindexpr);
@@ -541,11 +891,31 @@ _readFuncExpr(void)
{
READ_LOCALS(FuncExpr);
+#ifdef XCP
+ if (portable_input)
+ READ_FUNCID_FIELD(funcid);
+ else
+#endif
READ_OID_FIELD(funcid);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(funcresulttype);
+ else
+#endif
READ_OID_FIELD(funcresulttype);
READ_BOOL_FIELD(funcretset);
READ_ENUM_FIELD(funcformat, CoercionForm);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(funccollid);
+ else
+#endif
READ_OID_FIELD(funccollid);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(inputcollid);
+ else
+#endif
READ_OID_FIELD(inputcollid);
READ_NODE_FIELD(args);
READ_LOCATION_FIELD(location);
@@ -577,9 +947,20 @@ _readOpExpr(void)
{
READ_LOCALS(OpExpr);
+#ifdef XCP
+ if (portable_input)
+ READ_OPERID_FIELD(opno);
+ else
+#endif
READ_OID_FIELD(opno);
+#ifdef XCP
+ if (portable_input)
+ READ_FUNCID_FIELD(opfuncid);
+ else
+#endif
READ_OID_FIELD(opfuncid);
+#ifndef XCP
/*
* The opfuncid is stored in the textual format primarily for debugging
* and documentation reasons. We want to always read it as zero to force
@@ -589,10 +970,26 @@ _readOpExpr(void)
* someday.)
*/
local_node->opfuncid = InvalidOid;
+#endif
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(opresulttype);
+ else
+#endif
READ_OID_FIELD(opresulttype);
READ_BOOL_FIELD(opretset);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(opcollid);
+ else
+#endif
READ_OID_FIELD(opcollid);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(inputcollid);
+ else
+#endif
READ_OID_FIELD(inputcollid);
READ_NODE_FIELD(args);
READ_LOCATION_FIELD(location);
@@ -608,9 +1005,20 @@ _readDistinctExpr(void)
{
READ_LOCALS(DistinctExpr);
+#ifdef XCP
+ if (portable_input)
+ READ_OPERID_FIELD(opno);
+ else
+#endif
READ_OID_FIELD(opno);
+#ifdef XCP
+ if (portable_input)
+ READ_FUNCID_FIELD(opfuncid);
+ else
+#endif
READ_OID_FIELD(opfuncid);
+#ifndef XCP
/*
* The opfuncid is stored in the textual format primarily for debugging
* and documentation reasons. We want to always read it as zero to force
@@ -620,10 +1028,26 @@ _readDistinctExpr(void)
* someday.)
*/
local_node->opfuncid = InvalidOid;
+#endif
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(opresulttype);
+ else
+#endif
READ_OID_FIELD(opresulttype);
READ_BOOL_FIELD(opretset);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(opcollid);
+ else
+#endif
READ_OID_FIELD(opcollid);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(inputcollid);
+ else
+#endif
READ_OID_FIELD(inputcollid);
READ_NODE_FIELD(args);
READ_LOCATION_FIELD(location);
@@ -639,7 +1063,17 @@ _readNullIfExpr(void)
{
READ_LOCALS(NullIfExpr);
+#ifdef XCP
+ if (portable_input)
+ READ_OPERID_FIELD(opno);
+ else
+#endif
READ_OID_FIELD(opno);
+#ifdef XCP
+ if (portable_input)
+ READ_FUNCID_FIELD(opfuncid);
+ else
+#endif
READ_OID_FIELD(opfuncid);
/*
@@ -650,11 +1084,30 @@ _readNullIfExpr(void)
* (We don't currently support an ALTER OPERATOR command, but might
* someday.)
*/
+#ifdef XCP
+ /* Do not invalidate if we have just looked up the value */
+ if (!portable_input)
+#endif
local_node->opfuncid = InvalidOid;
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(opresulttype);
+ else
+#endif
READ_OID_FIELD(opresulttype);
READ_BOOL_FIELD(opretset);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(opcollid);
+ else
+#endif
READ_OID_FIELD(opcollid);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(inputcollid);
+ else
+#endif
READ_OID_FIELD(inputcollid);
READ_NODE_FIELD(args);
READ_LOCATION_FIELD(location);
@@ -670,9 +1123,19 @@ _readScalarArrayOpExpr(void)
{
READ_LOCALS(ScalarArrayOpExpr);
+#ifdef XCP
+ if (portable_input)
+ READ_OPERID_FIELD(opno);
+ else
+#endif
READ_OID_FIELD(opno);
+#ifdef XCP
+ if (portable_input)
+ READ_FUNCID_FIELD(opfuncid);
+ else
+#endif
READ_OID_FIELD(opfuncid);
-
+#ifndef XCP
/*
* The opfuncid is stored in the textual format primarily for debugging
* and documentation reasons. We want to always read it as zero to force
@@ -682,8 +1145,14 @@ _readScalarArrayOpExpr(void)
* someday.)
*/
local_node->opfuncid = InvalidOid;
+#endif
READ_BOOL_FIELD(useOr);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(inputcollid);
+ else
+#endif
READ_OID_FIELD(inputcollid);
READ_NODE_FIELD(args);
READ_LOCATION_FIELD(location);
@@ -734,9 +1203,40 @@ _readSubLink(void)
READ_DONE();
}
+#ifdef XCP
/*
* _readSubPlan is not needed since it doesn't appear in stored rules.
*/
+static SubPlan *
+_readSubPlan(void)
+{
+ READ_LOCALS(SubPlan);
+
+ READ_ENUM_FIELD(subLinkType, SubLinkType);
+ READ_NODE_FIELD(testexpr);
+ READ_NODE_FIELD(paramIds);
+ READ_INT_FIELD(plan_id);
+ READ_STRING_FIELD(plan_name);
+ if (portable_input)
+ READ_TYPID_FIELD(firstColType);
+ else
+ READ_OID_FIELD(firstColType);
+ READ_INT_FIELD(firstColTypmod);
+ if (portable_input)
+ READ_COLLID_FIELD(firstColCollation);
+ else
+ READ_OID_FIELD(firstColCollation);
+ READ_BOOL_FIELD(useHashTable);
+ READ_BOOL_FIELD(unknownEqFalse);
+ READ_NODE_FIELD(setParam);
+ READ_NODE_FIELD(parParam);
+ READ_NODE_FIELD(args);
+ READ_FLOAT_FIELD(startup_cost);
+ READ_FLOAT_FIELD(per_call_cost);
+
+ READ_DONE();
+}
+#endif
/*
* _readFieldSelect
@@ -748,8 +1248,18 @@ _readFieldSelect(void)
READ_NODE_FIELD(arg);
READ_INT_FIELD(fieldnum);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(resulttype);
+ else
+#endif
READ_OID_FIELD(resulttype);
READ_INT_FIELD(resulttypmod);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(resultcollid);
+ else
+#endif
READ_OID_FIELD(resultcollid);
READ_DONE();
@@ -766,6 +1276,11 @@ _readFieldStore(void)
READ_NODE_FIELD(arg);
READ_NODE_FIELD(newvals);
READ_NODE_FIELD(fieldnums);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(resulttype);
+ else
+#endif
READ_OID_FIELD(resulttype);
READ_DONE();
@@ -780,8 +1295,18 @@ _readRelabelType(void)
READ_LOCALS(RelabelType);
READ_NODE_FIELD(arg);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(resulttype);
+ else
+#endif
READ_OID_FIELD(resulttype);
READ_INT_FIELD(resulttypmod);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(resultcollid);
+ else
+#endif
READ_OID_FIELD(resultcollid);
READ_ENUM_FIELD(relabelformat, CoercionForm);
READ_LOCATION_FIELD(location);
@@ -798,7 +1323,17 @@ _readCoerceViaIO(void)
READ_LOCALS(CoerceViaIO);
READ_NODE_FIELD(arg);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(resulttype);
+ else
+#endif
READ_OID_FIELD(resulttype);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(resultcollid);
+ else
+#endif
READ_OID_FIELD(resultcollid);
READ_ENUM_FIELD(coerceformat, CoercionForm);
READ_LOCATION_FIELD(location);
@@ -815,9 +1350,24 @@ _readArrayCoerceExpr(void)
READ_LOCALS(ArrayCoerceExpr);
READ_NODE_FIELD(arg);
+#ifdef XCP
+ if (portable_input)
+ READ_FUNCID_FIELD(elemfuncid);
+ else
+#endif
READ_OID_FIELD(elemfuncid);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(resulttype);
+ else
+#endif
READ_OID_FIELD(resulttype);
READ_INT_FIELD(resulttypmod);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(resultcollid);
+ else
+#endif
READ_OID_FIELD(resultcollid);
READ_BOOL_FIELD(isExplicit);
READ_ENUM_FIELD(coerceformat, CoercionForm);
@@ -835,6 +1385,11 @@ _readConvertRowtypeExpr(void)
READ_LOCALS(ConvertRowtypeExpr);
READ_NODE_FIELD(arg);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(resulttype);
+ else
+#endif
READ_OID_FIELD(resulttype);
READ_ENUM_FIELD(convertformat, CoercionForm);
READ_LOCATION_FIELD(location);
@@ -865,7 +1420,17 @@ _readCaseExpr(void)
{
READ_LOCALS(CaseExpr);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(casetype);
+ else
+#endif
READ_OID_FIELD(casetype);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(casecollid);
+ else
+#endif
READ_OID_FIELD(casecollid);
READ_NODE_FIELD(arg);
READ_NODE_FIELD(args);
@@ -898,8 +1463,18 @@ _readCaseTestExpr(void)
{
READ_LOCALS(CaseTestExpr);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(typeId);
+ else
+#endif
READ_OID_FIELD(typeId);
READ_INT_FIELD(typeMod);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(collation);
+ else
+#endif
READ_OID_FIELD(collation);
READ_DONE();
@@ -913,8 +1488,23 @@ _readArrayExpr(void)
{
READ_LOCALS(ArrayExpr);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(array_typeid);
+ else
+#endif
READ_OID_FIELD(array_typeid);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(array_collid);
+ else
+#endif
READ_OID_FIELD(array_collid);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(element_typeid);
+ else
+#endif
READ_OID_FIELD(element_typeid);
READ_NODE_FIELD(elements);
READ_BOOL_FIELD(multidims);
@@ -932,6 +1522,11 @@ _readRowExpr(void)
READ_LOCALS(RowExpr);
READ_NODE_FIELD(args);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(row_typeid);
+ else
+#endif
READ_OID_FIELD(row_typeid);
READ_ENUM_FIELD(row_format, CoercionForm);
READ_NODE_FIELD(colnames);
@@ -966,7 +1561,17 @@ _readCoalesceExpr(void)
{
READ_LOCALS(CoalesceExpr);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(coalescetype);
+ else
+#endif
READ_OID_FIELD(coalescetype);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(coalescecollid);
+ else
+#endif
READ_OID_FIELD(coalescecollid);
READ_NODE_FIELD(args);
READ_LOCATION_FIELD(location);
@@ -982,8 +1587,23 @@ _readMinMaxExpr(void)
{
READ_LOCALS(MinMaxExpr);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(minmaxtype);
+ else
+#endif
READ_OID_FIELD(minmaxtype);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(minmaxcollid);
+ else
+#endif
READ_OID_FIELD(minmaxcollid);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(inputcollid);
+ else
+#endif
READ_OID_FIELD(inputcollid);
READ_ENUM_FIELD(op, MinMaxOp);
READ_NODE_FIELD(args);
@@ -1006,6 +1626,11 @@ _readXmlExpr(void)
READ_NODE_FIELD(arg_names);
READ_NODE_FIELD(args);
READ_ENUM_FIELD(xmloption, XmlOptionType);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(type);
+ else
+#endif
READ_OID_FIELD(type);
READ_INT_FIELD(typmod);
READ_LOCATION_FIELD(location);
@@ -1051,8 +1676,18 @@ _readCoerceToDomain(void)
READ_LOCALS(CoerceToDomain);
READ_NODE_FIELD(arg);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(resulttype);
+ else
+#endif
READ_OID_FIELD(resulttype);
READ_INT_FIELD(resulttypmod);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(resultcollid);
+ else
+#endif
READ_OID_FIELD(resultcollid);
READ_ENUM_FIELD(coercionformat, CoercionForm);
READ_LOCATION_FIELD(location);
@@ -1068,8 +1703,18 @@ _readCoerceToDomainValue(void)
{
READ_LOCALS(CoerceToDomainValue);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(typeId);
+ else
+#endif
READ_OID_FIELD(typeId);
READ_INT_FIELD(typeMod);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(collation);
+ else
+#endif
READ_OID_FIELD(collation);
READ_LOCATION_FIELD(location);
@@ -1084,8 +1729,18 @@ _readSetToDefault(void)
{
READ_LOCALS(SetToDefault);
+#ifdef XCP
+ if (portable_input)
+ READ_TYPID_FIELD(typeId);
+ else
+#endif
READ_OID_FIELD(typeId);
READ_INT_FIELD(typeMod);
+#ifdef XCP
+ if (portable_input)
+ READ_COLLID_FIELD(collation);
+ else
+#endif
READ_OID_FIELD(collation);
READ_LOCATION_FIELD(location);
@@ -1119,6 +1774,11 @@ _readTargetEntry(void)
READ_INT_FIELD(resno);
READ_STRING_FIELD(resname);
READ_UINT_FIELD(ressortgroupref);
+#ifdef XCP
+ if (portable_input)
+ READ_RELID_FIELD(resorigtbl);
+ else
+#endif
READ_OID_FIELD(resorigtbl);
READ_INT_FIELD(resorigcol);
READ_BOOL_FIELD(resjunk);
@@ -1191,12 +1851,19 @@ _readRangeTblEntry(void)
READ_NODE_FIELD(eref);
READ_ENUM_FIELD(rtekind, RTEKind);
#ifdef PGXC
+#ifndef XCP
READ_STRING_FIELD(relname);
#endif
+#endif
switch (local_node->rtekind)
{
case RTE_RELATION:
+#ifdef XCP
+ if (portable_input)
+ READ_RELID_FIELD(relid);
+ else
+#endif
READ_OID_FIELD(relid);
READ_CHAR_FIELD(relkind);
break;
@@ -1240,6 +1907,16 @@ _readRangeTblEntry(void)
READ_BOOL_FIELD(inh);
READ_BOOL_FIELD(inFromCl);
READ_UINT_FIELD(requiredPerms);
+#ifdef XCP
+ if (portable_input)
+ {
+ local_node->requiredPerms = 0; /* no permission checks on data node */
+			token = pg_strtok(&length);	/* skip :fldname */
+			token = pg_strtok(&length);	/* skip field value */
+ local_node->checkAsUser = InvalidOid;
+ }
+ else
+#endif
READ_OID_FIELD(checkAsUser);
READ_BITMAPSET_FIELD(selectedCols);
READ_BITMAPSET_FIELD(modifiedCols);
@@ -1248,6 +1925,1310 @@ _readRangeTblEntry(void)
}
+#ifdef XCP
+/*
+ * _readPlan
+ */
+static Plan *
+_readPlan(void)
+{
+ READ_PLAN_FIELDS(Plan);
+
+ READ_DONE();
+}
+
+
+
+/*
+ * _readResult
+ */
+static Result *
+_readResult(void)
+{
+ READ_PLAN_FIELDS(Result);
+
+ READ_NODE_FIELD(resconstantqual);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readModifyTable
+ */
+static ModifyTable *
+_readModifyTable(void)
+{
+ READ_PLAN_FIELDS(ModifyTable);
+
+ READ_ENUM_FIELD(operation, CmdType);
+ READ_BOOL_FIELD(canSetTag);
+ READ_NODE_FIELD(resultRelations);
+ READ_INT_FIELD(resultRelIndex);
+ READ_NODE_FIELD(plans);
+ READ_NODE_FIELD(returningLists);
+ READ_NODE_FIELD(rowMarks);
+ READ_INT_FIELD(epqParam);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readAppend
+ */
+static Append *
+_readAppend(void)
+{
+ READ_PLAN_FIELDS(Append);
+
+ READ_NODE_FIELD(appendplans);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readMergeAppend
+ */
+static MergeAppend *
+_readMergeAppend(void)
+{
+ int i;
+ READ_PLAN_FIELDS(MergeAppend);
+
+ READ_NODE_FIELD(mergeplans);
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :sortColIdx */
+ local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->sortColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :sortOperators */
+ local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->sortOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->sortOperators[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :collations */
+ local_node->collations = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *collname; /* collation name */
+ int collencoding; /* collation encoding */
+ /* the token is already read */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collname */
+ collname = nullable_string(token, length);
+			token = pg_strtok(&length); /* get collencoding */
+ collencoding = atoi(token);
+ if (collname)
+ local_node->collations[i] = get_collid(collname,
+ collencoding,
+ NSP_OID(nspname));
+ else
+ local_node->collations[i] = InvalidOid;
+ }
+ else
+ local_node->collations[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :nullsFirst */
+ local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->nullsFirst[i] = strtobool(token);
+ }
+
+ READ_DONE();
+}
+
+
+/*
+ * _readRecursiveUnion
+ */
+static RecursiveUnion *
+_readRecursiveUnion(void)
+{
+ int i;
+ READ_PLAN_FIELDS(RecursiveUnion);
+
+ READ_INT_FIELD(wtParam);
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :dupColIdx */
+ local_node->dupColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->dupColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :dupOperators */
+ local_node->dupOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->dupOperators[i] = atooid(token);
+ }
+
+ READ_LONG_FIELD(numGroups);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readBitmapAnd
+ */
+static BitmapAnd *
+_readBitmapAnd(void)
+{
+ READ_PLAN_FIELDS(BitmapAnd);
+
+ READ_NODE_FIELD(bitmapplans);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readBitmapOr
+ */
+static BitmapOr *
+_readBitmapOr(void)
+{
+ READ_PLAN_FIELDS(BitmapOr);
+
+ READ_NODE_FIELD(bitmapplans);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readScan
+ */
+static Scan *
+_readScan(void)
+{
+ READ_SCAN_FIELDS(Scan);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readSeqScan
+ */
+static SeqScan *
+_readSeqScan(void)
+{
+ READ_SCAN_FIELDS(SeqScan);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readIndexScan
+ */
+static IndexScan *
+_readIndexScan(void)
+{
+ READ_SCAN_FIELDS(IndexScan);
+
+ if (portable_input)
+ READ_RELID_FIELD(indexid);
+ else
+ READ_OID_FIELD(indexid);
+ READ_NODE_FIELD(indexqual);
+ READ_NODE_FIELD(indexqualorig);
+ READ_NODE_FIELD(indexorderby);
+ READ_NODE_FIELD(indexorderbyorig);
+ READ_ENUM_FIELD(indexorderdir, ScanDirection);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readIndexOnlyScan
+ */
+static IndexOnlyScan *
+_readIndexOnlyScan(void)
+{
+ READ_SCAN_FIELDS(IndexOnlyScan);
+
+ if (portable_input)
+ READ_RELID_FIELD(indexid);
+ else
+ READ_OID_FIELD(indexid);
+ READ_NODE_FIELD(indexqual);
+ READ_NODE_FIELD(indexorderby);
+ READ_NODE_FIELD(indextlist);
+ READ_ENUM_FIELD(indexorderdir, ScanDirection);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readBitmapIndexScan
+ */
+static BitmapIndexScan *
+_readBitmapIndexScan(void)
+{
+ READ_SCAN_FIELDS(BitmapIndexScan);
+
+ if (portable_input)
+ READ_RELID_FIELD(indexid);
+ else
+ READ_OID_FIELD(indexid);
+ READ_NODE_FIELD(indexqual);
+ READ_NODE_FIELD(indexqualorig);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readBitmapHeapScan
+ */
+static BitmapHeapScan *
+_readBitmapHeapScan(void)
+{
+ READ_SCAN_FIELDS(BitmapHeapScan);
+
+ READ_NODE_FIELD(bitmapqualorig);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readTidScan
+ */
+static TidScan *
+_readTidScan(void)
+{
+ READ_SCAN_FIELDS(TidScan);
+
+ READ_NODE_FIELD(tidquals);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readSubqueryScan
+ */
+static SubqueryScan *
+_readSubqueryScan(void)
+{
+ READ_SCAN_FIELDS(SubqueryScan);
+
+ READ_NODE_FIELD(subplan);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readFunctionScan
+ */
+static FunctionScan *
+_readFunctionScan(void)
+{
+ READ_SCAN_FIELDS(FunctionScan);
+
+ READ_NODE_FIELD(funcexpr);
+ READ_NODE_FIELD(funccolnames);
+ READ_NODE_FIELD(funccoltypes);
+ READ_NODE_FIELD(funccoltypmods);
+ READ_NODE_FIELD(funccolcollations);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readValuesScan
+ */
+static ValuesScan *
+_readValuesScan(void)
+{
+ READ_SCAN_FIELDS(ValuesScan);
+
+ READ_NODE_FIELD(values_lists);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readCteScan
+ */
+static CteScan *
+_readCteScan(void)
+{
+ READ_SCAN_FIELDS(CteScan);
+
+ READ_INT_FIELD(ctePlanId);
+ READ_INT_FIELD(cteParam);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readWorkTableScan
+ */
+static WorkTableScan *
+_readWorkTableScan(void)
+{
+ READ_SCAN_FIELDS(WorkTableScan);
+
+ READ_INT_FIELD(wtParam);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readJoin
+ */
+static Join *
+_readJoin(void)
+{
+ READ_JOIN_FIELDS(Join);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readNestLoop
+ */
+static NestLoop *
+_readNestLoop(void)
+{
+ READ_JOIN_FIELDS(NestLoop);
+
+ READ_NODE_FIELD(nestParams);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readMergeJoin
+ */
+static MergeJoin *
+_readMergeJoin(void)
+{
+ int numCols;
+ int i;
+ READ_JOIN_FIELDS(MergeJoin);
+
+ READ_NODE_FIELD(mergeclauses);
+ numCols = list_length(local_node->mergeclauses);
+
+
+ token = pg_strtok(&length); /* skip :mergeFamilies */
+ local_node->mergeFamilies = (Oid *) palloc(numCols * sizeof(Oid));
+ for (i = 0; i < numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->mergeFamilies[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :mergeCollations */
+ local_node->mergeCollations = (Oid *) palloc(numCols * sizeof(Oid));
+ for (i = 0; i < numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *collname; /* collation name */
+ int collencoding; /* collation encoding */
+ /* the token is already read */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collname */
+ collname = nullable_string(token, length);
+			token = pg_strtok(&length); /* get collencoding */
+ collencoding = atoi(token);
+ if (collname)
+ local_node->mergeCollations[i] = get_collid(collname,
+ collencoding,
+ NSP_OID(nspname));
+ else
+ local_node->mergeCollations[i] = InvalidOid;
+ }
+ else
+ local_node->mergeCollations[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :mergeStrategies */
+ local_node->mergeStrategies = (int *) palloc(numCols * sizeof(int));
+ for (i = 0; i < numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->mergeStrategies[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :mergeNullsFirst */
+ local_node->mergeNullsFirst = (bool *) palloc(numCols * sizeof(bool));
+ for (i = 0; i < numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->mergeNullsFirst[i] = strtobool(token);
+ }
+
+ READ_DONE();
+}
+
+
+/*
+ * _readHashJoin
+ */
+static HashJoin *
+_readHashJoin(void)
+{
+ READ_JOIN_FIELDS(HashJoin);
+
+ READ_NODE_FIELD(hashclauses);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readMaterial
+ */
+static Material *
+_readMaterial(void)
+{
+ READ_PLAN_FIELDS(Material);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readSort
+ */
+static Sort *
+_readSort(void)
+{
+ int i;
+ READ_PLAN_FIELDS(Sort);
+
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :sortColIdx */
+ local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->sortColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :sortOperators */
+ local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->sortOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->sortOperators[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :collations */
+ local_node->collations = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *collname; /* collation name */
+ int collencoding; /* collation encoding */
+ /* the token is already read */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collname */
+ collname = nullable_string(token, length);
+			token = pg_strtok(&length); /* get collencoding */
+ collencoding = atoi(token);
+ if (collname)
+ local_node->collations[i] = get_collid(collname,
+ collencoding,
+ NSP_OID(nspname));
+ else
+ local_node->collations[i] = InvalidOid;
+ }
+ else
+ local_node->collations[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :nullsFirst */
+ local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->nullsFirst[i] = strtobool(token);
+ }
+
+ READ_DONE();
+}
+
+
+/*
+ * _readGroup
+ */
+static Group *
+_readGroup(void)
+{
+ int i;
+ READ_PLAN_FIELDS(Group);
+
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :grpColIdx */
+ local_node->grpColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->grpColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :grpOperators */
+ local_node->grpOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->grpOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->grpOperators[i] = atooid(token);
+ }
+
+ READ_DONE();
+}
+
+
+/*
+ * _readAgg
+ *
+ * Deserialize an Agg plan node from the pg_strtok token stream.
+ * Reads the common Plan fields, the aggregation strategy/distribution,
+ * then the grouping column indexes and grouping operators.  When
+ * portable_input is set the operators are transmitted as qualified
+ * names (namespace, operator, left/right argument types) and resolved
+ * to local OIDs, so plans can be shipped between nodes whose catalogs
+ * assign different OIDs; otherwise raw OIDs are read directly.
+ */
+static Agg *
+_readAgg(void)
+{
+ int i;
+ READ_PLAN_FIELDS(Agg);
+
+ READ_ENUM_FIELD(aggstrategy, AggStrategy);
+ READ_ENUM_FIELD(aggdistribution, AggDistribution);
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :grpColIdx */
+ local_node->grpColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->grpColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :grpOperators */
+ local_node->grpOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ /* Resolve the operator by qualified name (cross-node form) */
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ /* A NULL type name means the operator is unary on that side */
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->grpOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->grpOperators[i] = atooid(token);
+ }
+
+ READ_LONG_FIELD(numGroups);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readWindowAgg
+ *
+ * Deserialize a WindowAgg plan node from the pg_strtok token stream.
+ * Reads the window reference, then the PARTITION BY column indexes and
+ * equality operators, the ORDER BY column indexes and ordering
+ * operators, and finally the frame options and offset expressions.
+ * In portable_input mode each operator is transmitted as qualified
+ * names and resolved to a local OID via get_operid(); otherwise raw
+ * OIDs are read.
+ */
+static WindowAgg *
+_readWindowAgg(void)
+{
+ int i;
+ READ_PLAN_FIELDS(WindowAgg);
+
+ READ_INT_FIELD(winref);
+ READ_INT_FIELD(partNumCols);
+
+ token = pg_strtok(&length); /* skip :partColIdx */
+ local_node->partColIdx = (AttrNumber *) palloc(local_node->partNumCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->partNumCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->partColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :partOperators */
+ local_node->partOperators = (Oid *) palloc(local_node->partNumCols * sizeof(Oid));
+ for (i = 0; i < local_node->partNumCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ /* Resolve partition operator by qualified name (cross-node form) */
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ /* A NULL type name means no argument on that side */
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->partOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->partOperators[i] = atooid(token);
+ }
+
+ READ_INT_FIELD(ordNumCols);
+
+ token = pg_strtok(&length); /* skip :ordColIdx */
+ local_node->ordColIdx = (AttrNumber *) palloc(local_node->ordNumCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->ordNumCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->ordColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :ordOperators */
+ local_node->ordOperators = (Oid *) palloc(local_node->ordNumCols * sizeof(Oid));
+ for (i = 0; i < local_node->ordNumCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ /* Resolve ordering operator by qualified name (cross-node form) */
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ /* A NULL type name means no argument on that side */
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->ordOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->ordOperators[i] = atooid(token);
+ }
+
+ READ_INT_FIELD(frameOptions);
+ READ_NODE_FIELD(startOffset);
+ READ_NODE_FIELD(endOffset);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readUnique
+ *
+ * Deserialize a Unique plan node from the pg_strtok token stream.
+ * Reads the number of distinct-on columns, their attribute numbers and
+ * their equality operators.  In portable_input mode each operator is
+ * transmitted as qualified names and resolved locally with get_operid();
+ * otherwise raw OIDs are read.
+ */
+static Unique *
+_readUnique(void)
+{
+ int i;
+ READ_PLAN_FIELDS(Unique);
+
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :uniqColIdx */
+ local_node->uniqColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->uniqColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :uniqOperators */
+ local_node->uniqOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ /* Resolve operator by qualified name (cross-node form) */
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ /* A NULL type name means no argument on that side */
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->uniqOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->uniqOperators[i] = atooid(token);
+ }
+
+ READ_DONE();
+}
+
+
+/*
+ * _readHash
+ *
+ * Deserialize a Hash plan node from the pg_strtok token stream.
+ * The skew-optimization fields reference a table and a type: in
+ * portable_input mode those are read as qualified names and resolved
+ * to local OIDs (READ_RELID_FIELD / READ_TYPID_FIELD); otherwise the
+ * raw OIDs are read directly.
+ */
+static Hash *
+_readHash(void)
+{
+ READ_PLAN_FIELDS(Hash);
+
+ if (portable_input)
+ READ_RELID_FIELD(skewTable);
+ else
+ READ_OID_FIELD(skewTable);
+ READ_INT_FIELD(skewColumn);
+ READ_BOOL_FIELD(skewInherit);
+ if (portable_input)
+ READ_TYPID_FIELD(skewColType);
+ else
+ READ_OID_FIELD(skewColType);
+ READ_INT_FIELD(skewColTypmod);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readSetOp
+ *
+ * Deserialize a SetOp plan node (INTERSECT/EXCEPT processing) from the
+ * pg_strtok token stream: command, strategy, duplicate-check columns
+ * and their equality operators, plus the flag column bookkeeping.
+ *
+ * NOTE(review): unlike _readAgg/_readUnique/_readWindowAgg, the
+ * dupOperators loop has no portable_input branch — operator OIDs are
+ * always read raw, which looks unsafe if this node is ever shipped to
+ * a node with different catalog OIDs.  Confirm whether SetOp plans are
+ * never serialized in portable form, or whether this is an omission.
+ */
+static SetOp *
+_readSetOp(void)
+{
+ int i;
+ READ_PLAN_FIELDS(SetOp);
+
+ READ_ENUM_FIELD(cmd, SetOpCmd);
+ READ_ENUM_FIELD(strategy, SetOpStrategy);
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :dupColIdx */
+ local_node->dupColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->dupColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :dupOperators */
+ local_node->dupOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->dupOperators[i] = atooid(token);
+ }
+
+ READ_INT_FIELD(flagColIdx);
+ READ_INT_FIELD(firstFlag);
+ READ_LONG_FIELD(numGroups);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readLimit
+ *
+ * Deserialize a Limit plan node from the pg_strtok token stream:
+ * just the offset and count expression subtrees on top of the
+ * common Plan fields.
+ */
+static Limit *
+_readLimit(void)
+{
+ READ_PLAN_FIELDS(Limit);
+
+ READ_NODE_FIELD(limitOffset);
+ READ_NODE_FIELD(limitCount);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readRemoteSubplan
+ *
+ * Deserialize a RemoteSubplan node (XCP distributed execution) from the
+ * pg_strtok token stream: the result distribution (type, key, node
+ * lists), whether to execute on all nodes, an optional merge-sort
+ * specification, the cursor name identifying the remote portal, and
+ * the unique flag.
+ */
+static RemoteSubplan *
+_readRemoteSubplan(void)
+{
+ READ_SCAN_FIELDS(RemoteSubplan);
+
+ READ_CHAR_FIELD(distributionType);
+ READ_INT_FIELD(distributionKey);
+ READ_NODE_FIELD(distributionNodes);
+ READ_NODE_FIELD(distributionRestrict);
+ READ_NODE_FIELD(nodeList);
+ READ_BOOL_FIELD(execOnAll);
+ READ_NODE_FIELD(sort);
+ READ_STRING_FIELD(cursor);
+ READ_INT_FIELD(unique);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readRemoteStmt
+ *
+ * Deserialize a RemoteStmt node — a complete statement shipped to a
+ * remote node for execution.  Reads the command type, plan tree, range
+ * table and subplans, then the array of remote parameter descriptors
+ * (kind, id, type), and finally the row marks and result distribution.
+ * In portable_input mode each parameter's data type arrives as
+ * (namespace, type name) and is resolved to a local OID; otherwise the
+ * raw type OID is read.
+ */
+static RemoteStmt *
+_readRemoteStmt(void)
+{
+ int i;
+ READ_LOCALS(RemoteStmt);
+
+ READ_ENUM_FIELD(commandType, CmdType);
+ READ_BOOL_FIELD(hasReturning);
+ READ_NODE_FIELD(planTree);
+ READ_NODE_FIELD(rtable);
+ READ_NODE_FIELD(resultRelations);
+ READ_NODE_FIELD(subplans);
+ READ_INT_FIELD(nParamExec);
+ READ_INT_FIELD(nParamRemote);
+ if (local_node->nParamRemote > 0)
+ {
+ local_node->remoteparams = (RemoteParam *) palloc(
+ local_node->nParamRemote * sizeof(RemoteParam));
+ for (i = 0; i < local_node->nParamRemote; i++)
+ {
+ RemoteParam *rparam = &(local_node->remoteparams[i]);
+ token = pg_strtok(&length); /* skip :paramkind */
+ token = pg_strtok(&length);
+ rparam->paramkind = (ParamKind) atoi(token);
+
+ token = pg_strtok(&length); /* skip :paramid */
+ token = pg_strtok(&length);
+ rparam->paramid = atoi(token);
+
+ token = pg_strtok(&length); /* skip :paramtype */
+ if (portable_input)
+ {
+ /* Resolve parameter type by qualified name (cross-node form) */
+ char *nspname; /* namespace name */
+ char *typname; /* data type name */
+ token = pg_strtok(&length); /* get nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get typname */
+ typname = nullable_string(token, length);
+ if (typname)
+ rparam->paramtype = get_typname_typid(typname,
+ NSP_OID(nspname));
+ else
+ rparam->paramtype = InvalidOid;
+ }
+ else
+ {
+ token = pg_strtok(&length);
+ rparam->paramtype = atooid(token);
+ }
+ }
+ }
+ else
+ local_node->remoteparams = NULL;
+
+ READ_NODE_FIELD(rowMarks);
+ READ_CHAR_FIELD(distributionType);
+ READ_INT_FIELD(distributionKey);
+ READ_NODE_FIELD(distributionNodes);
+ READ_NODE_FIELD(distributionRestrict);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readSimpleSort
+ *
+ * Deserialize a SimpleSort node — the sort specification attached to a
+ * RemoteSubplan for merge-sorting remote streams.  Reads, per sort
+ * column: the attribute number, the sort operator, the collation, and
+ * the nulls-first flag.  In portable_input mode operators arrive as
+ * qualified (namespace, name, argument types) and collations as
+ * (namespace, name, encoding), both resolved to local OIDs; otherwise
+ * raw OIDs are read.
+ */
+static SimpleSort *
+_readSimpleSort(void)
+{
+ int i;
+ READ_LOCALS(SimpleSort);
+
+ READ_INT_FIELD(numCols);
+
+ token = pg_strtok(&length); /* skip :sortColIdx */
+ local_node->sortColIdx = (AttrNumber *) palloc(local_node->numCols * sizeof(AttrNumber));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->sortColIdx[i] = atoi(token);
+ }
+
+ token = pg_strtok(&length); /* skip :sortOperators */
+ local_node->sortOperators = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ /* Resolve sort operator by qualified name (cross-node form) */
+ char *nspname; /* namespace name */
+ char *oprname; /* operator name */
+ char *leftnspname; /* left type namespace */
+ char *leftname; /* left type name */
+ Oid oprleft; /* left type */
+ char *rightnspname; /* right type namespace */
+ char *rightname; /* right type name */
+ Oid oprright; /* right type */
+ /* token is already set to nspname */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get operator name */
+ oprname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type namespace */
+ leftnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* left type name */
+ leftname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type namespace */
+ rightnspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* right type name */
+ rightname = nullable_string(token, length);
+ /* A NULL type name means no argument on that side */
+ if (leftname)
+ oprleft = get_typname_typid(leftname,
+ NSP_OID(leftnspname));
+ else
+ oprleft = InvalidOid;
+ if (rightname)
+ oprright = get_typname_typid(rightname,
+ NSP_OID(rightnspname));
+ else
+ oprright = InvalidOid;
+ local_node->sortOperators[i] = get_operid(oprname,
+ oprleft,
+ oprright,
+ NSP_OID(nspname));
+ }
+ else
+ local_node->sortOperators[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :sortCollations */
+ local_node->sortCollations = (Oid *) palloc(local_node->numCols * sizeof(Oid));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ if (portable_input)
+ {
+ /* Resolve collation by (namespace, name, encoding) */
+ char *nspname; /* namespace name */
+ char *collname; /* collation name */
+ int collencoding; /* collation encoding */
+ /* the token is already read */
+ nspname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collname */
+ collname = nullable_string(token, length);
+ token = pg_strtok(&length); /* get collation encoding */
+ collencoding = atoi(token);
+ if (collname)
+ local_node->sortCollations[i] = get_collid(collname,
+ collencoding,
+ NSP_OID(nspname));
+ else
+ local_node->sortCollations[i] = InvalidOid;
+ }
+ else
+ local_node->sortCollations[i] = atooid(token);
+ }
+
+ token = pg_strtok(&length); /* skip :nullsFirst */
+ local_node->nullsFirst = (bool *) palloc(local_node->numCols * sizeof(bool));
+ for (i = 0; i < local_node->numCols; i++)
+ {
+ token = pg_strtok(&length);
+ local_node->nullsFirst[i] = strtobool(token);
+ }
+
+ READ_DONE();
+}
+
+
+/*
+ * _readNestLoopParam
+ *
+ * Deserialize a NestLoopParam node from the pg_strtok token stream:
+ * the exec-parameter number and the outer-relation value expression
+ * that supplies it.
+ */
+static NestLoopParam *
+_readNestLoopParam(void)
+{
+ READ_LOCALS(NestLoopParam);
+
+ READ_INT_FIELD(paramno);
+ READ_NODE_FIELD(paramval);
+
+ READ_DONE();
+}
+
+
+/*
+ * _readPlanRowMark
+ *
+ * Deserialize a PlanRowMark node from the pg_strtok token stream:
+ * the range-table indexes (current and parent), the rowmark identity,
+ * the mark type, and the noWait/isParent flags.
+ */
+static PlanRowMark *
+_readPlanRowMark(void)
+{
+ READ_LOCALS(PlanRowMark);
+
+ READ_UINT_FIELD(rti);
+ READ_UINT_FIELD(prti);
+ READ_UINT_FIELD(rowmarkId);
+ READ_ENUM_FIELD(markType, RowMarkType);
+ READ_BOOL_FIELD(noWait);
+ READ_BOOL_FIELD(isParent);
+
+ READ_DONE();
+}
+
+/*
+ * _readLockRows
+ *
+ * Deserialize a LockRows plan node from the pg_strtok token stream:
+ * the list of row marks to acquire and the EvalPlanQual parameter ID.
+ */
+static LockRows *
+_readLockRows(void)
+{
+ READ_PLAN_FIELDS(LockRows);
+
+ READ_NODE_FIELD(rowMarks);
+ READ_INT_FIELD(epqParam);
+
+ READ_DONE();
+}
+
+#endif /* XCP */
+
+
/*
* parseNodeString
*
@@ -1314,6 +3295,10 @@ parseNodeString(void)
return_value = _readBoolExpr();
else if (MATCH("SUBLINK", 7))
return_value = _readSubLink();
+#ifdef XCP
+ else if (MATCH("SUBPLAN", 7))
+ return_value = _readSubPlan();
+#endif
else if (MATCH("FIELDSELECT", 11))
return_value = _readFieldSelect();
else if (MATCH("FIELDSTORE", 10))
@@ -1372,6 +3357,86 @@ parseNodeString(void)
return_value = _readNotifyStmt();
else if (MATCH("DECLARECURSOR", 13))
return_value = _readDeclareCursorStmt();
+#ifdef XCP
+ else if (MATCH("PLAN", 4))
+ return_value = _readPlan();
+ else if (MATCH("RESULT", 6))
+ return_value = _readResult();
+ else if (MATCH("MODIFYTABLE", 11))
+ return_value = _readModifyTable();
+ else if (MATCH("APPEND", 6))
+ return_value = _readAppend();
+ else if (MATCH("MERGEAPPEND", 11))
+ return_value = _readMergeAppend();
+ else if (MATCH("RECURSIVEUNION", 14))
+ return_value = _readRecursiveUnion();
+ else if (MATCH("BITMAPAND", 9))
+ return_value = _readBitmapAnd();
+ else if (MATCH("BITMAPOR", 8))
+ return_value = _readBitmapOr();
+ else if (MATCH("SCAN", 4))
+ return_value = _readScan();
+ else if (MATCH("SEQSCAN", 7))
+ return_value = _readSeqScan();
+ else if (MATCH("INDEXSCAN", 9))
+ return_value = _readIndexScan();
+ else if (MATCH("INDEXONLYSCAN", 13))
+ return_value = _readIndexOnlyScan();
+ else if (MATCH("BITMAPINDEXSCAN", 15))
+ return_value = _readBitmapIndexScan();
+ else if (MATCH("BITMAPHEAPSCAN", 14))
+ return_value = _readBitmapHeapScan();
+ else if (MATCH("TIDSCAN", 7))
+ return_value = _readTidScan();
+ else if (MATCH("SUBQUERYSCAN", 12))
+ return_value = _readSubqueryScan();
+ else if (MATCH("FUNCTIONSCAN", 12))
+ return_value = _readFunctionScan();
+ else if (MATCH("VALUESSCAN", 10))
+ return_value = _readValuesScan();
+ else if (MATCH("CTESCAN", 7))
+ return_value = _readCteScan();
+ else if (MATCH("WORKTABLESCAN", 13))
+ return_value = _readWorkTableScan();
+ else if (MATCH("JOIN", 4))
+ return_value = _readJoin();
+ else if (MATCH("NESTLOOP", 8))
+ return_value = _readNestLoop();
+ else if (MATCH("MERGEJOIN", 9))
+ return_value = _readMergeJoin();
+ else if (MATCH("HASHJOIN", 8))
+ return_value = _readHashJoin();
+ else if (MATCH("MATERIAL", 8))
+ return_value = _readMaterial();
+ else if (MATCH("SORT", 4))
+ return_value = _readSort();
+ else if (MATCH("GROUP", 5))
+ return_value = _readGroup();
+ else if (MATCH("AGG", 3))
+ return_value = _readAgg();
+ else if (MATCH("WINDOWAGG", 9))
+ return_value = _readWindowAgg();
+ else if (MATCH("UNIQUE", 6))
+ return_value = _readUnique();
+ else if (MATCH("HASH", 4))
+ return_value = _readHash();
+ else if (MATCH("SETOP", 5))
+ return_value = _readSetOp();
+ else if (MATCH("LIMIT", 5))
+ return_value = _readLimit();
+ else if (MATCH("REMOTESUBPLAN", 13))
+ return_value = _readRemoteSubplan();
+ else if (MATCH("REMOTESTMT", 10))
+ return_value = _readRemoteStmt();
+ else if (MATCH("SIMPLESORT", 10))
+ return_value = _readSimpleSort();
+ else if (MATCH("NESTLOOPPARAM", 13))
+ return_value = _readNestLoopParam();
+ else if (MATCH("PLANROWMARK", 11))
+ return_value = _readPlanRowMark();
+ else if (MATCH("LOCKROWS", 8))
+ return_value = _readLockRows();
+#endif
else
{
elog(ERROR, "badly formatted node string \"%.32s\"...", token);
@@ -1445,3 +3510,49 @@ readDatum(bool typbyval)
return res;
}
+
+#ifdef XCP
+/*
+ * scanDatum
+ *
+ * Recreate Datum from the text format understandable by the input function
+ * of the specified data type.
+ *
+ * Reads one token from the pg_strtok stream, looks up the type's input
+ * function via getTypeInputInfo(), and invokes it with the standard
+ * three-argument input-function convention (cstring value, typioparam,
+ * typmod).  Serves as the portable-input counterpart of readDatum():
+ * values cross nodes in text form rather than as raw binary Datums.
+ */
+static Datum
+scanDatum(Oid typid, int typmod)
+{
+ Oid typInput;
+ Oid typioparam;
+ FmgrInfo finfo;
+ FunctionCallInfoData fcinfo;
+ char *value;
+ Datum res;
+ READ_TEMP_LOCALS();
+
+ /* Get input function for the type */
+ getTypeInputInfo(typid, &typInput, &typioparam);
+ fmgr_info(typInput, &finfo);
+
+ /* Read the value */
+ token = pg_strtok(&length);
+ value = nullable_string(token, length);
+
+ /* The value can not be NULL, so we actually received empty string */
+ if (value == NULL)
+ value = "";
+
+ /* Invoke input function */
+ InitFunctionCallInfoData(fcinfo, &finfo, 3, InvalidOid, NULL, NULL);
+
+ fcinfo.arg[0] = CStringGetDatum(value);
+ fcinfo.arg[1] = ObjectIdGetDatum(typioparam);
+ fcinfo.arg[2] = Int32GetDatum(typmod);
+ fcinfo.argnull[0] = false;
+ fcinfo.argnull[1] = false;
+ fcinfo.argnull[2] = false;
+
+ res = FunctionCallInvoke(&fcinfo);
+
+ return res;
+}
+#endif
diff --git a/src/backend/optimizer/path/Makefile b/src/backend/optimizer/path/Makefile
index 0d9ffe58a7..07938dbe57 100644
--- a/src/backend/optimizer/path/Makefile
+++ b/src/backend/optimizer/path/Makefile
@@ -13,7 +13,6 @@ top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
OBJS = allpaths.o clausesel.o costsize.o equivclass.o indxpath.o \
- joinpath.o joinrels.o orindxpath.o pathkeys.o tidpath.o \
- pgxcpath.o
+ joinpath.o joinrels.o orindxpath.o pathkeys.o tidpath.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index 1331de75fc..525a659007 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -3,6 +3,11 @@
* allpaths.c
* Routines to find possible search paths for processing a query
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -36,6 +41,16 @@
#include "optimizer/var.h"
#include "parser/parse_clause.h"
#include "parser/parsetree.h"
+#ifdef PGXC
+#ifdef XCP
+#include "nodes/makefuncs.h"
+#include "miscadmin.h"
+#else
+#include "catalog/pg_namespace.h"
+#include "catalog/pg_class.h"
+#include "pgxc/pgxc.h"
+#endif /* XCP */
+#endif /* PGXC */
#include "rewrite/rewriteManip.h"
#include "utils/lsyscache.h"
@@ -378,9 +393,22 @@ static void
set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
{
#ifdef PGXC
- if (!create_plainrel_rqpath(root, rel, rte))
+#ifndef XCP
+ /*
+ * If we are on the Coordinator, we always want to use
+ * the remote query path unless it is a pg_catalog table
+ * or a sequence relation.
+ */
+ if (IS_PGXC_COORDINATOR &&
+ !IsConnFromCoord() &&
+ get_rel_namespace(rte->relid) != PG_CATALOG_NAMESPACE &&
+ get_rel_relkind(rte->relid) != RELKIND_SEQUENCE &&
+ !root->parse->is_local)
+ add_path(rel, create_remotequery_path(root, rel));
+ else
{
-#endif
+#endif /* XCP */
+#endif /* PGXC */
/* Consider sequential scan */
add_path(rel, create_seqscan_path(root, rel, NULL));
@@ -391,8 +419,10 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
/* Consider TID scans */
create_tidscan_paths(root, rel);
#ifdef PGXC
+#ifndef XCP
}
-#endif
+#endif /* XCP */
+#endif /* PGXC */
/* Now find the cheapest of the paths for this rel */
set_cheapest(rel);
@@ -1035,6 +1065,9 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel,
double tuple_fraction;
PlannerInfo *subroot;
List *pathkeys;
+#ifdef XCP
+ Distribution *distribution;
+#endif
/*
* Must copy the Query so that planning doesn't mess up the RTE contents
@@ -1144,7 +1177,53 @@ set_subquery_pathlist(PlannerInfo *root, RelOptInfo *rel,
pathkeys = convert_subquery_pathkeys(root, rel, subroot->query_pathkeys);
/* Generate appropriate path */
+#ifdef XCP
+ if (subroot->distribution && subroot->distribution->distributionExpr)
+ {
+ ListCell *lc;
+ /*
+ * The distribution expression from the subplan's tlist, but it should
+ * be from the rel, need conversion.
+ */
+ distribution = makeNode(Distribution);
+ distribution->distributionType = subroot->distribution->distributionType;
+ distribution->nodes = bms_copy(subroot->distribution->nodes);
+ distribution->restrictNodes = bms_copy(subroot->distribution->restrictNodes);
+ foreach(lc, rel->subplan->targetlist)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(lc);
+ if (equal(tle->expr, subroot->distribution->distributionExpr))
+ {
+ distribution->distributionExpr = (Node *)
+ makeVarFromTargetEntry(rel->relid, tle);
+ break;
+ }
+ }
+ }
+ else
+ distribution = subroot->distribution;
+ add_path(rel, create_subqueryscan_path(root, rel, pathkeys, NULL,
+ distribution));
+
+ /*
+ * Temporarily block ORDER BY in subqueries until we can add support
+ * it in Postgres-XL without outputting incorrect results. Should
+ * do this only in normal processing mode though!
+ *
+ * The extra conditions below try to handle cases where an ORDER BY
+ * appears in a simple VIEW or INSERT SELECT.
+ */
+ if (IsUnderPostmaster &&
+ list_length(subquery->sortClause) > 1
+ && (subroot->parent_root != root
+ || (subroot->parent_root == root
+ && (root->parse->commandType != CMD_SELECT
+ || (root->parse->commandType == CMD_SELECT
+ && root->parse->hasWindowFuncs)))))
+ elog(ERROR, "Postgres-XL does not currently support ORDER BY in subqueries");
+#else
add_path(rel, create_subqueryscan_path(root, rel, pathkeys, NULL));
+#endif
/* Select cheapest path (pretty easy in this case...) */
set_cheapest(rel);
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 0f5f72d9a3..ba71c15594 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -57,6 +57,11 @@
* values.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -97,6 +102,10 @@ double random_page_cost = DEFAULT_RANDOM_PAGE_COST;
double cpu_tuple_cost = DEFAULT_CPU_TUPLE_COST;
double cpu_index_tuple_cost = DEFAULT_CPU_INDEX_TUPLE_COST;
double cpu_operator_cost = DEFAULT_CPU_OPERATOR_COST;
+#ifdef XCP
+double network_byte_cost = DEFAULT_NETWORK_BYTE_COST;
+double remote_query_cost = DEFAULT_REMOTE_QUERY_COST;
+#endif
int effective_cache_size = DEFAULT_EFFECTIVE_CACHE_SIZE;
@@ -114,11 +123,8 @@ bool enable_material = true;
bool enable_mergejoin = true;
bool enable_hashjoin = true;
#ifdef PGXC
-bool enable_fast_query_shipping = true;
bool enable_remotejoin = true;
bool enable_remotegroup = true;
-bool enable_remotesort = true;
-bool enable_remotelimit = true;
#endif
typedef struct
@@ -2242,6 +2248,15 @@ final_cost_mergejoin(PlannerInfo *root, MergePath *path,
relation_byte_size(inner_path_rows, inner_path->parent->width) >
(work_mem * 1024L))
path->materialize_inner = true;
+#ifdef XCP
+ /*
+ * Even if innersortkeys are specified, we never add the Sort node on top
+ * of RemoteSubplan, instead we set up internal sorter.
+ * Since RemoteSubplan does not support mark/restore we must materialize it
+ */
+ else if (inner_path->pathtype == T_RemoteSubplan)
+ path->materialize_inner = true;
+#endif
else
path->materialize_inner = false;
@@ -2850,22 +2865,6 @@ cost_rescan(PlannerInfo *root, Path *path,
}
}
-#ifdef PGXC
-/*
- * cost_remotequery
- * As of now the function just sets the costs to 0 to make this path the
- * cheapest.
- * PGXC_TODO: Ideally, we should estimate the costs of network transfer from
- * datanodes and any datanode costs involved.
- */
-void
-cost_remotequery(RemoteQueryPath *rqpath, PlannerInfo *root, RelOptInfo *rel)
-{
- rqpath->path.startup_cost = 0;
- rqpath->path.total_cost = 0;
- rqpath->path.rows = rel->rows;
-}
-#endif /* PGXC */
/*
* cost_qual_eval
@@ -4032,3 +4031,30 @@ page_size(double tuples, int width)
{
return ceil(relation_byte_size(tuples, width) / BLCKSZ);
}
+
+
+#ifdef XCP
+/*
+ * cost_remote_subplan
+ *
+ * Estimate the cost of a RemoteSubplan on top of a subpath with the
+ * given input costs.  Startup pays a flat remote_query_cost for
+ * establishing remote execution; the run cost adds per-tuple
+ * bookkeeping (2x cpu_operator_cost) plus network transfer charged at
+ * network_byte_cost per byte, scaled by 'replication' (the number of
+ * nodes each tuple is sent to).
+ */
+void
+cost_remote_subplan(Path *path,
+ Cost input_startup_cost, Cost input_total_cost,
+ double tuples, int width, int replication)
+{
+ Cost startup_cost = input_startup_cost + remote_query_cost;
+ Cost run_cost = input_total_cost - input_startup_cost;
+
+ path->rows = tuples;
+
+ /*
+ * Charge 2x cpu_operator_cost per tuple to reflect bookkeeping overhead.
+ */
+ run_cost += 2 * cpu_operator_cost * tuples;
+
+ /*
+ * Estimate cost of sending data over network
+ */
+ run_cost += network_byte_cost * tuples * width * replication;
+
+ path->startup_cost = startup_cost;
+ path->total_cost = startup_cost + run_cost;
+}
+#endif
diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c
index 0463ec92b4..65f86194e1 100644
--- a/src/backend/optimizer/path/joinpath.c
+++ b/src/backend/optimizer/path/joinpath.c
@@ -196,16 +196,6 @@ add_paths_to_joinrel(PlannerInfo *root,
hash_inner_and_outer(root, joinrel, outerrel, innerrel,
restrictlist, jointype,
sjinfo, &semifactors, param_source_rels);
-
-#ifdef PGXC
- /*
- * If the inner and outer relations have RemoteQuery paths, check if this
- * JOIN can be pushed to the data-nodes. If so, create a RemoteQuery path
- * corresponding to the this JOIN.
- */
- create_joinrel_rqpath(root, joinrel, outerrel, innerrel, restrictlist,
- jointype, sjinfo);
-#endif /* PGXC */
}
/*
@@ -1338,4 +1328,3 @@ select_mergejoin_clauses(PlannerInfo *root,
return result_list;
}
-
diff --git a/src/backend/optimizer/plan/Makefile b/src/backend/optimizer/plan/Makefile
index 759a669ef5..88a9f7ff8c 100644
--- a/src/backend/optimizer/plan/Makefile
+++ b/src/backend/optimizer/plan/Makefile
@@ -13,6 +13,6 @@ top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
OBJS = analyzejoins.o createplan.o initsplan.o planagg.o planmain.o planner.o \
- setrefs.o subselect.o pgxcplan.o
+ setrefs.o subselect.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 0c9c52e685..88a6ca1eae 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -5,6 +5,11 @@
* Planning is complete, we just need to convert the selected
* Path into a Plan.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -37,6 +42,29 @@
#include "optimizer/var.h"
#include "parser/parse_clause.h"
#include "parser/parsetree.h"
+#ifdef PGXC
+#include "access/gtm.h"
+#include "parser/parse_coerce.h"
+#include "pgxc/pgxc.h"
+#include "pgxc/planner.h"
+#include "pgxc/postgresql_fdw.h"
+#include "access/sysattr.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+#include "catalog/pg_proc.h"
+#include "catalog/pg_type.h"
+#include "executor/executor.h"
+#ifdef XCP
+#include "access/gtm.h"
+#include "catalog/pg_aggregate.h"
+#include "parser/parse_coerce.h"
+#else
+#include "rewrite/rewriteManip.h"
+#endif /* XCP */
+#include "commands/prepare.h"
+#include "commands/tablecmds.h"
+#endif /* PGXC */
#include "utils/lsyscache.h"
@@ -52,6 +80,13 @@ static Plan *create_merge_append_plan(PlannerInfo *root, MergeAppendPath *best_p
static Result *create_result_plan(PlannerInfo *root, ResultPath *best_path);
static Material *create_material_plan(PlannerInfo *root, MaterialPath *best_path);
static Plan *create_unique_plan(PlannerInfo *root, UniquePath *best_path);
+#ifdef XCP
+static void adjustSubplanDistribution(PlannerInfo *root, Distribution *pathd,
+ Distribution *subd);
+static RemoteSubplan *create_remotescan_plan(PlannerInfo *root,
+ RemoteSubPath *best_path);
+static char *get_internal_cursor(void);
+#endif
static SeqScan *create_seqscan_plan(PlannerInfo *root, Path *best_path,
List *tlist, List *scan_clauses);
static Scan *create_indexscan_plan(PlannerInfo *root, IndexPath *best_path,
@@ -73,6 +108,32 @@ static CteScan *create_ctescan_plan(PlannerInfo *root, Path *best_path,
List *tlist, List *scan_clauses);
static WorkTableScan *create_worktablescan_plan(PlannerInfo *root, Path *best_path,
List *tlist, List *scan_clauses);
+#ifdef PGXC
+#ifndef XCP
+static RowMarkClause *mk_row_mark_clause(PlanRowMark *prm);
+static bool compare_alias(Alias *a1, Alias *a2);
+static Plan *create_remotequery_plan(PlannerInfo *root, Path *best_path,
+ List *tlist, List *scan_clauses);
+static Plan *create_remotejoin_plan(PlannerInfo *root, JoinPath *best_path,
+ Plan *parent, Plan *outer_plan, Plan *inner_plan);
+static List *create_remote_target_list(PlannerInfo *root,
+ StringInfo targets, List *out_tlist, List *in_tlist,
+ char *out_alias, int out_index,
+ char *in_alias, int in_index);
+static Alias *generate_remote_rte_alias(RangeTblEntry *rte, int varno,
+ char *aliasname, int reduce_level);
+static void pgxc_locate_grouping_columns(PlannerInfo *root, List *tlist,
+ AttrNumber *grpColIdx);
+static List *pgxc_process_grouping_targetlist(PlannerInfo *root,
+ List **local_tlist);
+static List *pgxc_process_having_clause(PlannerInfo *root, List *remote_tlist,
+ Node *havingQual, List **local_qual,
+ List **remote_qual, bool *reduce_plan);
+static Expr *pgxc_set_en_expr(Oid tableoid, Index resultRelationIndex);
+static int pgxc_count_rowmarks_entries(List *rowMarks);
+static Oid *pgxc_build_rowmark_entries(List *rowMarks, List *rtable, Oid *types, int prepparams, int totparams);
+#endif /* XCP */
+#endif /* PGXC */
static ForeignScan *create_foreignscan_plan(PlannerInfo *root, ForeignPath *best_path,
List *tlist, List *scan_clauses);
static NestLoop *create_nestloop_plan(PlannerInfo *root, NestPath *best_path,
@@ -120,6 +181,12 @@ static CteScan *make_ctescan(List *qptlist, List *qpqual,
Index scanrelid, int ctePlanId, int cteParam);
static WorkTableScan *make_worktablescan(List *qptlist, List *qpqual,
Index scanrelid, int wtParam);
+#ifdef PGXC
+#ifndef XCP
+static RemoteQuery *make_remotequery(List *qptlist, List *qpqual,
+ Index scanrelid);
+#endif
+#endif
static BitmapAnd *make_bitmap_and(List *bitmapplans);
static BitmapOr *make_bitmap_or(List *bitmapplans);
static NestLoop *make_nestloop(List *tlist,
@@ -165,6 +232,22 @@ static EquivalenceMember *find_ec_member_for_tle(EquivalenceClass *ec,
Relids relids);
static Material *make_material(Plan *lefttree);
+#ifdef PGXC
+#ifndef XCP
+static void findReferencedVars(List *parent_vars, RemoteQuery *plan, List **out_tlist, Relids *out_relids);
+static void create_remote_clause_expr(PlannerInfo *root, Plan *parent, StringInfo clauses,
+ List *qual, RemoteQuery *scan);
+static void create_remote_expr(PlannerInfo *root, Plan *parent, StringInfo expr,
+ Node *node, RemoteQuery *scan);
+#endif /* XCP */
+#endif /* PGXC */
+
+#ifdef XCP
+static int add_sort_column(AttrNumber colIdx, Oid sortOp, Oid coll,
+ bool nulls_first,int numCols, AttrNumber *sortColIdx,
+ Oid *sortOperators, Oid *collations, bool *nullsFirst);
+#endif
+
/*
* create_plan
* Creates the access plan for a query by recursively processing the
@@ -188,6 +271,11 @@ create_plan(PlannerInfo *root, Path *best_path)
/* Initialize this module's private workspace in PlannerInfo */
root->curOuterRels = NULL;
root->curOuterParams = NIL;
+#ifdef XCP
+ root->curOuterRestrict = NULL;
+ adjustSubplanDistribution(root, root->distribution,
+ best_path->distribution);
+#endif
/* Recursively process the path tree */
plan = create_plan_recurse(root, best_path);
@@ -221,8 +309,19 @@ create_plan_recurse(PlannerInfo *root, Path *best_path)
case T_CteScan:
case T_WorkTableScan:
case T_ForeignScan:
+#ifdef PGXC
+#ifndef XCP
+ case T_RemoteQuery:
+#endif /* XCP */
+#endif /* PGXC */
plan = create_scan_plan(root, best_path);
break;
+#ifdef XCP
+ case T_RemoteSubplan:
+ plan = (Plan *) create_remotescan_plan(root,
+ (RemoteSubPath *) best_path);
+ break;
+#endif
case T_HashJoin:
case T_MergeJoin:
case T_NestLoop:
@@ -249,12 +348,6 @@ create_plan_recurse(PlannerInfo *root, Path *best_path)
plan = create_unique_plan(root,
(UniquePath *) best_path);
break;
-#ifdef PGXC
- case T_RemoteQuery:
- plan = create_remotequery_plan(root,
- (RemoteQueryPath *)best_path);
- break;
-#endif
default:
elog(ERROR, "unrecognized node type: %d",
(int) best_path->pathtype);
@@ -394,6 +487,19 @@ create_scan_plan(PlannerInfo *root, Path *best_path)
scan_clauses);
break;
+#ifdef PGXC
+#ifndef XCP
+ case T_RemoteQuery:
+ /* For RemoteQuery path always use relation tlist */
+ tlist = build_relation_tlist(rel);
+ plan = (Plan *) create_remotequery_plan(root,
+ best_path,
+ tlist,
+ scan_clauses);
+ break;
+#endif /* XCP */
+#endif /* PGXC */
+
case T_ForeignScan:
plan = (Plan *) create_foreignscan_plan(root,
(ForeignPath *) best_path,
@@ -644,9 +750,642 @@ create_join_plan(PlannerInfo *root, JoinPath *best_path)
list_concat(get_qpqual((Plan) plan),
get_actual_clauses(get_loc_restrictinfo(best_path))));
#endif
+
+#ifdef PGXC
+#ifndef XCP
+ /*
+ * Check if this join can be reduced to an equiv. remote scan node
+ * This can only be executed on a remote Coordinator
+ */
+ if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
+ plan = create_remotejoin_plan(root, best_path, plan, outer_plan, inner_plan);
+#endif /* XCP */
+#endif /* PGXC */
+
return plan;
}
+
+#ifdef PGXC
+#ifndef XCP
+/*
+ * create_remotejoin_plan
+ * check if the children plans involve remote entities from the same remote
+ * node. If so, this join can be reduced to an equivalent remote scan plan
+ * node
+ *
+ * RULES:
+ *
+ * * provide unique aliases to both inner and outer nodes to represent their
+ * corresponding subqueries
+ *
+ * * identify target entries from both inner and outer that appear in the join
+ * targetlist, only those need to be selected from these aliased subqueries
+ *
+ * * a join node has a joinqual list which represents the join condition. E.g.
+ * SELECT * from emp e LEFT JOIN emp2 d ON e.x = d.x
+ * Here the joinqual contains "e.x = d.x". If the joinqual itself has a local
+ * dependency, e.g "e.x = localfunc(d.x)", then this join cannot be reduced
+ *
+ * * other than the joinqual, the join node can contain additional quals. Even
+ * if they have any local dependencies, we can reduce the join and just
+ * append these quals into the reduced remote scan node. We DO do a pass to
+ * identify remote quals and ship those in the squery though
+ *
+ * * these quals (both joinqual and normal quals with no local dependencies)
+ * need to be converted into expressions referring to the aliases assigned to
+ * the nodes. These expressions will eventually become part of the squery of
+ * the reduced remote scan node
+ *
+ * * the children remote scan nodes themselves can have local dependencies in
+ * their quals (the remote ones are already part of the squery). We can still
+ * reduce the join and just append these quals into the reduced remote scan
+ * node
+ *
+ * * if we reached successfully so far, generate a new remote scan node with
+ * this new squery generated using the aliased references
+ *
+ * One important point to note here about targetlists is that this function
+ * does not set any DUMMY var references in the Var nodes appearing in it. It
+ * follows the standard mechanism as is followed by other nodes. Similar to the
+ * existing nodes, the references which point to DUMMY vars is done in
+ * set_remote_references() function in set_plan_references phase at the fag
+ * end. Avoiding such DUMMY references manipulations till the end also makes
+ * this code a lot more readable and easier.
+ */
+static Plan *
+create_remotejoin_plan(PlannerInfo *root, JoinPath *best_path, Plan *parent, Plan *outer_plan, Plan *inner_plan)
+{
+ NestLoop *nest_parent;
+ ExecNodes *join_exec_nodes;
+ RemoteQuery *outer;
+ RemoteQuery *inner;
+
+ if (!enable_remotejoin)
+ return parent;
+
+ /* meh, what are these for :( */
+ if (root->hasPseudoConstantQuals)
+ return parent;
+
+ /* do not optimize CURSOR based select statements */
+ if (root->parse->rowMarks != NIL)
+ return parent;
+
+ /*
+ * optimize only simple NestLoop joins for now. Other joins like Merge and
+ * Hash can be reduced too. But they involve additional intermediate nodes
+ * and we need to understand them a bit more as yet
+ */
+ if (!IsA(parent, NestLoop))
+ return parent;
+ else
+ nest_parent = (NestLoop *)parent;
+
+ if (!IsA(outer_plan, RemoteQuery) || !IsA(inner_plan, RemoteQuery))
+ return parent;
+
+ outer = (RemoteQuery *)outer_plan;
+ inner = (RemoteQuery *)inner_plan;
+
+ /* check if both the nodes qualify for reduction */
+ if (!outer->scan.plan.qual && !inner->scan.plan.qual)
+ {
+ int i;
+ List *rtable_list = NIL;
+ List *parent_vars, *out_tlist = NIL, *in_tlist = NIL, *base_tlist;
+ Relids out_relids = NULL, in_relids = NULL;
+
+ /*
+ * Check if both these plans are from the same remote node. If yes,
+ * replace this JOIN along with it's two children with one equivalent
+ * remote node
+ */
+
+ /*
+ * Build up rtable for XC Walker
+ * (was not sure I could trust this, but it seems to work in various cases)
+ */
+ for (i = 0; i < root->simple_rel_array_size; i++)
+ {
+ RangeTblEntry *rte = root->simple_rte_array[i];
+
+ /* Check for NULL first, sometimes it is NULL at position 0 */
+ if (rte)
+ rtable_list = lappend(rtable_list, root->simple_rte_array[i]);
+ }
+ /*
+ * Walk the left, right trees and identify which vars appear in the
+ * parent targetlist, only those need to be selected. Note that
+ * depending on whether the parent targetlist is top-level or
+ * intermediate, the children vars may or may not be referenced
+ * multiple times in it.
+ */
+ parent_vars = pull_var_clause((Node *)parent->targetlist,
+ PVC_RECURSE_AGGREGATES,
+ PVC_RECURSE_PLACEHOLDERS);
+
+ findReferencedVars(parent_vars, outer, &out_tlist, &out_relids);
+ findReferencedVars(parent_vars, inner, &in_tlist, &in_relids);
+
+ join_exec_nodes = IsJoinReducible(inner, outer, in_relids, out_relids,
+ &(nest_parent->join),
+ best_path, root->parse->rtable);
+ /* XXX Check if the join optimization is possible */
+ if (join_exec_nodes)
+ {
+ RemoteQuery *result;
+ Plan *result_plan;
+ StringInfoData targets, clauses, scan_clauses, fromlist, join_condition;
+ StringInfoData squery;
+ ListCell *l;
+ char in_alias[15], out_alias[15];
+ bool use_where = false;
+ Index dummy_rtindex;
+ RangeTblEntry *dummy_rte;
+ List *local_scan_clauses = NIL, *remote_scan_clauses = NIL;
+ char *pname;
+ List *colnames;
+
+
+ /* KISS! As long as distinct aliases are provided for all the objects
+ * involved in the query, remote server should not crib! */
+ sprintf(in_alias, "out_%d", root->rs_alias_index);
+ sprintf(out_alias, "in_%d", root->rs_alias_index);
+
+ /*
+ * If the JOIN ON clause has a local dependency then we cannot ship
+ * the join to the remote side at all, bail out immediately.
+ */
+ if (!pgxc_is_expr_shippable((Expr *)nest_parent->join.joinqual, NULL))
+ {
+ elog(DEBUG1, "cannot reduce: local dependencies in the joinqual");
+ return parent;
+ }
+
+ /*
+ * If the normal plan qual has local dependencies, the join can
+ * still be shipped. Try harder to ship remote clauses out of the
+ * entire list. These local quals will become part of the quals
+ * list of the reduced remote scan node down later.
+ */
+ if (!pgxc_is_expr_shippable((Expr *)nest_parent->join.plan.qual, NULL))
+ {
+ elog(DEBUG1, "local dependencies in the join plan qual");
+
+ /*
+ * trawl through each entry and come up with remote and local
+ * clauses... sigh
+ */
+ foreach(l, nest_parent->join.plan.qual)
+ {
+ Node *clause = lfirst(l);
+
+ /*
+ * if the currentof in the above call to
+ * clause_is_local_bound is set, somewhere in the list there
+ * is currentof clause, so keep that information intact and
+ * pass a dummy argument here.
+ */
+ if (!pgxc_is_expr_shippable((Expr *)clause, NULL))
+ local_scan_clauses = lappend(local_scan_clauses, clause);
+ else
+ remote_scan_clauses = lappend(remote_scan_clauses, clause);
+ }
+ }
+ else
+ {
+ /*
+ * there is no local bound clause, all the clauses are remote
+ * scan clauses
+ */
+ remote_scan_clauses = nest_parent->join.plan.qual;
+ }
+
+ /* generate the tlist for the new RemoteScan node using out_tlist, in_tlist */
+ initStringInfo(&targets);
+ colnames = create_remote_target_list(root, &targets, out_tlist, in_tlist,
+ out_alias, outer->reduce_level, in_alias, inner->reduce_level);
+
+ /*
+ * generate the fromlist now. The code has to appropriately mention
+ * the JOIN type in the string being generated.
+ */
+ initStringInfo(&fromlist);
+ appendStringInfo(&fromlist, " (%s) %s ",
+ outer->sql_statement, quote_identifier(out_alias));
+
+ use_where = false;
+ switch (nest_parent->join.jointype)
+ {
+ case JOIN_INNER:
+ pname = ", ";
+ use_where = true;
+ break;
+ case JOIN_LEFT:
+ pname = "LEFT JOIN";
+ break;
+ case JOIN_FULL:
+ pname = "FULL JOIN";
+ break;
+ case JOIN_RIGHT:
+ pname = "RIGHT JOIN";
+ break;
+ case JOIN_SEMI:
+ case JOIN_ANTI:
+ default:
+ return parent;
+ }
+
+ /*
+ * splendid! we can actually replace this join hierarchy with a
+ * single RemoteScan node now. Start off by constructing the
+ * appropriate new tlist and tupdescriptor
+ */
+ result = makeNode(RemoteQuery);
+
+ /*
+ * Save various information about the inner and the outer plans. We
+ * may need this information later if more entries are added to it
+ * as part of the remote expression optimization
+ */
+ result->read_only = true;
+ result->inner_alias = pstrdup(in_alias);
+ result->outer_alias = pstrdup(out_alias);
+ result->inner_reduce_level = inner->reduce_level;
+ result->outer_reduce_level = outer->reduce_level;
+ result->inner_relids = in_relids;
+ result->outer_relids = out_relids;
+ result->inner_statement = pstrdup(inner->sql_statement);
+ result->outer_statement = pstrdup(outer->sql_statement);
+ result->join_condition = NULL;
+ result->exec_nodes = join_exec_nodes;
+ result->is_temp = inner->is_temp || outer->is_temp;
+
+ appendStringInfo(&fromlist, " %s (%s) %s",
+ pname, inner->sql_statement, quote_identifier(in_alias));
+
+ /* generate join.joinqual remote clause string representation */
+ initStringInfo(&clauses);
+ if (nest_parent->join.joinqual != NIL)
+ {
+ create_remote_clause_expr(root, parent, &clauses,
+ nest_parent->join.joinqual, result);
+ }
+
+ /* generate join.plan.qual remote clause string representation */
+ initStringInfo(&scan_clauses);
+ if (remote_scan_clauses != NIL)
+ {
+ create_remote_clause_expr(root, parent, &scan_clauses,
+ remote_scan_clauses, result);
+ }
+
+ /*
+ * set the base tlist of the involved base relations, useful in
+ * set_plan_refs later. Additionally the tupledescs should be
+ * generated using this base_tlist and not the parent targetlist.
+ * This is because we want to take into account any additional
+ * column references from the scan clauses too
+ */
+ base_tlist = add_to_flat_tlist(NIL, list_concat(out_tlist, in_tlist));
+
+ /*
+ * Create and append the dummy range table entry to the range table.
+ * Note that this modifies the master copy the caller passed us, otherwise
+ * e.g EXPLAIN VERBOSE will fail to find the rte the Vars built below refer
+ * to.
+ */
+ dummy_rte = make_dummy_remote_rte("__REMOTE_JOIN_QUERY__",
+ makeAlias("__REMOTE_JOIN_QUERY__", colnames));
+ root->parse->rtable = lappend(root->parse->rtable, dummy_rte);
+ dummy_rtindex = list_length(root->parse->rtable);
+
+ result_plan = &result->scan.plan;
+
+ /* Set the join targetlist to the new base_tlist */
+ result_plan->targetlist = parent->targetlist;
+ result_plan->lefttree = NULL;
+ result_plan->righttree = NULL;
+ result->scan.scanrelid = dummy_rtindex;
+
+ /* generate the squery for this node */
+
+ /* NOTE: it's assumed that the remote_paramNums array is
+ * filled in the same order as we create the query here.
+ *
+ * TODO: we need some way to ensure that the remote_paramNums
+ * is filled in the same order as the order in which the clauses
+ * are added in the query below.
+ */
+ initStringInfo(&squery);
+ appendStringInfo(&squery, "SELECT %s FROM %s", targets.data, fromlist.data);
+
+ initStringInfo(&join_condition);
+ if (clauses.data[0] != '\0')
+ appendStringInfo(&join_condition, " %s %s", use_where? " WHERE " : " ON ", clauses.data);
+
+ if (scan_clauses.data[0] != '\0')
+ appendStringInfo(&join_condition, " %s %s", use_where? " AND " : " WHERE ", scan_clauses.data);
+
+ if (join_condition.data[0] != '\0')
+ appendStringInfoString(&squery, join_condition.data);
+
+ result->sql_statement = squery.data;
+ result->join_condition = join_condition.data;
+ /* don't forget to increment the index for the next time around! */
+ result->reduce_level = root->rs_alias_index++;
+
+
+ /* set_plan_refs needs this later */
+ result->base_tlist = base_tlist;
+
+ /*
+ * if there were any local scan clauses stick them up here. They
+ * can come from the join node or from remote scan node themselves.
+ * Because of the processing being done earlier in
+ * create_remotescan_plan, all of the clauses if present will be
+ * local ones and hence can be stuck without checking for
+ * remoteness again here into result_plan->qual
+ */
+ result_plan->qual = list_concat(result_plan->qual, outer_plan->qual);
+ result_plan->qual = list_concat(result_plan->qual, inner_plan->qual);
+ result_plan->qual = list_concat(result_plan->qual, local_scan_clauses);
+
+ /* we actually need not worry about costs since this is the final plan */
+ result_plan->startup_cost = outer_plan->startup_cost;
+ result_plan->total_cost = outer_plan->total_cost;
+ result_plan->plan_rows = outer_plan->plan_rows;
+ result_plan->plan_width = outer_plan->plan_width;
+
+ return (Plan *)result_plan;
+ }
+ }
+
+ return parent;
+}
+
+/*
+ * Generate aliases for columns of remote tables using the
+ * colname_varno_varattno_reduce_level nomenclature
+ */
+static Alias *
+generate_remote_rte_alias(RangeTblEntry *rte, int varno, char *aliasname, int reduce_level)
+{
+ int maxattrs;
+ int varattno;
+ List *colnames = NIL;
+ StringInfo attr = makeStringInfo();
+ Relation relation;
+
+ if (rte->rtekind != RTE_RELATION)
+ elog(ERROR, "called in improper context");
+
+ relation = heap_open(rte->relid, AccessShareLock);
+
+ maxattrs = RelationGetNumberOfAttributes(relation);
+
+ for (varattno = 0; varattno < maxattrs; varattno++)
+ {
+ char *attname = get_rte_attribute_name(rte, varattno + 1);
+
+ if (reduce_level == 0)
+ {
+ /*
+ * Even if reduce level is 0, we still need to copy column aliases
+ * from rte because we don't want to loose any user-supplied table
+ * column aliases, in case any.
+ */
+ colnames = lappend(colnames, makeString(pstrdup((attname))));
+ }
+ else
+ {
+ resetStringInfo(attr);
+ appendStringInfo(attr, "%s_%d_%d_%d",
+ attname, varno, varattno + 1, reduce_level);
+ colnames = lappend(colnames, makeString(pstrdup(attr->data)));
+ }
+
+ }
+
+ heap_close(relation, AccessShareLock);
+
+ return makeAlias(aliasname, colnames);
+}
+
+/* create_remote_target_list
+ * generate a targetlist using out_alias and in_alias appropriately. It is
+ * possible that in case of multiple-hierarchy reduction, both sides can have
+ * columns with the same name. E.g. consider the following:
+ *
+ * select * from emp e join emp f on e.x = f.x, emp g;
+ *
+ * So if we just use new_alias.columnname it can
+ * very easily clash with other columnname from the same side of an already
+ * reduced join. To avoid this, we generate unique column aliases using the
+ * following convention:
+ * colname_varno_varattno_reduce_level_index
+ *
+ * Each RemoteScan node carries it's reduce_level index to indicate the
+ * convention that should be adopted while referring to it's columns. If the
+ * level is 0, then normal column names can be used because they will never
+ * clash at the join level
+ */
+static List *
+create_remote_target_list(PlannerInfo *root, StringInfo targets, List *out_tlist, List *in_tlist,
+ char *out_alias, int out_index, char *in_alias, int in_index)
+{
+ int i = 0;
+ ListCell *l;
+ StringInfo attrname = makeStringInfo();
+ bool add_null_target = true;
+ List *colnames = NIL;
+
+ foreach(l, out_tlist)
+ {
+ Var *var = (Var *) lfirst(l);
+ RangeTblEntry *rte = planner_rt_fetch(var->varno, root);
+ char *attname;
+
+
+ if (i++ > 0)
+ appendStringInfo(targets, ", ");
+
+ attname = get_rte_attribute_name(rte, var->varattno);
+
+ if (out_index)
+ {
+ resetStringInfo(attrname);
+ /* varattno can be negative for sys attributes, hence the abs! */
+ appendStringInfo(attrname, "%s_%d_%d_%d",
+ attname, var->varno, abs(var->varattno), out_index);
+ appendStringInfo(targets, "%s.%s",
+ quote_identifier(out_alias), quote_identifier(attrname->data));
+ }
+ else
+ appendStringInfo(targets, "%s.%s",
+ quote_identifier(out_alias), quote_identifier(attname));
+
+ /* generate the new alias now using root->rs_alias_index */
+ resetStringInfo(attrname);
+ appendStringInfo(attrname, "%s_%d_%d_%d",
+ attname, var->varno, abs(var->varattno), root->rs_alias_index);
+ appendStringInfo(targets, " AS %s", quote_identifier(attrname->data));
+ colnames = lappend(colnames, makeString(pstrdup(attrname->data)));
+ add_null_target = false;
+ }
+
+ foreach(l, in_tlist)
+ {
+ Var *var = (Var *) lfirst(l);
+ RangeTblEntry *rte = planner_rt_fetch(var->varno, root);
+ char *attname;
+
+ if (i++ > 0)
+ appendStringInfo(targets, ", ");
+
+ attname = get_rte_attribute_name(rte, var->varattno);
+
+ if (in_index)
+ {
+ resetStringInfo(attrname);
+ /* varattno can be negative for sys attributes, hence the abs! */
+ appendStringInfo(attrname, "%s_%d_%d_%d",
+ attname, var->varno, abs(var->varattno), in_index);
+ appendStringInfo(targets, "%s.%s",
+ quote_identifier(in_alias), quote_identifier(attrname->data));
+ }
+ else
+ appendStringInfo(targets, "%s.%s",
+ quote_identifier(in_alias), quote_identifier(attname));
+
+ /* generate the new alias now using root->rs_alias_index */
+ resetStringInfo(attrname);
+ appendStringInfo(attrname, "%s_%d_%d_%d",
+ attname, var->varno, abs(var->varattno), root->rs_alias_index);
+ appendStringInfo(targets, " AS %s", quote_identifier(attrname->data));
+ colnames = lappend(colnames, makeString(pstrdup(attrname->data)));
+ add_null_target = false;
+ }
+
+ /*
+ * It's possible that in some cases, the targetlist might not refer to any
+ * vars from the joined relations, eg.
+ * select count(*) from t1, t2; select const from t1, t2; etc
+ * For such cases just add a NULL selection into this targetlist
+ */
+ if (add_null_target)
+ appendStringInfo(targets, " NULL ");
+ return colnames;
+}
+
+/*
+ * create_remote_clause_expr
+ * generate a string to represent the clause list expression using out_alias
+ * and in_alias references. This function does a cute hack by temporarily
+ * modifying the rte->eref entries of the involved relations to point to
+ * out_alias and in_alias appropriately. The deparse_expression call then
+ * generates a string using these erefs which is exactly what is desired here.
+ *
+ * Additionally it creates aliases for the column references based on the
+ * reduce_level values too. This handles the case when both sides have same
+ * named columns..
+ *
+ * Obviously this function restores the eref, alias values to their former selves
+ * appropriately too, after use
+ */
+static void
+create_remote_clause_expr(PlannerInfo *root, Plan *parent, StringInfo clauses,
+ List *qual, RemoteQuery *scan)
+{
+ Node *node = (Node *) make_ands_explicit(qual);
+
+ return create_remote_expr(root, parent, clauses, node, scan);
+}
+
+static void
+create_remote_expr(PlannerInfo *root, Plan *parent, StringInfo expr,
+ Node *node, RemoteQuery *scan)
+{
+ List *context;
+ List *leref = NIL;
+ ListCell *cell;
+ char *exprstr;
+ int rtindex;
+ Relids tmprelids, relids;
+
+ relids = pull_varnos((Node *)node);
+
+ tmprelids = bms_copy(relids);
+
+ while ((rtindex = bms_first_member(tmprelids)) >= 0)
+ {
+ RangeTblEntry *rte = planner_rt_fetch(rtindex, root);
+
+ /*
+ * This rtindex should be a member of either out_relids or
+ * in_relids and never both
+ */
+ if (bms_is_member(rtindex, scan->outer_relids) &&
+ bms_is_member(rtindex, scan->inner_relids))
+ elog(ERROR, "improper relid references in the join clause list");
+
+ /*
+ * save the current rte->eref and rte->alias values and stick in a new
+ * one in the rte with the proper inner or outer alias
+ */
+ leref = lappend(leref, rte->eref);
+ leref = lappend(leref, rte->alias);
+
+ if (bms_is_member(rtindex, scan->outer_relids))
+ {
+ rte->eref = makeAlias(scan->outer_alias, NIL);
+
+ /* attach proper column aliases.. */
+ rte->alias = generate_remote_rte_alias(rte, rtindex,
+ scan->outer_alias, scan->outer_reduce_level);
+ }
+ if (bms_is_member(rtindex, scan->inner_relids))
+ {
+ rte->eref = makeAlias(scan->inner_alias, NIL);
+
+ /* attach proper column aliases.. */
+ rte->alias = generate_remote_rte_alias(rte, rtindex,
+ scan->inner_alias, scan->inner_reduce_level);
+ }
+ }
+ bms_free(tmprelids);
+
+ /* Set up deparsing context */
+ context = deparse_context_for_plan((Node *) parent,
+ NULL,
+ root->parse->rtable);
+
+ exprstr = deparse_expression(node, context, true, false);
+
+ /* revert back the saved eref entries in the same order now! */
+ cell = list_head(leref);
+ tmprelids = bms_copy(relids);
+ while ((rtindex = bms_first_member(tmprelids)) >= 0)
+ {
+ RangeTblEntry *rte = planner_rt_fetch(rtindex, root);
+
+ Assert(cell != NULL);
+
+ rte->eref = lfirst(cell);
+ cell = lnext(cell);
+
+ rte->alias = lfirst(cell);
+ cell = lnext(cell);
+ }
+ bms_free(tmprelids);
+
+ appendStringInfo(expr, " %s", exprstr);
+ return;
+}
+#endif /* XCP */
+#endif /* PGXC */
+
/*
* create_append_plan
* Create an Append plan for 'best_path' and (recursively) plans
@@ -924,6 +1663,14 @@ create_unique_plan(PlannerInfo *root, UniquePath *best_path)
subplan = (Plan *) make_result(root, newtlist, NULL, subplan);
else
subplan->targetlist = newtlist;
+#ifdef XCP
+ /*
+ * RemoteSubplan is conditionally projection capable - it is pushing
+ * projection to the data nodes
+ */
+ if (IsA(subplan, RemoteSubplan))
+ subplan->lefttree->targetlist = newtlist;
+#endif
}
/*
@@ -1045,6 +1792,140 @@ create_unique_plan(PlannerInfo *root, UniquePath *best_path)
}
+#ifdef XCP
+/*
+ * adjustSubplanDistribution
+ * Make sure the distribution of the subplan is matching to the consumers.
+ */
+static void
+adjustSubplanDistribution(PlannerInfo *root, Distribution *pathd,
+ Distribution *subd)
+{
+ /* Replace path restriction with actual */
+ if (pathd && !bms_is_empty(root->curOuterRestrict))
+ {
+ bms_free(pathd->restrictNodes);
+ pathd->restrictNodes = bms_copy(root->curOuterRestrict);
+ }
+
+ root->curOuterRestrict = NULL;
+
+ /*
+ * Set new restriction for the subpath
+ * Do not restrict if distributions are equal, they are going to be merged
+ * and subplan will be executed on caller nodes.
+ * However if there are upper query levels caller's distribution may be
+ * adjusted.
+ */
+ if (subd && !equal(subd, pathd))
+ {
+ /*
+ * If subpath is replicated without restriction choose one execution
+ * datanode and set it as current restriction.
+ */
+ if (IsLocatorReplicated(subd->distributionType) &&
+ bms_num_members(subd->restrictNodes) != 1)
+ {
+ Bitmapset *result = NULL;
+ Bitmapset *execute;
+ Bitmapset *common;
+ int node;
+
+ /*
+ * We should choose one of the distribution nodes, but we can save
+ * some network traffic if chosen execution node will be one of
+ * the result nodes at the same time.
+ */
+ if (pathd)
+ result = bms_is_empty(pathd->restrictNodes) ?
+ pathd->nodes : pathd->restrictNodes;
+ execute = bms_is_empty(subd->restrictNodes) ?
+ subd->nodes : subd->restrictNodes;
+ common = bms_intersect(result, execute);
+ if (bms_is_empty(common))
+ {
+ bms_free(common);
+ common = bms_copy(subd->nodes);
+ }
+
+ /*
+ * Check if any of the common nodes is preferred and choose one
+ * of the preferred
+ */
+ node = GetAnyDataNode(common);
+ bms_free(common);
+
+ /* set restriction for the subplan */
+ root->curOuterRestrict = bms_make_singleton(node);
+
+ /* replace execution restriction for the generated */
+ bms_free(subd->restrictNodes);
+ subd->restrictNodes = bms_make_singleton(node);
+ }
+ }
+}
+
+/*
+ * create_remotescan_plan
+ * Create a RemoteSubquery plan for 'best_path' and (recursively) plans
+ * for its subpaths.
+ *
+ * Returns a Plan node.
+ */
+static RemoteSubplan *
+create_remotescan_plan(PlannerInfo *root,
+ RemoteSubPath *best_path)
+{
+ RemoteSubplan *plan;
+ Plan *subplan;
+ Bitmapset *saverestrict;
+
+ /*
+ * Subsequent code will modify current restriction, it needs to be restored
+ * so other path nodes in the outer tree could see correct value.
+ */
+ saverestrict = root->curOuterRestrict;
+
+ adjustSubplanDistribution(root, best_path->path.distribution,
+ best_path->subpath->distribution);
+
+ subplan = create_plan_recurse(root, best_path->subpath);
+
+ /* We don't want any excess columns in the remote tuples */
+ disuse_physical_tlist(subplan, best_path->subpath);
+
+ plan = make_remotesubplan(root, subplan,
+ best_path->path.distribution,
+ best_path->subpath->distribution,
+ best_path->path.pathkeys);
+
+ copy_path_costsize(&plan->scan.plan, (Path *) best_path);
+
+ /* restore current restrict */
+ bms_free(root->curOuterRestrict);
+ root->curOuterRestrict = saverestrict;
+
+ return plan;
+}
+
+
+RemoteSubplan *
+find_push_down_plan(Plan *plan, bool force)
+{
+ if (IsA(plan, RemoteSubplan) &&
+ (force || (list_length(((RemoteSubplan *) plan)->nodeList) > 1 &&
+ ((RemoteSubplan *) plan)->execOnAll)))
+ return (RemoteSubplan *) plan;
+ if (IsA(plan, Hash) ||
+ IsA(plan, Material) ||
+ IsA(plan, Unique) ||
+ IsA(plan, Limit))
+ return find_push_down_plan(plan->lefttree, force);
+ return NULL;
+}
+#endif
+
+
/*****************************************************************************
*
* BASE-RELATION SCAN METHODS
@@ -1862,6 +2743,335 @@ create_worktablescan_plan(PlannerInfo *root, Path *best_path,
return scan_plan;
}
+
+#ifdef PGXC
+#ifndef XCP
+/*
+ * mk_row_mark_clause
+ * Given a PlanRowMark, create a corresponding RowMarkClause
+ *
+ * Returns NULL if 'prm' is NULL or its mark type is neither
+ * ROW_MARK_EXCLUSIVE (FOR UPDATE) nor ROW_MARK_SHARE (FOR SHARE).
+ * The result is used only while deparsing the remote query.
+ */
+static RowMarkClause *
+mk_row_mark_clause(PlanRowMark *prm)
+{
+ RowMarkClause *rmc;
+
+ if (prm == NULL)
+ return NULL;
+
+ /* We are interested in either FOR UPDATE or FOR SHARE */
+ if (prm->markType != ROW_MARK_EXCLUSIVE && prm->markType != ROW_MARK_SHARE)
+ return NULL;
+
+ rmc = makeNode(RowMarkClause);
+
+ /* Copy rti as is from the PlanRowMark */
+ rmc->rti = prm->rti;
+
+ /* Assume FOR SHARE unless compelled FOR UPDATE */
+ rmc->forUpdate = false;
+ if (prm->markType == ROW_MARK_EXCLUSIVE)
+ rmc->forUpdate = true;
+
+ /* Copy noWait as is from the PlanRowMark */
+ rmc->noWait = prm->noWait;
+
+ /* true or false does not matter since we will use the result only while deparsing */
+ rmc->pushedDown = false;
+
+ return rmc;
+}
+
<trun cated data truncated for brevity/>
+/*
+ * compare_alias
+ * Compare two aliases
+ *
+ * Two NULL aliases compare equal; a NULL and a non-NULL alias do not.
+ * Otherwise only the alias names are compared (column aliases are
+ * ignored).
+ */
+static bool
+compare_alias(Alias *a1, Alias *a2)
+{
+ if (a1 == NULL && a2 == NULL)
+ return true;
+
+ if (a1 == NULL && a2 != NULL)
+ return false;
+
+ if (a2 == NULL && a1 != NULL)
+ return false;
+
+ if (strcmp(a1->aliasname, a2->aliasname) == 0)
+ return true;
+
+ return false;
+}
+
+/*
+ * contains_only_vars(tlist)
+ * Return true only if each element of tlist is a target entry having Var node
+ * as its containing expression.
+ */
+static bool
+contains_only_vars(List *tlist)
+{
+ ListCell *l;
+
+ foreach(l, (List *) tlist)
+ {
+ Node *tle = lfirst(l);
+ /* Each element must be a TargetEntry wrapping a plain Var */
+ if (nodeTag(tle) != T_TargetEntry)
+ return false;
+ else
+ {
+ Expr *expr = ((TargetEntry *) tle)->expr;
+ if (nodeTag(expr) != T_Var)
+ return false;
+ }
+ }
+ return true;
+}
+
+/*
+ * create_remotequery_plan
+ * Returns a remotequery plan for the base relation scanned by 'best_path'
+ * with restriction clauses 'scan_clauses' and targetlist 'tlist'.
+ *
+ * Shippable quals are deparsed into the SQL sent to the Datanodes;
+ * non-shippable quals are kept as local quals on the RemoteQuery node.
+ */
+static Plan *
+create_remotequery_plan(PlannerInfo *root, Path *best_path,
+ List *tlist, List *scan_clauses)
+{
+ RemoteQuery *scan_plan;
+ Index scan_relid = best_path->parent->relid;
+ RangeTblEntry *rte;
+ List *remote_scan_clauses = NIL;
+ List *local_scan_clauses = NIL;
+ StringInfoData sql;
+ Query *query;
+ RangeTblRef *rtr;
+ List *varlist;
+ ListCell *varcell;
+ Node *tmp_node;
+ List *rmlist;
+ List *tvarlist;
+ bool tlist_is_simple;
+ List *base_tlist; /* the target list representing the
+ * result obtained from datanode
+ */
+ RangeTblEntry *dummy_rte; /* RTE for the remote query node being
+ * added.
+ */
+ Index dummy_rtindex;
+
+ Assert(scan_relid > 0);
+ Assert(best_path->parent->rtekind == RTE_RELATION);
+
+ /* Sort clauses into best execution order */
+ scan_clauses = order_qual_clauses(root, scan_clauses);
+ /* Reduce RestrictInfo list to bare expressions; ignore pseudoconstants */
+ scan_clauses = extract_actual_clauses(scan_clauses, false);
+
+ /* Split quals into shippable (remote) and non-shippable (local) sets */
+ if (scan_clauses)
+ {
+ ListCell *l;
+
+ foreach(l, (List *)scan_clauses)
+ {
+ Node *clause = lfirst(l);
+
+ if (pgxc_is_expr_shippable((Expr *)clause, NULL))
+ remote_scan_clauses = lappend(remote_scan_clauses, clause);
+ else
+ local_scan_clauses = lappend(local_scan_clauses, clause);
+ }
+ }
+
+ /*
+ * The target list passed in may not contain the Vars required for
+ * evaluating the quals. Add those quals in the targetlist
+ */
+ tlist = add_to_flat_tlist(tlist, copyObject(pull_var_clause((Node *)local_scan_clauses,
+ PVC_RECURSE_AGGREGATES,
+ PVC_RECURSE_PLACEHOLDERS)));
+ tlist_is_simple = contains_only_vars(tlist);
+
+ /*
+ * Construct a Query structure for the query to be fired on the Datanodes
+ * and deparse it. Fields not set remain memzero'ed as set by makeNode.
+ */
+ rte = rt_fetch(scan_relid, root->parse->rtable);
+ Assert(rte->rtekind == RTE_RELATION);
+ /* Make a copy of RTE to be included in the new query structure */
+ rte = copyObject(rte);
+ /* This RTE should appear in FROM clause of the SQL statement constructed */
+ rte->inFromCl = true;
+
+ query = makeNode(Query);
+ query->commandType = CMD_SELECT;
+ query->rtable = list_make1(rte);
+ query->jointree = makeNode(FromExpr);
+
+ rtr = makeNode(RangeTblRef);
+ rtr->rtindex = list_length(query->rtable);
+ /* There can be only one table */
+ Assert(rtr->rtindex == 1);
+
+ query->jointree->fromlist = list_make1(rtr);
+ query->jointree->quals = (Node *)make_ands_explicit(copyObject(remote_scan_clauses));
+
+ /*
+ * RemoteQuery node cannot handle arbitrary expressions in the target list.
+ * So if the target list has any elements that are not plain Vars, we need
+ * to create a Result node above RemoteQuery, and assign a plain var tlist
+ * in RemoteQuery node, and Result node will handle the expressions. So if
+ * the passed-in tlist is not a simple vars tlist, derive one out of the
+ * tlist.
+ */
+ if (tlist_is_simple)
+ query->targetList = copyObject(tlist);
+ else
+ {
+ tvarlist = copyObject(pull_var_clause((Node *)tlist,
+ PVC_RECURSE_AGGREGATES,
+ PVC_RECURSE_PLACEHOLDERS));
+ query->targetList = add_to_flat_tlist(NIL, copyObject(tvarlist));
+ }
+
+ /*
+ * We are going to change the Var nodes in the target list to be sent to the
+ * datanode. We need the original tlist to establish the mapping of result
+ * obtained from the datanode in this plan. It will be saved in
+ * RemoteQuery->base_tlist. So, copy the target list before modifying it
+ */
+ base_tlist = copyObject(query->targetList);
+
+ /*
+ * Change the varno in Var nodes in the targetlist of the query to be shipped to the
+ * Datanode to 1, to match the rtable in the query. Do the same for Var
+ * nodes in quals.
+ */
+ varlist = list_concat(pull_var_clause((Node *)query->targetList,
+ PVC_RECURSE_AGGREGATES,
+ PVC_RECURSE_PLACEHOLDERS),
+ pull_var_clause((Node *)query->jointree->quals,
+ PVC_RECURSE_AGGREGATES,
+ PVC_RECURSE_PLACEHOLDERS));
+
+ foreach(varcell, varlist)
+ {
+ Var *var = lfirst(varcell);
+ if (var->varno != scan_relid)
+ elog(ERROR, "Single table scan can not handle vars from more than one relation");
+ var->varno = rtr->rtindex;
+ }
+ list_free(varlist);
+
+ /*
+ * Call fix_scan_expr to fix the PlaceHolderVars. This step is not needed if
+ * we construct the query at the time of execution.
+ */
+ tmp_node = pgxc_fix_scan_expr(root, (Node *)query->targetList, 0);
+ Assert(!tmp_node || IsA(tmp_node, List));
+ query->targetList = (List *)tmp_node;
+ tmp_node = pgxc_fix_scan_expr(root, (Node *)query->jointree->quals, 0);
+ query->jointree->quals = tmp_node;
+
+ /*
+ * Before deparsing the query we need to check whether there are any FOR UPDATE/SHARE clauses
+ * in the query that we need to propagate to Datanodes
+ */
+ rmlist = NULL;
+ if (root->xc_rowMarks != NULL)
+ {
+ ListCell *rmcell;
+
+ foreach(rmcell, root->xc_rowMarks)
+ {
+ PlanRowMark *prm = lfirst(rmcell);
+ RangeTblEntry *rte_in_rm;
+
+ /*
+ * One remote query node contains one table only, check to make sure that
+ * this row mark clause is referring to the same table that this remote
+ * query node is targeting.
+ */
+ rte_in_rm = rt_fetch(prm->rti, root->parse->rtable);
+ if (rte_in_rm->relid == rte->relid && compare_alias(rte->alias, rte_in_rm->alias))
+ {
+ RowMarkClause *rmc;
+
+ /*
+ * Change the range table index in the row mark clause to 1
+ * to match the rtable in the query
+ */
+ prm->rti = 1;
+
+ /* Come up with a Row Mark Clause given a Plan Row Mark */
+ rmc = mk_row_mark_clause(prm);
+
+ if (rmc != NULL)
+ {
+ /* Add this row mark clause to the list to be added in the query to deparse */
+ rmlist = lappend(rmlist, rmc);
+
+ /*
+ * Although we can have multiple row mark clauses even for a single table
+ * but here we will have only one plan row mark clause per table
+ * The reason is that here we are talking about only FOR UPDATE & FOR SHARE
+ * If we have both FOR SHARE and FOR UPDATE mentioned for the same table
+ * FOR UPDATE takes priority over FOR SHARE and in effect we will have only one clause.
+ */
+ break;
+ }
+ }
+ }
+
+ /* copy the row mark clause list in the query to deparse */
+ query->rowMarks = rmlist;
+
+ /* If there is a row mark clause, set the flag for deparsing of the row mark clause */
+ if (rmlist != NULL)
+ query->hasForUpdate = true;
+ }
+ initStringInfo(&sql);
+ deparse_query(query, &sql, NIL);
+
+ if (rmlist != NULL)
+ list_free_deep(rmlist);
+
+ /*
+ * Create and append the dummy range table entry to the range table.
+ * Note that this modifies the master copy the caller passed us, otherwise
+ * e.g EXPLAIN VERBOSE will fail to find the rte the Vars built below refer
+ * to.
+ */
+ dummy_rte = make_dummy_remote_rte(get_rel_name(rte->relid),
+ makeAlias("_REMOTE_TABLE_QUERY_", NIL));
+ root->parse->rtable = lappend(root->parse->rtable, dummy_rte);
+ dummy_rtindex = list_length(root->parse->rtable);
+
+ scan_plan = make_remotequery(tlist, local_scan_clauses, dummy_rtindex);
+
+ /* Track if the remote query involves a temporary object */
+ scan_plan->is_temp = IsTempTable(rte->relid);
+ scan_plan->read_only = (query->commandType == CMD_SELECT && !query->hasForUpdate);
+ scan_plan->has_row_marks = query->hasForUpdate;
+ scan_plan->sql_statement = sql.data;
+ scan_plan->base_tlist = base_tlist;
+ scan_plan->exec_nodes = GetRelationNodesByQuals(rte->relid, rtr->rtindex,
+ query->jointree->quals,
+ RELATION_ACCESS_READ);
+ if (!scan_plan->exec_nodes)
+ elog(ERROR, "No distribution information found for relid %d", rte->relid);
+
+ copy_path_costsize(&scan_plan->scan.plan, best_path);
+
+ /* PGXCTODO - get better estimates */
+ scan_plan->scan.plan.plan_rows = 1000;
+
+ scan_plan->has_ins_child_sel_parent = root->parse->is_ins_child_sel_parent;
+
+ return (Plan *)scan_plan;
+}
+#endif /* XCP */
+#endif /* PGXC */
+
/*
* create_foreignscan_plan
* Returns a foreignscan plan for the base relation scanned by 'best_path'
@@ -2019,6 +3229,27 @@ create_nestloop_plan(PlannerInfo *root,
else
prev = cell;
}
+#ifdef XCP
+ /*
+ * While NestLoop is executed it rescans inner plan. We do not want to
+ * rescan RemoteSubplan and do not support it.
+ * So if inner_plan is a RemoteSubplan, materialize it.
+ */
+ if (IsA(inner_plan, RemoteSubplan))
+ {
+ Plan *matplan = (Plan *) make_material(inner_plan);
+
+ /*
+ * We assume the materialize will not spill to disk, and therefore
+ * charge just cpu_operator_cost per tuple. (Keep this estimate in
+ * sync with cost_mergejoin.)
+ */
+ copy_plan_costsize(matplan, inner_plan);
+ matplan->total_cost += cpu_operator_cost * matplan->plan_rows;
+
+ inner_plan = matplan;
+ }
+#endif
join_plan = make_nestloop(tlist,
joinclauses,
@@ -3286,6 +4517,382 @@ make_worktablescan(List *qptlist,
return node;
}
+
+#ifdef PGXC
+#ifndef XCP
+/*
+ * make_remotequery
+ * Build a bare RemoteQuery node with the given target list, local quals
+ * and scan range table index. Cost fields and the SQL statement are to
+ * be filled in by the caller.
+ */
+static RemoteQuery *
+make_remotequery(List *qptlist, List *qpqual, Index scanrelid)
+{
+ RemoteQuery *node = makeNode(RemoteQuery);
+ Plan *plan = &node->scan.plan;
+
+ /* cost should be inserted by caller */
+ plan->targetlist = qptlist;
+ plan->qual = qpqual;
+ plan->lefttree = NULL;
+ plan->righttree = NULL;
+ node->scan.scanrelid = scanrelid;
+ node->read_only = true;
+ node->has_row_marks = false;
+
+ return node;
+}
+#endif /* XCP */
+#endif /* PGXC */
+
+
+#ifdef XCP
+/*
+ * make_remotesubplan
+ * Create a RemoteSubplan node to execute subplan on remote nodes.
+ * lefttree - the subplan which we want to push down to remote node.
+ * resultDistribution - the distribution of the remote result. May be NULL -
+ * results are coming to the invoking node
+ * execDistribution - determines how source data of the subplan are
+ * distributed, where we should send the subplan and how combine results.
+ * pathkeys - the remote subplan is sorted according to these keys, executor
+ * should perform merge sort of incoming tuples
+ */
+RemoteSubplan *
+make_remotesubplan(PlannerInfo *root,
+ Plan *lefttree,
+ Distribution *resultDistribution,
+ Distribution *execDistribution,
+ List *pathkeys)
+{
+ RemoteSubplan *node = makeNode(RemoteSubplan);
+ Plan *plan = &node->scan.plan;
+ Bitmapset *tmpset;
+ int nodenum;
+
+ /* Sanity checks */
+ Assert(!equal(resultDistribution, execDistribution));
+ Assert(!IsA(lefttree, RemoteSubplan));
+
+ if (resultDistribution)
+ {
+ node->distributionType = resultDistribution->distributionType;
+ node->distributionKey = InvalidAttrNumber;
+ if (resultDistribution->distributionExpr)
+ {
+ ListCell *lc;
+ Expr *expr;
+
+ /* XXX Is that correct to reference a column of different type? */
+ if (IsA(resultDistribution->distributionExpr, RelabelType))
+ expr = ((RelabelType *) resultDistribution->distributionExpr)->arg;
+ else
+ expr = (Expr *) resultDistribution->distributionExpr;
+
+ /* Find distribution expression in the target list */
+ foreach(lc, lefttree->targetlist)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(lc);
+
+ if (equal(tle->expr, expr))
+ {
+ node->distributionKey = tle->resno;
+ break;
+ }
+ }
+
+ if (node->distributionKey == InvalidAttrNumber)
+ {
+ TargetEntry *newtle;
+
+ /* The expression is not found, need to add junk */
+ newtle = makeTargetEntry(expr,
+ list_length(lefttree->targetlist) + 1,
+ NULL,
+ true);
+
+ if (is_projection_capable_plan(lefttree))
+ {
+ /* Ok to modify subplan's target list */
+ lefttree->targetlist = lappend(lefttree->targetlist, newtle);
+ }
+ else
+ {
+ /* Use Result node to calculate expression */
+ List *newtlist = list_copy(lefttree->targetlist);
+ newtlist = lappend(newtlist, newtle);
+ lefttree = (Plan *) make_result(root, newtlist, NULL, lefttree);
+ }
+
+ node->distributionKey = newtle->resno;
+ }
+ }
+ /*
+ * The distributionNodes describes result distribution
+ */
+ tmpset = bms_copy(resultDistribution->nodes);
+ node->distributionNodes = NIL;
+ while ((nodenum = bms_first_member(tmpset)) >= 0)
+ node->distributionNodes = lappend_int(node->distributionNodes,
+ nodenum);
+ bms_free(tmpset);
+ /*
+ * The distributionRestrict defines the set of nodes where results are
+ * actually shipped. These are the nodes where upper level step
+ * is executed.
+ */
+ if (resultDistribution->restrictNodes)
+ {
+ tmpset = bms_copy(resultDistribution->restrictNodes);
+ node->distributionRestrict = NIL;
+ while ((nodenum = bms_first_member(tmpset)) >= 0)
+ node->distributionRestrict =
+ lappend_int(node->distributionRestrict, nodenum);
+ bms_free(tmpset);
+ }
+ else
+ node->distributionRestrict = list_copy(node->distributionNodes);
+ }
+ else
+ {
+ node->distributionType = LOCATOR_TYPE_NONE;
+ node->distributionKey = InvalidAttrNumber;
+ node->distributionNodes = NIL;
+ }
+ plan->qual = NIL;
+ plan->lefttree = lefttree;
+ plan->righttree = NULL;
+ copy_plan_costsize(plan, lefttree);
+ /* determine where subplan will be executed */
+ if (execDistribution)
+ {
+ if (execDistribution->restrictNodes)
+ tmpset = bms_copy(execDistribution->restrictNodes);
+ else
+ tmpset = bms_copy(execDistribution->nodes);
+ node->nodeList = NIL;
+ while ((nodenum = bms_first_member(tmpset)) >= 0)
+ node->nodeList = lappend_int(node->nodeList, nodenum);
+ bms_free(tmpset);
+ node->execOnAll = list_length(node->nodeList) == 1 ||
+ !IsLocatorReplicated(execDistribution->distributionType);
+ }
+ else
+ {
+ /*
+ * Prepare single execution of replicated subplan. Choose one node from
+ * the execution node list, preferably the node is also a member of
+ * the list of result nodes, so later all node executors contact the
+ * same node to get tuples
+ */
+ tmpset = NULL;
+ /*
+ * NOTE(review): this branch dereferences resultDistribution without
+ * a NULL check, so it assumes callers never pass both distributions
+ * as NULL - TODO confirm.
+ */
+ if (!bms_is_empty(resultDistribution->restrictNodes))
+ tmpset = bms_copy(resultDistribution->restrictNodes);
+ else
+ tmpset = bms_copy(resultDistribution->nodes);
+ /*
+ * If result goes on single node execute subplan locally
+ */
+ if (bms_num_members(tmpset) > 1)
+ {
+ /* get one execution node TODO: load balancing */
+ nodenum = bms_first_member(tmpset);
+ node->nodeList = list_make1_int(nodenum);
+ node->execOnAll = true;
+ }
+ else
+ {
+ node->nodeList = NIL;
+ node->execOnAll = false;
+ }
+ bms_free(tmpset);
+ }
+ plan->targetlist = lefttree->targetlist;
+ /* We do not need to merge sort if only one node is yielding tuples */
+ if (pathkeys && node->execOnAll && list_length(node->nodeList) > 1)
+ {
+ List *tlist = lefttree->targetlist;
+ ListCell *i;
+ int numsortkeys;
+ AttrNumber *sortColIdx;
+ Oid *sortOperators;
+ Oid *collations;
+ bool *nullsFirst;
+
+ /*
+ * We will need at most list_length(pathkeys) sort columns; possibly less
+ */
+ numsortkeys = list_length(pathkeys);
+ sortColIdx = (AttrNumber *) palloc(numsortkeys * sizeof(AttrNumber));
+ sortOperators = (Oid *) palloc(numsortkeys * sizeof(Oid));
+ collations = (Oid *) palloc(numsortkeys * sizeof(Oid));
+ nullsFirst = (bool *) palloc(numsortkeys * sizeof(bool));
+
+ numsortkeys = 0;
+
+ foreach(i, pathkeys)
+ {
+ PathKey *pathkey = (PathKey *) lfirst(i);
+ EquivalenceClass *ec = pathkey->pk_eclass;
+ TargetEntry *tle = NULL;
+ Oid pk_datatype = InvalidOid;
+ Oid sortop;
+ ListCell *j;
+
+ if (ec->ec_has_volatile)
+ {
+ /*
+ * If the pathkey's EquivalenceClass is volatile, then it must
+ * have come from an ORDER BY clause, and we have to match it to
+ * that same targetlist entry.
+ */
+ if (ec->ec_sortref == 0) /* can't happen */
+ elog(ERROR, "volatile EquivalenceClass has no sortref");
+ tle = get_sortgroupref_tle(ec->ec_sortref, tlist);
+ Assert(tle);
+ Assert(list_length(ec->ec_members) == 1);
+ pk_datatype = ((EquivalenceMember *) linitial(ec->ec_members))->em_datatype;
+ }
+ else
+ {
+ /*
+ * Otherwise, we can sort by any non-constant expression listed in
+ * the pathkey's EquivalenceClass. For now, we take the first one
+ * that corresponds to an available item in the tlist. If there
+ * isn't any, use the first one that is an expression in the
+ * input's vars. (The non-const restriction only matters if the
+ * EC is below_outer_join; but if it isn't, it won't contain
+ * consts anyway, else we'd have discarded the pathkey as
+ * redundant.)
+ *
+ * XXX if we have a choice, is there any way of figuring out which
+ * might be cheapest to execute? (For example, int4lt is likely
+ * much cheaper to execute than numericlt, but both might appear
+ * in the same equivalence class...) Not clear that we ever will
+ * have an interesting choice in practice, so it may not matter.
+ */
+ foreach(j, ec->ec_members)
+ {
+ EquivalenceMember *em = (EquivalenceMember *) lfirst(j);
+
+ if (em->em_is_const)
+ continue;
+
+ tle = tlist_member((Node *) em->em_expr, tlist);
+ if (tle)
+ {
+ pk_datatype = em->em_datatype;
+ break; /* found expr already in tlist */
+ }
+
+ /*
+ * We can also use it if the pathkey expression is a relabel
+ * of the tlist entry, or vice versa. This is needed for
+ * binary-compatible cases (cf. make_pathkey_from_sortinfo).
+ * We prefer an exact match, though, so we do the basic search
+ * first.
+ */
+ tle = tlist_member_ignore_relabel((Node *) em->em_expr, tlist);
+ if (tle)
+ {
+ pk_datatype = em->em_datatype;
+ break; /* found expr already in tlist */
+ }
+ }
+
+ if (!tle)
+ {
+ /* No matching tlist item; look for a computable expression */
+ Expr *sortexpr = NULL;
+
+ foreach(j, ec->ec_members)
+ {
+ EquivalenceMember *em = (EquivalenceMember *) lfirst(j);
+ List *exprvars;
+ ListCell *k;
+
+ if (em->em_is_const)
+ continue;
+ sortexpr = em->em_expr;
+ exprvars = pull_var_clause((Node *) sortexpr,
+ PVC_INCLUDE_AGGREGATES,
+ PVC_INCLUDE_PLACEHOLDERS);
+ foreach(k, exprvars)
+ {
+ if (!tlist_member_ignore_relabel(lfirst(k), tlist))
+ break;
+ }
+ list_free(exprvars);
+ if (!k)
+ {
+ pk_datatype = em->em_datatype;
+ break; /* found usable expression */
+ }
+ }
+ if (!j)
+ elog(ERROR, "could not find pathkey item to sort");
+
+ /*
+ * Do we need to insert a Result node?
+ */
+ if (!is_projection_capable_plan(lefttree))
+ {
+ /* copy needed so we don't modify input's tlist below */
+ tlist = copyObject(tlist);
+ lefttree = (Plan *) make_result(root, tlist, NULL,
+ lefttree);
+ }
+
+ /*
+ * Add resjunk entry to input's tlist
+ */
+ tle = makeTargetEntry(sortexpr,
+ list_length(tlist) + 1,
+ NULL,
+ true);
+ tlist = lappend(tlist, tle);
+ lefttree->targetlist = tlist; /* just in case NIL before */
+ }
+ }
+
+ /*
+ * Look up the correct sort operator from the PathKey's slightly
+ * abstracted representation.
+ */
+ sortop = get_opfamily_member(pathkey->pk_opfamily,
+ pk_datatype,
+ pk_datatype,
+ pathkey->pk_strategy);
+ if (!OidIsValid(sortop)) /* should not happen */
+ elog(ERROR, "could not find member %d(%u,%u) of opfamily %u",
+ pathkey->pk_strategy, pk_datatype, pk_datatype,
+ pathkey->pk_opfamily);
+
+ /*
+ * The column might already be selected as a sort key, if the pathkeys
+ * contain duplicate entries. (This can happen in scenarios where
+ * multiple mergejoinable clauses mention the same var, for example.)
+ * So enter it only once in the sort arrays.
+ */
+ numsortkeys = add_sort_column(tle->resno,
+ sortop,
+ pathkey->pk_eclass->ec_collation,
+ pathkey->pk_nulls_first,
+ numsortkeys,
+ sortColIdx, sortOperators,
+ collations, nullsFirst);
+ }
+ Assert(numsortkeys > 0);
+
+ node->sort = makeNode(SimpleSort);
+ node->sort->numCols = numsortkeys;
+ node->sort->sortColIdx = sortColIdx;
+ node->sort->sortOperators = sortOperators;
+ node->sort->sortCollations = collations;
+ node->sort->nullsFirst = nullsFirst;
+ }
+ node->cursor = get_internal_cursor();
+ node->unique = 0;
+ return node;
+}
+#endif /* XCP */
+
+
ForeignScan *
make_foreignscan(List *qptlist,
List *qpqual,
@@ -3573,6 +5180,9 @@ make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
Sort *node = makeNode(Sort);
Plan *plan = &node->plan;
Path sort_path; /* dummy for result of cost_sort */
+#ifdef XCP
+ RemoteSubplan *pushdown;
+#endif
copy_plan_costsize(plan, lefttree); /* only care about copying size */
cost_sort(&sort_path, root, NIL,
@@ -3594,10 +5204,138 @@ make_sort(PlannerInfo *root, Plan *lefttree, int numCols,
node->collations = collations;
node->nullsFirst = nullsFirst;
+#ifdef XCP
+ /*
+ * It does not make sense to sort on one data node and then perform
+ * a one-tape merge sort. So do not push the sort down if there is a
+ * single remote data node
+ */
+ pushdown = find_push_down_plan(lefttree, false);
+ if (pushdown)
+ {
+ /* If we already sort results, need to prepend new keys to existing */
+ /*
+ * It is not safe to share column information.
+ * If another node will be pushed down the same RemoteSubplan column
+ * indexes may be modified and this would affect the Sort node
+ */
+ AttrNumber *newSortColIdx;
+ Oid *newSortOperators;
+ Oid *newCollations;
+ bool *newNullsFirst;
+ int newNumCols;
+ int i, j;
+
+ /*
+ * Insert new sort node immediately below the pushdown plan
+ */
+ plan->lefttree = pushdown->scan.plan.lefttree;
+ pushdown->scan.plan.lefttree = plan;
+
+ newNumCols = numCols + (pushdown->sort ? pushdown->sort->numCols : 0);
+ newSortColIdx = (AttrNumber *) palloc(newNumCols * sizeof(AttrNumber));
+ newSortOperators = (Oid *) palloc(newNumCols * sizeof(Oid));
+ newCollations = (Oid *) palloc(newNumCols * sizeof(Oid));
+ newNullsFirst = (bool *) palloc(newNumCols * sizeof(bool));
+
+ /* Copy sort columns */
+ for (i = 0; i < numCols; i++)
+ {
+ newSortColIdx[i] = sortColIdx[i];
+ newSortOperators[i] = sortOperators[i];
+ newCollations[i] = collations[i];
+ newNullsFirst[i] = nullsFirst[i];
+ }
+
+ newNumCols = numCols;
+ if (pushdown->sort)
+ {
+ /* Continue and copy old keys of the subplan which is now under the
+ * sort */
+ for (j = 0; j < pushdown->sort->numCols; j++)
+ newNumCols = add_sort_column(pushdown->sort->sortColIdx[j],
+ pushdown->sort->sortOperators[j],
+ pushdown->sort->sortCollations[j],
+ pushdown->sort->nullsFirst[j],
+ newNumCols,
+ newSortColIdx,
+ newSortOperators,
+ newCollations,
+ newNullsFirst);
+ }
+ else
+ {
+ /* Create simple sort object if does not exist */
+ pushdown->sort = makeNode(SimpleSort);
+ }
+
+ pushdown->sort->numCols = newNumCols;
+ pushdown->sort->sortColIdx = newSortColIdx;
+ pushdown->sort->sortOperators = newSortOperators;
+ pushdown->sort->sortCollations = newCollations;
+ pushdown->sort->nullsFirst = newNullsFirst;
+
+ /*
+ * lefttree is not actually a Sort, but we hope it is not important and
+ * the result will be used as a generic Plan node.
+ */
+ return (Sort *) lefttree;
+ }
+#endif
return node;
}
/*
+ * add_sort_column --- utility subroutine for building sort info arrays
+ *
+ * We need this routine because the same column might be selected more than
+ * once as a sort key column; if so, the extra mentions are redundant.
+ *
+ * Caller is assumed to have allocated the arrays large enough for the
+ * max possible number of columns. Return value is the new column count.
+ */
+static int
+add_sort_column(AttrNumber colIdx, Oid sortOp, Oid coll, bool nulls_first,
+ int numCols, AttrNumber *sortColIdx,
+ Oid *sortOperators, Oid *collations, bool *nullsFirst)
+{
+ int i;
+
+ Assert(OidIsValid(sortOp));
+
+ /* Scan existing keys for a duplicate of (colIdx, sortOp, coll) */
+ for (i = 0; i < numCols; i++)
+ {
+ /*
+ * Note: we check sortOp because it's conceivable that "ORDER BY foo
+ * USING <, foo USING <<<" is not redundant, if <<< distinguishes
+ * values that < considers equal. We need not check nulls_first
+ * however because a lower-order column with the same sortop but
+ * opposite nulls direction is redundant.
+ *
+ * We could probably consider sort keys with the same sortop and
+ * different collations to be redundant too, but for the moment treat
+ * them as not redundant. This will be needed if we ever support
+ * collations with different notions of equality.
+ */
+ if (sortColIdx[i] == colIdx &&
+ sortOperators[i] == sortOp &&
+ collations[i] == coll)
+ {
+ /* Already sorting by this col, so extra sort key is useless */
+ return numCols;
+ }
+ }
+
+ /* Add the column */
+ sortColIdx[numCols] = colIdx;
+ sortOperators[numCols] = sortOp;
+ collations[numCols] = coll;
+ nullsFirst[numCols] = nulls_first;
+ return numCols + 1;
+}
+
+
+/*
* prepare_sort_from_pathkeys
* Prepare to sort according to given pathkeys
*
@@ -3823,6 +5561,14 @@ prepare_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys,
true);
tlist = lappend(tlist, tle);
lefttree->targetlist = tlist; /* just in case NIL before */
+#ifdef XCP
+ /*
+ * RemoteSubplan is conditionally projection capable - it is
+ * pushing projection to the data nodes
+ */
+ if (IsA(lefttree, RemoteSubplan))
+ lefttree->lefttree->targetlist = tlist;
+#endif
}
/*
@@ -4088,6 +5834,137 @@ materialize_finished_plan(Plan *subplan)
return matplan;
}
+
+#ifdef XCP
+typedef struct
+{
+ List *subtlist; /* target list of the pushed-down subplan */
+ List *newtlist; /* new target list accumulated by the walker */
+} find_referenced_cols_context;
+
+/*
+ * find_referenced_cols_walker
+ * Walk an expression tree collecting into context->newtlist the
+ * expressions the pushed-down subplan must produce: Aggrefs (retyped
+ * to their transition data type for phase-1 evaluation), expressions
+ * already present in context->subtlist, and bare Vars (added as
+ * resjunk entries). Returns true if the expression cannot be pushed
+ * down (a DISTINCT aggregate, or an aggregate with no collection
+ * function).
+ */
+static bool
+find_referenced_cols_walker(Node *node, find_referenced_cols_context *context)
+{
+ TargetEntry *tle;
+
+ if (node == NULL)
+ return false;
+ if (IsA(node, Aggref))
+ {
+ /*
+ * We can not push down aggregates with DISTINCT.
+ */
+ if (((Aggref *) node)->aggdistinct)
+ return true;
+
+ /*
+ * We need to add aggregate reference to the new tlist if it
+ * is not already there. A Phase 1 aggregate actually returns values
+ * of the transition data type, so we should change the data type of
+ * the expression.
+ */
+ if (!tlist_member(node, context->newtlist))
+ {
+ Aggref *aggref = (Aggref *) node;
+ Aggref *newagg;
+ TargetEntry *newtle;
+ HeapTuple aggTuple;
+ Form_pg_aggregate aggform;
+ Oid aggtranstype;
+ Oid aggcollecttype;
+
+ aggTuple = SearchSysCache1(AGGFNOID,
+ ObjectIdGetDatum(aggref->aggfnoid));
+ if (!HeapTupleIsValid(aggTuple))
+ elog(ERROR, "cache lookup failed for aggregate %u",
+ aggref->aggfnoid);
+ aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
+ aggtranstype = aggform->aggtranstype;
+ aggcollecttype = aggform->aggcollecttype;
+ ReleaseSysCache(aggTuple);
+
+ /* Can not split two-phase aggregate */
+ if (!OidIsValid(aggcollecttype))
+ return true;
+
+ /* Resolve a polymorphic transition type from the actual inputs */
+ if (IsPolymorphicType(aggtranstype))
+ {
+ Oid *inputTypes;
+ Oid *declaredArgTypes;
+ int agg_nargs;
+ int numArgs;
+ ListCell *l;
+
+ inputTypes = (Oid *) palloc(sizeof(Oid) * list_length(aggref->args));
+ numArgs = 0;
+ foreach(l, aggref->args)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(l);
+
+ if (!tle->resjunk)
+ inputTypes[numArgs++] = exprType((Node *) tle->expr);
+ }
+
+ /* have to fetch the agg's declared input types... */
+ (void) get_func_signature(aggref->aggfnoid,
+ &declaredArgTypes, &agg_nargs);
+ Assert(agg_nargs == numArgs);
+
+
+ aggtranstype = enforce_generic_type_consistency(inputTypes,
+ declaredArgTypes,
+ agg_nargs,
+ aggtranstype,
+ false);
+ pfree(inputTypes);
+ pfree(declaredArgTypes);
+ }
+ newagg = copyObject(aggref);
+ newagg->aggtype = aggtranstype;
+
+ newtle = makeTargetEntry((Expr *) newagg,
+ list_length(context->newtlist) + 1,
+ NULL,
+ false);
+ context->newtlist = lappend(context->newtlist, newtle);
+ }
+
+ return false;
+ }
+ /*
+ * If expression is in the subtlist copy it into new tlist
+ */
+ tle = tlist_member(node, context->subtlist);
+ if (tle && !tlist_member((Node *) tle->expr, context->newtlist))
+ {
+ TargetEntry *newtle;
+ newtle = makeTargetEntry((Expr *) copyObject(node),
+ list_length(context->newtlist) + 1,
+ tle->resname,
+ false);
+ context->newtlist = lappend(context->newtlist, newtle);
+ return false;
+ }
+ if (IsA(node, Var))
+ {
+ /*
+ * Referenced Var is not a member of subtlist.
+ * Go ahead and add junk one.
+ */
+ TargetEntry *newtle;
+ newtle = makeTargetEntry((Expr *) copyObject(node),
+ list_length(context->newtlist) + 1,
+ NULL,
+ true);
+ context->newtlist = lappend(context->newtlist, newtle);
+ return false;
+ }
+ return expression_tree_walker(node, find_referenced_cols_walker,
+ (void *) context);
+}
+#endif
+
+
Agg *
make_agg(PlannerInfo *root, List *tlist, List *qual,
AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
@@ -4099,6 +5976,9 @@ make_agg(PlannerInfo *root, List *tlist, List *qual,
Plan *plan = &node->plan;
Path agg_path; /* dummy for result of cost_agg */
QualCost qual_cost;
+#ifdef XCP
+ RemoteSubplan *pushdown;
+#endif
node->aggstrategy = aggstrategy;
node->numCols = numGroupCols;
@@ -4151,6 +6031,141 @@ make_agg(PlannerInfo *root, List *tlist, List *qual,
plan->lefttree = lefttree;
plan->righttree = NULL;
+#ifdef XCP
+ /*
+ * If lefttree is a distributed subplan we may optimize aggregates by
+ * pushing down transition phase to remote data nodes, and therefore reduce
+ * traffic and distribute evaluation load.
+ * We need to find all Var and Aggref expressions in tlist and qual and make
+ * up a new tlist from these expressions. Update original Vars.
+ * Create new Agg node with the new tlist and aggdistribution AGG_SLAVE.
+ * Set new Agg node as a lefttree of the distributed subplan, moving
+ * existing lefttree down under the new Agg node. Set new tlist to the
+ * distributed subplan - it should be matching to the subquery.
+ * Set node's aggdistribution to AGG_MASTER and continue node initialization
+ */
+ pushdown = find_push_down_plan(lefttree, true);
+ if (pushdown)
+ {
+ find_referenced_cols_context context;
+
+ context.subtlist = pushdown->scan.plan.targetlist;
+ context.newtlist = NIL;
+ if (find_referenced_cols_walker((Node *) tlist, &context) ||
+ find_referenced_cols_walker((Node *) qual, &context))
+ {
+ /*
+ * We found we can not push down this aggregate, clean up and
+ * fallback to default procedure
+ */
+ node->aggdistribution = AGG_ONENODE;
+ }
+ else
+ {
+ Agg *phase1 = makeNode(Agg);
+ Plan *plan1 = &phase1->plan;
+ int i;
+
+ phase1->aggdistribution = AGG_SLAVE;
+ phase1->aggstrategy = aggstrategy;
+ phase1->numCols = numGroupCols;
+ phase1->grpColIdx = grpColIdx;
+ phase1->grpOperators = grpOperators;
+ phase1->numGroups = numGroups;
+
+ /*
+ * If we perform grouping we should make sure the grouping
+ * expressions are in the new tlist, and we should update indexes
+ * for the Phase2 aggregation node
+ */
+ if (numGroupCols > 0)
+ {
+ AttrNumber *newGrpColIdx;
+ newGrpColIdx = (AttrNumber *) palloc(sizeof(AttrNumber)
+ * numGroupCols);
+ for (i = 0; i < numGroupCols; i++)
+ {
+ TargetEntry *tle;
+ TargetEntry *newtle;
+
+ tle = (TargetEntry *) list_nth(context.subtlist,
+ grpColIdx[i] - 1);
+ newtle = tlist_member((Node *) tle->expr, context.newtlist);
+ if (newtle == NULL)
+ {
+ newtle = makeTargetEntry((Expr *) copyObject(tle->expr),
+ list_length(context.newtlist) + 1,
+ tle->resname,
+ false);
+ context.newtlist = lappend(context.newtlist, newtle);
+ }
+ newGrpColIdx[i] = newtle->resno;
+ }
+ node->grpColIdx = newGrpColIdx;
+ }
+
+ /*
+ * If the pushdown plan is sorting update sort column indexes
+ */
+ if (pushdown->sort)
+ {
+ SimpleSort *ssort = pushdown->sort;
+ for (i = 0; i < ssort->numCols; i++)
+ {
+ TargetEntry *tle;
+ TargetEntry *newtle;
+
+ tle = (TargetEntry *) list_nth(context.subtlist,
+ grpColIdx[i] - 1);
+ newtle = tlist_member((Node *) tle->expr, context.newtlist);
+ if (newtle == NULL)
+ {
+ /* XXX maybe we should just remove the sort key ? */
+ newtle = makeTargetEntry((Expr *) copyObject(tle->expr),
+ list_length(context.newtlist) + 1,
+ tle->resname,
+ false);
+ context.newtlist = lappend(context.newtlist, newtle);
+ }
+ ssort->sortColIdx[i] = newtle->resno;
+ }
+ }
+
+ copy_plan_costsize(plan1, (Plan *) pushdown); /* XXX costs copied from the RemoteSubplan; revisit whether phase-1 costs should come from its child instead */
+
+ /*
+ * We will produce a single output tuple if not grouping, and a tuple per
+ * group otherwise.
+ */
+ if (aggstrategy == AGG_PLAIN)
+ plan1->plan_rows = 1;
+ else
+ plan1->plan_rows = numGroups;
+
+ plan1->targetlist = context.newtlist;
+ plan1->qual = NIL;
+ plan1->lefttree = pushdown->scan.plan.lefttree;
+ pushdown->scan.plan.lefttree = plan1;
+ plan1->righttree = NULL;
+
+ /*
+ * Update target lists of all plans from lefttree till phase1.
+ * All they should be the same if the tree is transparent for push
+ * down modification.
+ */
+ while (lefttree != plan1)
+ {
+ lefttree->targetlist = context.newtlist;
+ lefttree = lefttree->lefttree;
+ }
+
+ node->aggdistribution = AGG_MASTER;
+ }
+ }
+ else
+ node->aggdistribution = AGG_ONENODE;
+#endif
+
return node;
}
@@ -4285,6 +6300,9 @@ make_unique(Plan *lefttree, List *distinctList)
AttrNumber *uniqColIdx;
Oid *uniqOperators;
ListCell *slitem;
+#ifdef XCP
+ RemoteSubplan *pushdown;
+#endif
copy_plan_costsize(plan, lefttree);
@@ -4329,6 +6347,30 @@ make_unique(Plan *lefttree, List *distinctList)
node->uniqColIdx = uniqColIdx;
node->uniqOperators = uniqOperators;
+#ifdef XCP
+ /*
+ * We want to filter out duplicates on nodes to reduce amount of data sent
+ * over network and reduce coordinator load.
+ */
+ pushdown = find_push_down_plan(lefttree, true);
+ if (pushdown)
+ {
+ Unique *node1 = makeNode(Unique);
+ Plan *plan1 = &node1->plan;
+
+ copy_plan_costsize(plan1, pushdown->scan.plan.lefttree);
+ plan1->targetlist = pushdown->scan.plan.lefttree->targetlist;
+ plan1->qual = NIL;
+ plan1->lefttree = pushdown->scan.plan.lefttree;
+ pushdown->scan.plan.lefttree = plan1;
+ plan1->righttree = NULL;
+
+ node1->numCols = numCols;
+ node1->uniqColIdx = uniqColIdx;
+ node1->uniqOperators = uniqOperators;
+ }
+#endif
+
return node;
}
@@ -4434,6 +6476,9 @@ make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount,
{
Limit *node = makeNode(Limit);
Plan *plan = &node->plan;
+#ifdef XCP
+ RemoteSubplan *pushdown;
+#endif
copy_plan_costsize(plan, lefttree);
@@ -4492,6 +6537,37 @@ make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount,
node->limitOffset = limitOffset;
node->limitCount = limitCount;
+#ifdef XCP
+ if ((limitOffset == NULL || offset_est > 0) &&
+ (limitCount == NULL || count_est > 0))
+ {
+ /*
+ * We may reduce amount of rows sent over the network and do not send more
+ * rows than necessary
+ */
+ pushdown = find_push_down_plan(lefttree, true);
+ if (pushdown)
+ {
+ Limit *node1 = makeNode(Limit);
+ Plan *plan1 = &node1->plan;
+
+ copy_plan_costsize(plan1, pushdown->scan.plan.lefttree);
+ plan1->targetlist = pushdown->scan.plan.lefttree->targetlist;
+ plan1->qual = NIL;
+ plan1->lefttree = pushdown->scan.plan.lefttree;
+ pushdown->scan.plan.lefttree = plan1;
+ plan1->righttree = NULL;
+
+ node1->limitOffset = NULL;
+ node1->limitCount = (Node *) makeConst(INT8OID, -1,
+ InvalidOid,
+ sizeof(int64),
+ Int64GetDatum(offset_est + count_est),
+ false, FLOAT8PASSBYVAL);
+ }
+ }
+#endif
+
return node;
}
@@ -4538,6 +6614,73 @@ make_result(PlannerInfo *root,
plan->righttree = NULL;
node->resconstantqual = resconstantqual;
+#ifdef XCP
+ if (subplan)
+ {
+ /*
+ * We do not gain performance when pushing down Result, but Result on
+ * top of RemoteSubplan would not allow to push down other plan nodes
+ */
+ RemoteSubplan *pushdown;
+ pushdown = find_push_down_plan(subplan, true);
+ if (pushdown)
+ {
+ /*
+ * Avoid pushing down results if the RemoteSubplan performs merge
+ * sort.
+ */
+ if (pushdown->sort)
+ return node;
+
+ /*
+ * If remote subplan is generating distribution we should keep it
+ * correct. Set valid expression as a distribution key.
+ */
+ if (pushdown->distributionKey != InvalidAttrNumber)
+ {
+ ListCell *lc;
+ TargetEntry *key;
+
+ key = list_nth(pushdown->scan.plan.targetlist,
+ pushdown->distributionKey);
+ pushdown->distributionKey = InvalidAttrNumber;
+ foreach(lc, tlist)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(lc);
+ if (equal(tle->expr, key->expr))
+ {
+ pushdown->distributionKey = tle->resno;
+ break;
+ }
+ }
+
+ if (pushdown->distributionKey != InvalidAttrNumber)
+ {
+ /* XXX(review): comment said "not found, adding", but the test above is != InvalidAttrNumber, i.e. the key WAS found — the condition looks inverted; verify */
+ TargetEntry *newtle;
+ /*
+ * The target entry is *NOT* junk to ensure it is not
+ * filtered out before sending from the data node.
+ */
+ newtle = makeTargetEntry(copyObject(key->expr),
+ list_length(tlist) + 1,
+ key->resname,
+ false);
+ tlist = lappend(tlist, newtle);
+ /* just in case if it was NIL */
+ plan->targetlist = tlist;
+ pushdown->distributionKey = newtle->resno;
+ }
+ }
+ /* This will be set as lefttree of the Result plan */
+ plan->lefttree = pushdown->scan.plan.lefttree;
+ pushdown->scan.plan.lefttree = plan;
+ /* Now RemoteSubplan returns different values */
+ pushdown->scan.plan.targetlist = tlist;
+ return (Result *) subplan;
+ }
+ }
+#endif /* XCP */
return node;
}
@@ -4628,39 +6771,1502 @@ is_projection_capable_plan(Plan *plan)
case T_MergeAppend:
case T_RecursiveUnion:
return false;
+#ifdef XCP
+ /*
+ * Remote subplan may push down projection to the data nodes if do not
+ * performs merge sort
+ */
+ case T_RemoteSubplan:
+ return ((RemoteSubplan *) plan)->sort == NULL &&
+ is_projection_capable_plan(plan->lefttree);
+#endif
default:
break;
}
return true;
}
+
+#ifdef XCP
+#define CNAME_MAXLEN 32
+static int cursor_id = 0;
+
+
+/*
+ * Return a name unique for the cluster
+ */
+static char *
+get_internal_cursor(void)
+{
+	char *cursor;
+
+	/* Result is palloc'd in the current memory context; caller owns it */
+	cursor = (char *) palloc(CNAME_MAXLEN);
+
+	/*
+	 * Advance the counter with an explicit wraparound test. The previous
+	 * coding (cursor_id++ == INT_MAX) incremented past INT_MAX before the
+	 * reset could take effect, which is signed-integer overflow and hence
+	 * undefined behavior in C. The sequence of generated ids is unchanged:
+	 * 1, 2, ... INT_MAX, 0, 1, ...
+	 */
+	if (cursor_id == INT_MAX)
+		cursor_id = 0;
+	else
+		cursor_id++;
+
+	/*
+	 * snprintf always NUL-terminates within the given size, so pass the
+	 * full buffer size; the former CNAME_MAXLEN - 1 wasted one byte.
+	 * The name combines node id, backend pid and the per-backend counter
+	 * to be unique across the cluster.
+	 */
+	snprintf(cursor, CNAME_MAXLEN, "p_%d_%x_%x",
+			 PGXCNodeId, getpid(), cursor_id);
+	return cursor;
+}
+#endif
+
+
#ifdef PGXC
+#ifndef XCP
/*
- * Wrapper functions to expose some functions to PGXC planner. These functions
- * are meant to be wrappers just calling the static function in this file. If
- * you need to add more functionality, add it to the original function.
+ * findReferencedVars()
+ *
+ * Constructs a list of those Vars in targetlist which are found in
+ * parent_vars (in other words, the intersection of targetlist and
+ * parent_vars). Returns a new list in *out_tlist and a bitmap of
+ * those relids found in the result.
+ *
+ * Additionally do look at the qual references to other vars! They
+ * also need to be selected..
*/
-List *
-pgxc_order_qual_clauses(PlannerInfo *root, List *clauses)
+static void
+findReferencedVars(List *parent_vars, RemoteQuery *plan, List **out_tlist, Relids *out_relids)
{
-	return order_qual_clauses(root, clauses);
+	List	   *vars;
+	Relids		relids = NULL;
+	List	   *tlist = NIL;
+	ListCell   *l;
+
+	/* Pull vars from both the targetlist and the clauses attached to this plan */
+	vars = pull_var_clause((Node *)plan->base_tlist,
+						   PVC_RECURSE_AGGREGATES,
+						   PVC_REJECT_PLACEHOLDERS);
+
+	foreach(l, vars)
+	{
+		Var *var = lfirst(l);
+
+		/*
+		 * Keep only Vars that also appear in parent_vars.  NOTE(review):
+		 * unlike the quals loop below there is no duplicate check here, so
+		 * repeated Vars returned by pull_var_clause are appended again —
+		 * presumably harmless to the callers, but worth confirming.
+		 */
+		if (search_tlist_for_var(var, parent_vars))
+			tlist = lappend(tlist, var);
+
+		/* Record every referenced range-table index, kept in tlist or not */
+		if (!bms_is_member(var->varno, relids))
+			relids = bms_add_member(relids, var->varno);
+	}
+
+	/* Now consider the local quals */
+	vars = pull_var_clause((Node *)plan->scan.plan.qual,
+						   PVC_RECURSE_AGGREGATES,
+						   PVC_REJECT_PLACEHOLDERS);
+
+	foreach(l, vars)
+	{
+		Var *var = lfirst(l);
+
+		/* Vars used by quals must be selected too, but avoid duplicates */
+		if (search_tlist_for_var(var, tlist) == NULL)
+			tlist = lappend(tlist, var);
+
+		if (!bms_is_member(var->varno, relids))
+			relids = bms_add_member(relids, var->varno);
+	}
+
+	/* Hand back the intersection tlist and the set of referenced relids */
+	*out_tlist = tlist;
+	*out_relids = relids;
}
-List *
-pgxc_build_relation_tlist(RelOptInfo *rel)
+/*
+ * create_remoteinsert_plan()
+ *
+ * For every target relation, add a remote query node to carry out remote
+ * operations.
+ */
+Plan *
+create_remoteinsert_plan(PlannerInfo *root, Plan *topplan)
{
-	return build_relation_tlist(rel);
+	ModifyTable *mt = (ModifyTable *)topplan;
+	ListCell	*l;
+
+	/* We expect to work only on ModifyTable node */
+	if (!IsA(topplan, ModifyTable))
+		elog(ERROR, "Unexpected node type: %d", topplan->type);
+
+	/*
+	 * For every result relation, build a remote plan to execute remote insert.
+	 */
+	foreach(l, mt->resultRelations)
+	{
+		Index			resultRelationIndex = lfirst_int(l);
+		RangeTblEntry	*ttab;
+		RelationLocInfo	*rel_loc_info;
+		StringInfo		buf, buf2;
+		RemoteQuery		*fstep;
+		Oid				nspid;
+		char			*nspname;
+		int				natts, att;
+		Oid 			*att_types;
+		char			*relname;
+		bool			first_att_printed = false;
+
+		ttab = rt_fetch(resultRelationIndex, root->parse->rtable);
+
+		/* Bad relation ? Skip anything that is not a plain relation RTE */
+		if (ttab == NULL || ttab->rtekind != RTE_RELATION)
+			continue;
+
+		/* Get location info of the target table; skip non-distributed relations */
+		rel_loc_info = GetRelationLocInfo(ttab->relid);
+		if (rel_loc_info == NULL)
+			continue;
+
+		/* For main string */
+		buf = makeStringInfo();
+		/* For values */
+		buf2 = makeStringInfo();
+
+		/* Compose INSERT FROM target_table */
+		nspid = get_rel_namespace(ttab->relid);
+		nspname = get_namespace_name(nspid);
+		relname = get_rel_name(ttab->relid);
+
+		/*
+		 * Do not qualify with namespace for TEMP tables. The schema name may
+		 * vary on each node
+		 */
+		if (IsTempTable(ttab->relid))
+			appendStringInfo(buf, "INSERT INTO %s (",
+							 quote_identifier(relname));
+		else
+			appendStringInfo(buf, "INSERT INTO %s.%s (", quote_identifier(nspname),
+							 quote_identifier(relname));
+
+		fstep = make_remotequery(NIL, NIL, resultRelationIndex);
+		fstep->is_temp = IsTempTable(ttab->relid);
+
+		natts = get_relnatts(ttab->relid);
+		att_types = (Oid *) palloc0 (sizeof (Oid) * natts);
+
+		/*
+		 * Populate the column information.  $n placeholder numbers are the
+		 * attribute numbers, so dropped columns keep their slot in att_types
+		 * even though they are omitted from the statement text.
+		 */
+		for (att = 1; att <= natts; att++)
+		{
+			HeapTuple tp;
+
+			tp = SearchSysCache(ATTNUM,
+								ObjectIdGetDatum(ttab->relid),
+								Int16GetDatum(att),
+								0, 0);
+			if (HeapTupleIsValid(tp))
+			{
+				Form_pg_attribute att_tup = (Form_pg_attribute) GETSTRUCT(tp);
+
+				/* Bypass dropped attributes in query */
+				if (att_tup->attisdropped)
+				{
+					/* Dropped attributes are casted as int4 in prepared parameters */
+					att_types[att - 1] = INT4OID;
+				}
+				else
+				{
+					/* Add comma before all except first attributes */
+					if (first_att_printed)
+						appendStringInfoString(buf, ", ");
+
+					/* Build the value part, parameters are filled at run time */
+					if (first_att_printed)
+						appendStringInfoString(buf2, ", ");
+
+					first_att_printed = true;
+
+					/* Append column name */
+					appendStringInfoString(buf, quote_identifier(NameStr(att_tup->attname)));
+
+					/* Append value in string */
+					appendStringInfo(buf2, "$%d", att);
+
+					/* Assign parameter type */
+					att_types[att - 1] = att_tup->atttypid;
+				}
+
+				ReleaseSysCache(tp);
+			}
+			else
+				elog(ERROR, "cache lookup failed for attribute %d of relation %u",
+					 att, ttab->relid);
+		}
+
+		/* Gather the two strings */
+		appendStringInfo(buf, ") VALUES (%s)", buf2->data);
+
+		fstep->sql_statement = pstrdup(buf->data);
+
+		/* Processed rows are counted by the main planner */
+		fstep->combine_type = COMBINE_TYPE_NONE;
+
+		fstep->read_only = false;
+		fstep->exec_nodes = makeNode(ExecNodes);
+		fstep->exec_nodes->baselocatortype = rel_loc_info->locatorType;
+		fstep->exec_nodes->primarynodelist = NULL;
+		fstep->exec_nodes->nodeList = rel_loc_info->nodeList;
+		fstep->exec_nodes->en_relid = ttab->relid;
+		fstep->exec_nodes->accesstype = RELATION_ACCESS_INSERT;
+		/* en_expr lets the executor pick the target node from the tuple value */
+		fstep->exec_nodes->en_expr = pgxc_set_en_expr(ttab->relid, resultRelationIndex);
+
+		SetRemoteStatementName((Plan *) fstep, NULL, natts, att_types, 0);
+
+		/* Free everything (att_types is retained by SetRemoteStatementName) */
+		pfree(buf->data);
+		pfree(buf);
+		pfree(buf2->data);
+		pfree(buf2);
+
+		mt->remote_plans = lappend(mt->remote_plans, fstep);
+	}
+
+	return topplan;
}
-void
-pgxc_copy_path_costsize(Plan *dest, Path *src)
+
+/*
+ * create_remoteupdate_plan()
+ *
+ * For every target relation, add a remote query node to carry out remote
+ * operations.
+ * WHERE and SET clauses are populated with the relation attributes.
+ * Target list is used for SET clause and completed with the expressions already given
+ * Those are the non-junk expressions in target list of parser tree.
+ * WHERE clause is completed by the other expressions in target tree that have been
+ * marked as junk during target list rewriting to be able to identify consistently
+ * tuples on remote Coordinators. This target list is based on the information obtained
+ * from the inner plan that should be generated by create_remotequery_plan.
+ */
+Plan *
+create_remoteupdate_plan(PlannerInfo *root, Plan *topplan)
{
-	copy_path_costsize(dest, src);
+	ModifyTable *mt = (ModifyTable *)topplan;
+	ListCell	*l;
+
+	/* We expect to work only on ModifyTable node */
+	if (!IsA(topplan, ModifyTable))
+		elog(ERROR, "Unexpected node type: %d", topplan->type);
+
+	/*
+	 * For every result relation, build a remote plan to execute remote update.
+	 */
+	foreach(l, mt->resultRelations)
+	{
+		Index			resultRelationIndex = lfirst_int(l);
+		Query			*parse = root->parse;
+		RangeTblEntry	*ttab;
+		RelationLocInfo	*rel_loc_info;
+		StringInfo		buf, buf2;
+		Oid				nspid;		/* Relation namespace Oid */
+		char			*nspname;	/* Relation namespace name */
+		Oid				*param_types;	/* Types of query parameters */
+		bool			is_set_printed = false;	/* Control of SET generation */
+		bool			is_where_printed = false;	/* Control of WHERE generation */
+		RemoteQuery		*fstep;		/* Plan step generated */
+		ListCell		*elt;
+		int				count = 0, where_count = 1;
+		int				natts, count_prepparams, tot_prepparams;
+		char			*relname;
+
+		ttab = rt_fetch(resultRelationIndex, parse->rtable);
+
+		/* Bad relation ? */
+		if (ttab == NULL || ttab->rtekind != RTE_RELATION)
+			continue;
+
+		relname = get_rel_name(ttab->relid);
+
+		/* Get location info of the target table */
+		rel_loc_info = GetRelationLocInfo(ttab->relid);
+		if (rel_loc_info == NULL)
+			continue;
+
+		/* Create query buffers */
+		buf = makeStringInfo();		/* For SET clause */
+		buf2 = makeStringInfo();	/* For WHERE clause */
+
+		/* Compose UPDATE target_table */
+		natts = get_relnatts(ttab->relid);
+		nspid = get_rel_namespace(ttab->relid);
+		nspname = get_namespace_name(nspid);
+
+		/*
+		 * Do not qualify with namespace for TEMP tables. The schema name may
+		 * vary on each node
+		 */
+		if (IsTempTable(ttab->relid))
+			appendStringInfo(buf, "UPDATE ONLY %s SET ",
+							 quote_identifier(relname));
+		else
+			appendStringInfo(buf, "UPDATE ONLY %s.%s SET ", quote_identifier(nspname),
+							 quote_identifier(relname));
+
+		/*
+		 * Count the number of junk entries before setting the parameter type list.
+		 * This helps to know how many parameters part of the WHERE clause need to
+		 * be sent down by extended query protocol.
+		 */
+		foreach(elt, parse->targetList)
+		{
+			TargetEntry *tle = lfirst(elt);
+			if (tle->resjunk)
+				count++;
+		}
+		/* Parameters $1..$natts feed SET; junk entries follow for WHERE */
+		count_prepparams = natts + count;
+		/* Count entries related to Rowmarks */
+		tot_prepparams = count_prepparams + pgxc_count_rowmarks_entries(root->rowMarks);
+
+		/* Then allocate the array for this purpose */
+		param_types = (Oid *) palloc0(sizeof (Oid) * tot_prepparams);
+
+		/*
+		 * Now build the query based on the target list. SET clause is completed
+		 * by non-junk entries and WHERE clause by junk entries used to identify
+		 * uniquely tuples on remote nodes.
+		 */
+		foreach(elt, parse->targetList)
+		{
+			TargetEntry *tle = lfirst(elt);
+
+			if (!tle->resjunk)
+			{
+				int attno = 0;
+				int i;
+
+				/* Add target list element to SET clause */
+
+				/* Add comma before all except first attributes */
+				if (!is_set_printed)
+					is_set_printed = true;
+				else
+					appendStringInfoString(buf, ", ");
+
+				/* We need first to find the position of this element in attribute list */
+				for (i = 0; i < natts; i++)
+				{
+					if (strcmp(tle->resname,
+							   get_relid_attribute_name(ttab->relid, i + 1)) == 0)
+					{
+						attno = i + 1;
+						break;
+					}
+				}
+
+				/*
+				 * Complete string.  NOTE(review): unlike the DELETE path,
+				 * resname is not passed through quote_identifier() here —
+				 * verify behavior for mixed-case or reserved-word columns.
+				 */
+				appendStringInfo(buf, "%s = $%d",
+								 tle->resname,
+								 attno);
+
+				/* Set parameter type correctly */
+				param_types[attno - 1] = exprType((Node *) tle->expr);
+			}
+			else
+			{
+				/* Set parameter type; junk params occupy slots after the natts SET slots */
+				param_types[natts + where_count - 1] = exprType((Node *) tle->expr);
+				where_count++;
+
+				/*
+				 * ctid and xc_node_id are sufficient to identify
+				 * remote tuple.
+				 */
+				if (strcmp(tle->resname, "xc_node_id") != 0 &&
+					strcmp(tle->resname, "ctid") != 0)
+					continue;
+
+				/* Set the clause if necessary */
+				if (!is_where_printed)
+				{
+					is_where_printed = true;
+					appendStringInfoString(buf2, " WHERE ");
+				}
+				else
+					appendStringInfoString(buf2, "AND ");
+
+				/* Complete string; where_count was already advanced, so this
+				 * prints the 1-based parameter number of the slot just filled */
+				appendStringInfo(buf2, "%s = $%d ",
+								 tle->resname,
+								 natts + where_count - 1);
+			}
+		}
+
+		/*
+		 * Before finalizing query be sure that there are no missing entries for attributes.
+		 * If there are complete the last holes. Those ones are mandatory to ensure that
+		 * update is executed consistently.
+		 */
+		for (count = 1; count <= natts; count++)
+		{
+			if (param_types[count - 1] == 0)
+			{
+				HeapTuple tp;
+
+				tp = SearchSysCache(ATTNUM,
+									ObjectIdGetDatum(ttab->relid),
+									Int16GetDatum(count),
+									0, 0);
+
+				if (HeapTupleIsValid(tp))
+				{
+					Form_pg_attribute att_saved = (Form_pg_attribute) GETSTRUCT(tp);
+
+					/*
+					 * Set parameter type of attribute
+					 * Dropped columns are casted as int4
+					 */
+					if (att_saved->attisdropped)
+						param_types[count - 1] = INT4OID;
+					else
+						param_types[count - 1] = att_saved->atttypid;
+					ReleaseSysCache(tp);
+				}
+				else
+					elog(ERROR, "cache lookup failed for attribute %d of relation %u",
+						 count, ttab->relid);
+			}
+		}
+
+		/*
+		 * The query needs to be completed by nullifying the non-parent entries
+		 * defined in RowMarks. This is essential for UPDATE queries running with child
+		 * entries as we need to bypass them correctly at executor level.
+		 */
+		param_types = pgxc_build_rowmark_entries(root->rowMarks, parse->rtable, param_types,
+												 count_prepparams, tot_prepparams);
+
+		/* Finish building the query by gathering SET and WHERE clauses */
+		appendStringInfo(buf, "%s", buf2->data);
+
+		/* Finally build the final UPDATE step */
+		fstep = make_remotequery(parse->targetList, NIL, resultRelationIndex);
+		fstep->is_temp = IsTempTable(ttab->relid);
+		fstep->sql_statement = pstrdup(buf->data);
+		fstep->combine_type = COMBINE_TYPE_NONE;
+
+		fstep->read_only = false;
+		/*
+		 * Get the nodes to execute the query on. We will execute this query on
+		 * all nodes. The WHERE condition will take care of updating the columns
+		 * accordingly.
+		 */
+		fstep->exec_nodes = GetRelationNodes(rel_loc_info, 0, true, UNKNOWNOID, RELATION_ACCESS_UPDATE);
+		fstep->exec_nodes->baselocatortype = rel_loc_info->locatorType;
+		fstep->exec_nodes->en_relid = ttab->relid;
+		fstep->exec_nodes->nodeList = rel_loc_info->nodeList;
+		fstep->exec_nodes->accesstype = RELATION_ACCESS_UPDATE;
+		fstep->exec_nodes->en_expr = pgxc_set_en_expr(ttab->relid, resultRelationIndex);
+		SetRemoteStatementName((Plan *) fstep, NULL, tot_prepparams, param_types, 0);
+		pfree(buf->data);
+		pfree(buf2->data);
+		pfree(buf);
+		pfree(buf2);
+
+		mt->remote_plans = lappend(mt->remote_plans, fstep);
+
+	}
+
+	return topplan;
+}
+
+/*
+ * create_remotedelete_plan()
+ *
+ * For every target relation, add a remote query node to carry out remote
+ * operations. The tuple to be deleted is selected depending on the target
+ * list of given plan, generating parametrized WHERE clause in consequence.
+ */
+Plan *
+create_remotedelete_plan(PlannerInfo *root, Plan *topplan)
+{
+	ModifyTable *mt = (ModifyTable *)topplan;
+	ListCell	*l;
+
+	/* We expect to work only on ModifyTable node */
+	if (!IsA(topplan, ModifyTable))
+		elog(ERROR, "Unexpected node type: %d", topplan->type);
+
+	/*
+	 * For every result relation, build a remote plan to execute remote delete.
+	 */
+	foreach(l, mt->resultRelations)
+	{
+		Index			resultRelationIndex = lfirst_int(l);
+		Query			*parse = root->parse;
+		RangeTblEntry	*ttab;
+		RelationLocInfo	*rel_loc_info;
+		StringInfo		buf;
+		Oid				nspid;		/* Relation namespace Oid */
+		char			*nspname;	/* Relation namespace name */
+		int				count_prepparams, tot_prepparams;	/* Attribute used is CTID */
+		Oid				*param_types;	/* Types of query parameters */
+		RemoteQuery		*fstep;		/* Plan step generated */
+		bool			is_where_created = false;
+		ListCell		*elt;
+		int				count = 1;
+		char			*relname;
+
+		ttab = rt_fetch(resultRelationIndex, parse->rtable);
+
+		/* Bad relation ? */
+		if (ttab == NULL || ttab->rtekind != RTE_RELATION)
+			continue;
+
+		/* Get location info of the target table */
+		rel_loc_info = GetRelationLocInfo(ttab->relid);
+		if (rel_loc_info == NULL)
+			continue;
+
+		/* Create query buffers */
+		buf = makeStringInfo();
+
+		/* Compose DELETE target_table */
+		nspid = get_rel_namespace(ttab->relid);
+		nspname = get_namespace_name(nspid);
+		relname = get_rel_name(ttab->relid);
+
+		/* Parameters are defined by target list: one per (junk) entry */
+		count_prepparams = list_length(parse->targetList);
+
+		/* Count entries related to Rowmarks only if there are child relations here */
+		if (list_length(mt->resultRelations) != 1)
+			tot_prepparams = count_prepparams + pgxc_count_rowmarks_entries(root->rowMarks);
+		else
+			tot_prepparams = count_prepparams;
+
+		param_types = (Oid *) palloc0(sizeof(Oid) * tot_prepparams);
+
+		/*
+		 * Do not qualify with namespace for TEMP tables. The schema name may
+		 * vary on each node.
+		 */
+		if (IsTempTable(ttab->relid))
+			appendStringInfo(buf, "DELETE FROM ONLY %s ",
+							 quote_identifier(relname));
+		else
+			appendStringInfo(buf, "DELETE FROM ONLY %s.%s ", quote_identifier(nspname),
+							 quote_identifier(relname));
+
+		/* Generate WHERE clause for each target list item */
+		foreach(elt, parse->targetList)
+		{
+			TargetEntry *tle = lfirst(elt);
+
+			/* Set up the parameter type; count stays one ahead of the slot just filled */
+			param_types[count - 1] = exprType((Node *) tle->expr);
+			count++;
+
+			/*
+			 * In WHERE clause, ctid and xc_node_id are
+			 * sufficient to fetch a tuple from remote node.
+			 */
+			if (strcmp(tle->resname, "xc_node_id") != 0 &&
+				strcmp(tle->resname, "ctid") != 0)
+				continue;
+
+			/* Set the clause if necessary */
+			if (!is_where_created)
+			{
+				is_where_created = true;
+				appendStringInfoString(buf, "WHERE ");
+			}
+			else
+				appendStringInfoString(buf, "AND ");
+
+			/* count was already advanced, so count - 1 is the 1-based $n number */
+			appendStringInfo(buf, "%s = $%d ",
+							 quote_identifier(tle->resname),
+							 count - 1);
+		}
+
+		/*
+		 * The query needs to be completed by nullifying the non-parent entries
+		 * defined in RowMarks. This is essential for UPDATE queries running with child
+		 * entries as we need to bypass them correctly at executor level.
+		 */
+		param_types = pgxc_build_rowmark_entries(root->rowMarks, parse->rtable, param_types,
+												 count_prepparams, tot_prepparams);
+
+		/* Finish by building the plan step */
+		fstep = make_remotequery(parse->targetList, NIL, resultRelationIndex);
+		fstep->is_temp = IsTempTable(ttab->relid);
+		fstep->sql_statement = pstrdup(buf->data);
+		fstep->combine_type = COMBINE_TYPE_NONE;
+
+		fstep->read_only = false;
+		/*
+		 * Get the nodes to execute the query on. We will execute this query on
+		 * all nodes. The WHERE condition will take care of updating the columns
+		 * accordingly.
+		 */
+		fstep->exec_nodes = GetRelationNodes(rel_loc_info, 0, true, UNKNOWNOID,
+											 RELATION_ACCESS_UPDATE);
+		fstep->exec_nodes->baselocatortype = rel_loc_info->locatorType;
+		fstep->exec_nodes->en_relid = ttab->relid;
+		fstep->exec_nodes->nodeList = rel_loc_info->nodeList;
+		fstep->exec_nodes->accesstype = RELATION_ACCESS_UPDATE;
+		SetRemoteStatementName((Plan *) fstep, NULL, tot_prepparams, param_types, 0);
+		pfree(buf->data);
+		pfree(buf);
+
+		mt->remote_plans = lappend(mt->remote_plans, fstep);
+	}
+
+	return topplan;
}
+
+/*
+ * create_remotegrouping_plan
+ * Check if the grouping and aggregates can be pushed down to the
+ * Datanodes.
+ * Right now we can push with following restrictions
+ * 1. there are plain aggregates (no expressions involving aggregates) and/or
+ * expressions in group by clauses
+ * 2. No distinct or order by clauses
+ * 3. No windowing clause
+ * 4. No having clause
+ *
+ * Inputs
+ * root - planerInfo root for this query
+ * agg_plan - local grouping plan produced by grouping_planner()
+ *
+ * PGXCTODO: work on reducing these restrictions as much or document the reasons
+ * why we need the restrictions, in these comments themselves. In case of
+ * replicated tables, we should be able to push the whole query to the data
+ * node in case there are no local clauses.
+ */
Plan *
-pgxc_create_gating_plan(PlannerInfo *root, Plan *plan, List *quals)
+create_remotegrouping_plan(PlannerInfo *root, Plan *local_plan)
+{
+ Query *query = root->parse;
+ Sort *sort_plan;
+ RemoteQuery *remote_scan; /* remote query in the passed in plan */
+ RemoteQuery *remote_group; /* remote query after optimization */
+ Plan *remote_group_plan; /* plan portion of remote_group */
+ Plan *temp_plan;
+ List *temp_vars; /* temporarily hold the VARs */
+ List *temp_vartlist; /* temporarily hold tlist of VARs */
+ ListCell *temp;
+ StringInfo remote_targetlist;/* SELECT clause of remote query */
+ StringInfo remote_sql_stmt;
+ StringInfo groupby_clause; /* remote query GROUP BY */
+ StringInfo orderby_clause; /* remote query ORDER BY */
+ StringInfo remote_fromlist; /* remote query FROM */
+ StringInfo in_alias;
+ StringInfo having_clause; /* remote query HAVING clause */
+ Relids in_relids; /* the list of Relids referenced by lefttree */
+ Index dummy_rtindex;
+ List *base_tlist;
+ RangeTblEntry *dummy_rte;
+ int numGroupCols;
+ AttrNumber *grpColIdx;
+ bool reduce_plan;
+ List *remote_qual;
+ List *local_qual;
+
+ /* Remote grouping is not enabled, don't do anything */
+ if (!enable_remotegroup)
+ return local_plan;
+ /*
+ * We don't push aggregation and grouping to Datanodes, in case there are
+ * windowing aggregates, distinct, having clause or sort clauses.
+ */
+ if (query->hasWindowFuncs ||
+ query->distinctClause ||
+ query->sortClause)
+ return local_plan;
+
+ /* for now only Agg/Group plans */
+ if (local_plan && IsA(local_plan, Agg))
+ {
+ numGroupCols = ((Agg *)local_plan)->numCols;
+ grpColIdx = ((Agg *)local_plan)->grpColIdx;
+ }
+ else if (local_plan && IsA(local_plan, Group))
+ {
+ numGroupCols = ((Group *)local_plan)->numCols;
+ grpColIdx = ((Group *)local_plan)->grpColIdx;
+ }
+ else
+ return local_plan;
+
+ /*
+ * We expect plan tree as Group/Agg->Sort->Result->Material->RemoteQuery,
+ * Result, Material nodes are optional. Sort is compulsory for Group but not
+ * for Agg.
+ * anything else is not handled right now.
+ */
+ temp_plan = local_plan->lefttree;
+ remote_scan = NULL;
+ sort_plan = NULL;
+ if (temp_plan && IsA(temp_plan, Sort))
+ {
+ sort_plan = (Sort *)temp_plan;
+ temp_plan = temp_plan->lefttree;
+ }
+ if (temp_plan && IsA(temp_plan, Result))
+ temp_plan = temp_plan->lefttree;
+ if (temp_plan && IsA(temp_plan, Material))
+ temp_plan = temp_plan->lefttree;
+ if (temp_plan && IsA(temp_plan, RemoteQuery))
+ remote_scan = (RemoteQuery *)temp_plan;
+
+ if (!remote_scan)
+ return local_plan;
+ /*
+ * for Group plan we expect Sort under the Group, which is always the case,
+ * the condition below is really for some possibly non-existent case.
+ */
+ if (IsA(local_plan, Group) && !sort_plan)
+ return local_plan;
+ /*
+ * If the remote_scan has any quals on it, those need to be executed before
+ * doing anything. Hence we won't be able to push any aggregates or grouping
+ * to the Datanode.
+ * If it has any SimpleSort in it, then sorting is intended to be applied
+ * before doing anything. Hence can not push any aggregates or grouping to
+ * the Datanode.
+ */
+ if (remote_scan->scan.plan.qual || remote_scan->sort)
+ return local_plan;
+
+ /*
+ * Grouping_planner may add Sort node to sort the rows
+ * based on the columns in GROUP BY clause. Hence the columns in Sort and
+ * those in Group node in should be same. The columns are usually in the
+ * same order in both nodes, hence check the equality in order. If this
+ * condition fails, we can not handle this plan for now.
+ */
+ if (sort_plan)
+ {
+ int cntCols;
+ if (sort_plan->numCols != numGroupCols)
+ return local_plan;
+ for (cntCols = 0; cntCols < numGroupCols; cntCols++)
+ {
+ if (sort_plan->sortColIdx[cntCols] != grpColIdx[cntCols])
+ return local_plan;
+ }
+ }
+
+ /*
+ * At last we find the plan underneath is reducible into a single
+ * RemoteQuery node.
+ */
+
+ /* find all the relations referenced by targetlist of Grouping node */
+ temp_vars = pull_var_clause((Node *)local_plan->targetlist,
+ PVC_RECURSE_AGGREGATES,
+ PVC_REJECT_PLACEHOLDERS);
+ findReferencedVars(temp_vars, remote_scan, &temp_vartlist, &in_relids);
+
+ /*
+ * process the targetlist of the grouping plan, also construct the
+ * targetlist of the query to be shipped to the remote side
+ */
+ base_tlist = pgxc_process_grouping_targetlist(root, &(local_plan->targetlist));
+ /*
+ * If can not construct a targetlist shippable to the Datanode. Resort to
+ * the plan created by grouping_planner()
+ */
+ if (!base_tlist)
+ return local_plan;
+
+ base_tlist = pgxc_process_having_clause(root, base_tlist, query->havingQual,
+ &local_qual, &remote_qual, &reduce_plan);
+ /*
+ * Because of HAVING clause, we can not push the aggregates and GROUP BY
+ * clause to the Datanode. Resort to the plan created by grouping planner.
+ */
+ if (!reduce_plan)
+ return local_plan;
+ Assert(base_tlist);
+
+ /*
+ * We are now ready to create the RemoteQuery node to push the query to
+ * Datanode.
+ * 1. Create a remote query node reflecting the query to be pushed to the
+ * Datanode.
+ * 2. Modify the Grouping node passed in, to accept the results sent by the
+ * Datanodes, then group and aggregate them, if needed.
+ */
+ remote_targetlist = makeStringInfo();
+ remote_sql_stmt = makeStringInfo();
+ groupby_clause = makeStringInfo();
+ orderby_clause = makeStringInfo();
+ remote_fromlist = makeStringInfo();
+ in_alias = makeStringInfo();
+ having_clause = makeStringInfo();
+
+ appendStringInfo(in_alias, "%s_%d", "group", root->rs_alias_index);
+
+ /*
+ * Build partial RemoteQuery node to be used for creating the Select clause
+ * to be sent to the remote node. Rest of the node will be built later
+ */
+ remote_group = makeNode(RemoteQuery);
+
+ /*
+ * Save information about the plan we are reducing.
+ * We may need this information later if more entries are added to it
+ * as part of the remote expression optimization.
+ */
+ remote_group->inner_alias = pstrdup(in_alias->data);
+ remote_group->inner_reduce_level = remote_scan->reduce_level;
+ remote_group->inner_relids = in_relids;
+ remote_group->inner_statement = pstrdup(remote_scan->sql_statement);
+ remote_group->exec_nodes = remote_scan->exec_nodes;
+ /* Don't forget to increment the index for the next time around! */
+ remote_group->reduce_level = root->rs_alias_index++;
+ /* Remember if the remote query is accessing a temporary object */
+ remote_group->is_temp = remote_scan->is_temp;
+
+ /* Generate the select clause of the remote query */
+ appendStringInfoString(remote_targetlist, "SELECT");
+ foreach (temp, base_tlist)
+ {
+ TargetEntry *tle = lfirst(temp);
+ Node *expr = (Node *)tle->expr;
+
+ create_remote_expr(root, local_plan, remote_targetlist, expr, remote_group);
+
+ /* If this is not last target entry, add a comma */
+ if (lnext(temp))
+ appendStringInfoString(remote_targetlist, ",");
+ }
+
+ /* Generate the from clause of the remote query */
+ appendStringInfo(remote_fromlist, " FROM (%s) %s",
+ remote_group->inner_statement, remote_group->inner_alias);
+
+ /*
+ * Generate group by clause for the remote query and recompute the group by
+ * column locations. We want the tuples from remote node to be ordered by
+ * the grouping columns so that ExecGroup can work without any modification,
+ * hence create a SimpleSort structure to be added to RemoteQuery (which
+ * will merge the sorted results and present to Group node in sorted
+ * manner).
+ */
+ if (query->groupClause)
+ {
+ int cntCols;
+ char *sep;
+
+ /*
+ * recompute the column ids of the grouping columns,
+ * the group column indexes computed earlier point in the
+ * targetlists of the scan plans under this node. But now the grouping
+ * column indexes will be pointing in the targetlist of the new
+ * RemoteQuery, hence those need to be recomputed
+ */
+ pgxc_locate_grouping_columns(root, base_tlist, grpColIdx);
+
+ appendStringInfoString(groupby_clause, "GROUP BY ");
+ sep = "";
+ for (cntCols = 0; cntCols < numGroupCols; cntCols++)
+ {
+ appendStringInfo(groupby_clause, "%s%d", sep, grpColIdx[cntCols]);
+ sep = ", ";
+ }
+ if (sort_plan)
+ {
+ SimpleSort *remote_sort = makeNode(SimpleSort);
+ /*
+ * reuse the arrays allocated in sort_plan to create SimpleSort
+ * structure. sort_plan is useless henceforth.
+ */
+ remote_sort->numCols = sort_plan->numCols;
+ remote_sort->sortColIdx = sort_plan->sortColIdx;
+ remote_sort->sortOperators = sort_plan->sortOperators;
+ remote_sort->sortCollations = sort_plan->collations;
+ remote_sort->nullsFirst = sort_plan->nullsFirst;
+ appendStringInfoString(orderby_clause, "ORDER BY ");
+ sep = "";
+ for (cntCols = 0; cntCols < remote_sort->numCols; cntCols++)
+ {
+ remote_sort->sortColIdx[cntCols] = grpColIdx[cntCols];
+ appendStringInfo(orderby_clause, "%s%d", sep,
+ remote_sort->sortColIdx[cntCols]);
+ sep = ", ";
+ }
+ remote_group->sort = remote_sort;
+ }
+ }
+
+ if (remote_qual)
+ {
+ appendStringInfoString(having_clause, "HAVING ");
+ create_remote_clause_expr(root, local_plan, having_clause, remote_qual,
+ remote_group);
+ }
+
+ /* Generate the remote sql statement from the pieces */
+ appendStringInfo(remote_sql_stmt, "%s %s %s %s %s", remote_targetlist->data,
+ remote_fromlist->data, groupby_clause->data,
+ orderby_clause->data, having_clause->data);
+ /*
+ * Create a dummy RTE for the remote query being created. Append the dummy
+ * range table entry to the range table. Note that this modifies the master
+ * copy the caller passed us, otherwise e.g EXPLAIN VERBOSE will fail to
+ * find the rte the Vars built below refer to. Also create the tuple
+ * descriptor for the result of this query from the base_tlist (targetlist
+ * we used to generate the remote node query).
+ */
+ dummy_rte = make_dummy_remote_rte("__REMOTE_GROUP_QUERY__",
+ makeAlias("__REMOTE_GROUP_QUERY__", NIL));
+ /* Rest will be zeroed out in makeNode() */
+ root->parse->rtable = lappend(root->parse->rtable, dummy_rte);
+ dummy_rtindex = list_length(root->parse->rtable);
+
+ /* Build rest of the RemoteQuery node and the plan there */
+ remote_group_plan = &remote_group->scan.plan;
+
+ /* The join targetlist becomes this node's tlist */
+ remote_group_plan->targetlist = base_tlist;
+ remote_group_plan->lefttree = NULL;
+ remote_group_plan->righttree = NULL;
+ remote_group->scan.scanrelid = dummy_rtindex;
+ remote_group->sql_statement = remote_sql_stmt->data;
+
+ /* set_plan_refs needs this later */
+ remote_group->read_only = (query->commandType == CMD_SELECT && !query->hasForUpdate);
+ remote_group->has_row_marks = query->hasForUpdate;
+ remote_group->base_tlist = base_tlist;
+
+ /* we actually need not worry about costs since this is the final plan */
+ remote_group_plan->startup_cost = remote_scan->scan.plan.startup_cost;
+ remote_group_plan->total_cost = remote_scan->scan.plan.total_cost;
+ remote_group_plan->plan_rows = remote_scan->scan.plan.plan_rows;
+ remote_group_plan->plan_width = remote_scan->scan.plan.plan_width;
+
+ /*
+ * Modify the passed in grouping plan according to the remote query we built
+ * Materialization is always needed for RemoteQuery in case we need to restart
+ * the scan.
+ */
+ local_plan->lefttree = remote_group_plan;
+ local_plan->qual = local_qual;
+ /* indicate that we should apply collection function directly */
+ if (IsA(local_plan, Agg))
+ ((Agg *)local_plan)->skip_trans = true;
+
+ return local_plan;
+}
+
+/*
+ * pgxc_locate_grouping_columns
+ * Locates the grouping clauses in the given target list. This is very similar
+ * to locate_grouping_columns except that there is only one target list to
+ * search into.
+ * The caller-supplied groupColIdx array (sized for the entries of
+ * root->parse->groupClause) is filled in order with the resno of the target
+ * entry matching each GROUP BY clause in tlist.
+ * PGXCTODO: Can we reuse locate_grouping_columns() instead of writing this
+ * function? But this function is optimized to search in the same target list.
+ */
+static void
+pgxc_locate_grouping_columns(PlannerInfo *root, List *tlist,
+							AttrNumber *groupColIdx)
+{
+	int keyno = 0;
+	ListCell *gl;
+
+	/*
+	 * No work unless grouping.
+	 */
+	if (!root->parse->groupClause)
+	{
+		Assert(groupColIdx == NULL);
+		return;
+	}
+	Assert(groupColIdx != NULL);
+
+	/*
+	 * Walk the GROUP BY clauses in order; each must have a matching target
+	 * entry in tlist, whose resno becomes the grouping column index.
+	 */
+	foreach(gl, root->parse->groupClause)
+	{
+		SortGroupClause *grpcl = (SortGroupClause *) lfirst(gl);
+		TargetEntry *te = get_sortgroupclause_tle(grpcl, tlist);
+		if (!te)
+			elog(ERROR, "failed to locate grouping columns");
+		groupColIdx[keyno++] = te->resno;
+	}
+}
+
+/*
+ * pgxc_add_node_to_grouping_tlist
+ * Add the given node to the target list to be sent to the Datanode. If it's
+ * an Aggref node, also change the passed in node to point to the Aggref node
+ * in the Datanode's target list.
+ * ressortgroupref, when non-zero, is the GROUP BY/ORDER BY reference to be
+ * recorded on the (possibly pre-existing) remote target entry.
+ * Returns the possibly-extended remote target list.
+ */
+static List *
+pgxc_add_node_to_grouping_tlist(List *remote_tlist, Node *expr, Index ressortgroupref)
 {
-	return create_gating_plan(root, plan, quals);
+	TargetEntry *remote_tle;
+	Oid saved_aggtype = InvalidOid;
+
+	/*
+	 * When we add an aggregate to the remote targetlist the aggtype of such
+	 * Aggref node is changed to aggtrantype. Hence while searching a given
+	 * Aggref in remote targetlist, we need to change the aggtype accordingly
+	 * and then switch it back.
+	 */
+	if (IsA(expr, Aggref))
+	{
+		Aggref *aggref = (Aggref *)expr;
+		saved_aggtype = aggref->aggtype;
+		aggref->aggtype = aggref->aggtrantype;
+	}
+	remote_tle = tlist_member(expr, remote_tlist);
+	/* Restore the caller's node before doing anything else */
+	if (IsA(expr, Aggref))
+		((Aggref *)expr)->aggtype = saved_aggtype;
+
+	if (!remote_tle)
+	{
+		/* Not found: append a copy of the expression to the remote tlist */
+		remote_tle = makeTargetEntry(copyObject(expr),
+								  list_length(remote_tlist) + 1,
+								  NULL,
+								  false);
+		/* Copy GROUP BY/SORT BY reference for the locating group by columns */
+		remote_tle->ressortgroupref = ressortgroupref;
+		remote_tlist = lappend(remote_tlist, remote_tle);
+	}
+	else
+	{
+		/* Entry already exists: reconcile the sort/group reference */
+		if (remote_tle->ressortgroupref == 0)
+			remote_tle->ressortgroupref = ressortgroupref;
+		else if (ressortgroupref == 0)
+		{
+			/* do nothing remote_tle->ressortgroupref has the right value */
+		}
+		else
+		{
+			/*
+			 * if the expression's TLE already has a Sorting/Grouping reference,
+			 * and caller has passed a non-zero one as well, better both of them
+			 * be same
+			 */
+			Assert(remote_tle->ressortgroupref == ressortgroupref);
+		}
+	}
+
+	/*
+	 * Replace the args of the local Aggref with Aggref node to be
+	 * included in RemoteQuery node, so that set_plan_refs can convert
+	 * the args into VAR pointing to the appropriate result in the tuple
+	 * coming from RemoteQuery node
+	 * PGXCTODO: should we push this change in targetlists of plans
+	 * above?
+	 */
+	if (IsA(expr, Aggref))
+	{
+		Aggref *local_aggref = (Aggref *)expr;
+		Aggref *remote_aggref = (Aggref *)remote_tle->expr;
+		Assert(IsA(remote_tle->expr, Aggref));
+		/* remote side reports the transition type, not the final type */
+		remote_aggref->aggtype = remote_aggref->aggtrantype;
+		/* Is copyObject() needed here? probably yes */
+		local_aggref->args = list_make1(makeTargetEntry(copyObject(remote_tle->expr),
+														1, NULL,
+														false));
+	}
+	return remote_tlist;
+}
+/*
+ * pgxc_process_grouping_targetlist
+ * The function scans the targetlist to check if we can push anything
+ * from the targetlist to the Datanode. Following rules govern the choice
+ * 1. Either all of the aggregates are pushed to the Datanode or none is pushed
+ * 2. If there are no aggregates, the targetlist is good to be shipped as is
+ * 3. If aggregates are involved in expressions, we push the aggregates to the
+ * Datanodes but not the involving expressions.
+ *
+ * The function constructs the targetlist for the query to be pushed to the
+ * Datanode. It modifies the local targetlist to point to the expressions in
+ * remote targetlist wherever necessary (e.g. aggregates)
+ *
+ * Returns the remote targetlist, or NIL when nothing shippable was found;
+ * in the NIL case *local_tlist is restored to its original state.
+ *
+ * PGXCTODO: we should be careful while pushing the function expressions, it's
+ * better to push functions like strlen() which can be evaluated at the
+ * Datanode, but we should avoid pushing functions which can only be evaluated
+ * at Coordinator.
+ */
+static List *
+pgxc_process_grouping_targetlist(PlannerInfo *root, List **local_tlist)
+{
+	bool shippable_remote_tlist = true;
+	List *remote_tlist = NIL;
+	List *orig_local_tlist = NIL;/* Copy original local_tlist, in case it changes */
+	ListCell *temp;
+
+	/*
+	 * Walk through the target list and find out whether we can push the
+	 * aggregates and grouping to Datanodes. Also while doing so, create the
+	 * targetlist for the query to be shipped to the Datanode. Adjust the local
+	 * targetlist accordingly.
+	 */
+	foreach(temp, *local_tlist)
+	{
+		TargetEntry *local_tle = lfirst(temp);
+		Node *expr = (Node *)local_tle->expr;
+		bool has_aggs;
+
+		/*
+		 * If the expression is not Aggref but involves aggregates (has Aggref
+		 * nodes in the expression tree), we can not push the entire expression
+		 * to the Datanode, but push those aggregates to the Datanode, if those
+		 * aggregates can be evaluated at the Datanodes (if is_foreign_expr
+		 * returns true for entire expression). To evaluate the rest of the
+		 * expression, we need to fetch the values of VARs participating in the
+		 * expression. But, if we include the VARs under the aggregate nodes,
+		 * they may not be part of GROUP BY clause, thus generating an invalid
+		 * query. Hence, is_foreign_expr() wouldn't collect VARs under the
+		 * expression tree rooted under Aggref node.
+		 * For example, the original query is
+		 * SELECT sum(val) * val2 FROM tab1 GROUP BY val2;
+		 * the query pushed to the Datanode is
+		 * SELECT sum(val), val2 FROM tab1 GROUP BY val2;
+		 * Notice that, if we include val in the query, it will become invalid.
+		 */
+		if (!pgxc_is_expr_shippable((Expr *)expr, &has_aggs))
+		{
+			shippable_remote_tlist = false;
+			break;
+		}
+
+		/*
+		 * We are about to change the local_tlist, check if we have already
+		 * copied original local_tlist, if not take a copy
+		 */
+		if (!orig_local_tlist && has_aggs)
+			orig_local_tlist = copyObject(*local_tlist);
+
+		/*
+		 * If there are aggregates involved in the expression, whole expression
+		 * can not be pushed to the Datanode. Pick up the aggregates and the
+		 * VAR nodes not covered by aggregates.
+		 */
+		if (has_aggs)
+		{
+			ListCell *lcell;
+			List *aggs_n_vars;
+			/*
+			 * This expression is not going to be pushed as whole, thus other
+			 * clauses won't be able to find out this TLE in the results
+			 * obtained from Datanode. Hence can't optimize this query.
+			 * PGXCTODO: with projection support in RemoteQuery node, this
+			 * condition can be worked around, please check.
+			 */
+			if (local_tle->ressortgroupref > 0)
+			{
+				shippable_remote_tlist = false;
+				break;
+			}
+
+			aggs_n_vars = pull_var_clause(expr, PVC_INCLUDE_AGGREGATES,
+										  PVC_RECURSE_PLACEHOLDERS);
+			/* copy the aggregates into the remote target list */
+			foreach (lcell, aggs_n_vars)
+			{
+				Assert(IsA(lfirst(lcell), Aggref) || IsA(lfirst(lcell), Var));
+				remote_tlist = pgxc_add_node_to_grouping_tlist(remote_tlist, lfirst(lcell),
+															   0);
+			}
+		}
+		/* Expression doesn't contain any aggregate */
+		else
+			remote_tlist = pgxc_add_node_to_grouping_tlist(remote_tlist, expr,
+														   local_tle->ressortgroupref);
+	}
+
+	if (!shippable_remote_tlist)
+	{
+		/*
+		 * If local_tlist has changed but we didn't find anything shippable to
+		 * Datanode, we need to restore the local_tlist to original state,
+		 */
+		if (orig_local_tlist)
+			*local_tlist = orig_local_tlist;
+		if (remote_tlist)
+			list_free_deep(remote_tlist);
+		remote_tlist = NIL;
+	}
+	else if (orig_local_tlist)
+	{
+		/*
+		 * If we have changed the targetlist passed, we need to pass back the
+		 * changed targetlist. Free the copy that has been created.
+		 */
+		list_free_deep(orig_local_tlist);
+	}
+
+	return remote_tlist;
+}
+
+/*
+ * pgxc_process_having_clause
+ * For every expression in the havingQual take following action
+ * 1. If it has aggregates, which can be evaluated at the Datanodes, add those
+ * aggregates to the targetlist and modify the local aggregate expressions to
+ * point to the aggregate expressions being pushed to the Datanode. Add this
+ * expression to the local qual to be evaluated locally.
+ * 2. If the expression does not have aggregates and the whole expression can be
+ * evaluated at the Datanode, add the expression to the remote qual to be
+ * evaluated at the Datanode.
+ * 3. If qual contains an expression which can not be evaluated at the data
+ * node, the parent group plan can not be reduced to a remote_query.
+ *
+ * Output parameters: *local_qual and *remote_qual receive the quals to be
+ * evaluated at the Coordinator and at the Datanode respectively;
+ * *reduce_plan is set to false when the plan can not be reduced, in which
+ * case the returned target list is the caller's remote_tlist unchanged.
+ */
+static List *
+pgxc_process_having_clause(PlannerInfo *root, List *remote_tlist, Node *havingQual,
+								List **local_qual, List **remote_qual,
+								bool *reduce_plan)
+{
+	List *qual;
+	ListCell *temp;
+
+	/* Optimistic defaults; revised below as the qual is examined */
+	*reduce_plan = true;
+	*remote_qual = NIL;
+	*local_qual = NIL;
+
+	if (!havingQual)
+		return remote_tlist;
+	/*
+	 * PGXCTODO: we expect the quals in the form of List only. Is there a
+	 * possibility that the quals will be another form?
+	 */
+	if (!IsA(havingQual, List))
+	{
+		*reduce_plan = false;
+		return remote_tlist;
+	}
+	/*
+	 * Copy the havingQual so that the copy can be modified later. In case we
+	 * back out in between, the original expression remains intact.
+	 */
+	qual = copyObject(havingQual);
+	foreach(temp, qual)
+	{
+		Node *expr = lfirst(temp);
+		bool has_aggs;
+		List *vars_n_aggs;
+
+		/* Rule 3: an unshippable expression defeats the whole reduction */
+		if (!pgxc_is_expr_shippable((Expr *)expr, &has_aggs))
+		{
+			*reduce_plan = false;
+			break;
+		}
+
+		if (has_aggs)
+		{
+			ListCell *lcell;
+
+			/* Pull the aggregates and var nodes from the quals */
+			vars_n_aggs = pull_var_clause(expr, PVC_INCLUDE_AGGREGATES,
+										  PVC_RECURSE_PLACEHOLDERS);
+			/* copy the aggregates into the remote target list */
+			foreach (lcell, vars_n_aggs)
+			{
+				Assert(IsA(lfirst(lcell), Aggref) || IsA(lfirst(lcell), Var));
+				remote_tlist = pgxc_add_node_to_grouping_tlist(remote_tlist, lfirst(lcell),
+															   0);
+			}
+			/* Rule 1: aggregates pushed down, expression evaluated locally */
+			*local_qual = lappend(*local_qual, expr);
+		}
+		else
+			/* Rule 2: whole expression is shippable */
+			*remote_qual = lappend(*remote_qual, expr);
+	}
+
+	/* Backed out: discard the working copy of the qual */
+	if (!(*reduce_plan))
+		list_free_deep(qual);
+
+	return remote_tlist;
+}
+
+/*
+ * pgxc_set_en_expr
+ * Try to find the expression of distribution column to calculate node at plan execution
+ * Returns NULL when the relation has no locator information or is not
+ * hash/modulo distributed; otherwise returns a Var referencing the
+ * distribution column of the result relation, to be evaluated at execution
+ * time to pick the target node.
+ */
+static Expr *
+pgxc_set_en_expr(Oid tableoid, Index resultRelationIndex)
+{
+	HeapTuple tp;
+	Form_pg_attribute partAttrTup;
+	Var *var;
+	RelationLocInfo *rel_loc_info;
+
+	/* Get location info of the target table */
+	rel_loc_info = GetRelationLocInfo(tableoid);
+	if (rel_loc_info == NULL)
+		return NULL;
+
+	/*
+	 * For hash/modulo distributed tables, the target node must be selected
+	 * at the execution time based on the partition column value.
+	 *
+	 * For round robin distributed tables, tuples must be divided equally
+	 * between the nodes.
+	 *
+	 * For replicated tables, tuple must be inserted in all the Datanodes
+	 *
+	 * XXX Need further testing for replicated and round-robin tables
+	 */
+	if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH &&
+		rel_loc_info->locatorType != LOCATOR_TYPE_MODULO)
+		return NULL;
+
+	tp = SearchSysCache(ATTNUM,
+						ObjectIdGetDatum(tableoid),
+						Int16GetDatum(rel_loc_info->partAttrNum),
+						0, 0);
+	/*
+	 * Guard against a failed cache lookup: applying GETSTRUCT to an invalid
+	 * tuple would dereference garbage. This follows the standard syscache
+	 * usage idiom.
+	 */
+	if (!HeapTupleIsValid(tp))
+		elog(ERROR, "cache lookup failed for attribute %d of relation %u",
+			 rel_loc_info->partAttrNum, tableoid);
+	partAttrTup = (Form_pg_attribute) GETSTRUCT(tp);
+
+	/*
+	 * Create a Var for the distribution column and set it for
+	 * execution time evaluation of target node. ExecEvalVar() picks
+	 * up values from ecxt_scantuple if Var does not refer either OUTER
+	 * or INNER varno. We utilize that mechanism to pick up values from
+	 * the tuple returned by the current plan node
+	 */
+	var = makeVar(resultRelationIndex,
+				  rel_loc_info->partAttrNum,
+				  partAttrTup->atttypid,
+				  partAttrTup->atttypmod,
+				  partAttrTup->attcollation,
+				  0);
+	ReleaseSysCache(tp);
+
+	return (Expr *) var;
+}
+
+/*
+ * pgxc_count_rowmarks_entries
+ * Count the number of rowmarks that need to be added as prepared parameters
+ * for remote DML plan.
+ * Only rowmarks whose rti equals prti are considered; a non-parent rowmark
+ * contributes one parameter (ctid) and a parent rowmark contributes two
+ * (ctid and tableoid).
+ */
+static int
+pgxc_count_rowmarks_entries(List *rowMarks)
+{
+	int res = 0;
+	ListCell *elt;
+
+	foreach(elt, rowMarks)
+	{
+		PlanRowMark *rc = (PlanRowMark *) lfirst(elt);
+
+		/* RowMarks with different parent are not needed */
+		if (rc->rti != rc->prti)
+			continue;
+
+		/*
+		 * Count the entry and move to next element
+		 * For a non-parent rowmark, only ctid is used.
+		 * For a parent rowmark, ctid and tableoid are used.
+		 */
+		if (!rc->isParent)
+			res++;
+		else
+			res = res + 2;
+	}
+
+	return res;
+}
+
+/*
+ * pgxc_build_rowmark_entries
+ * Complete type array for SetRemoteStatementName based on given RowMarks list
+ * The list of total parameters is calculated based on the current number of prepared
+ * parameters and the rowmark list.
+ * types must already hold totparams slots with the first prepparams entries
+ * filled in; the remaining slots are populated here from the rowmark list.
+ * Returns the (in-place modified) types array.
+ */
+static Oid *
+pgxc_build_rowmark_entries(List *rowMarks, List *rtable, Oid *types, int prepparams, int totparams)
+{
+	Oid *newtypes = types;
+	int rowmark_entry_num;
+	int count = prepparams;
+	ListCell *elt;
+
+	/* No modifications if list is empty */
+	if (rowMarks == NIL)
+		return newtypes;
+
+	/* Nothing to do, total number of parameters is already correct */
+	if (prepparams == totparams)
+		return newtypes;
+
+	/* Fetch number of extra entries related to Rowmarks */
+	rowmark_entry_num = pgxc_count_rowmarks_entries(rowMarks);
+
+	/* Nothing to do */
+	if (rowmark_entry_num == 0)
+		return newtypes;
+
+	/* This needs to be absolutely verified */
+	Assert(totparams == (prepparams + rowmark_entry_num));
+
+	foreach(elt, rowMarks)
+	{
+		PlanRowMark *rc = (PlanRowMark *) lfirst(elt);
+
+		/* RowMarks with different parent are not needed */
+		if (rc->rti != rc->prti)
+			continue;
+
+		/* Determine the correct parameter type */
+		switch (rc->markType)
+		{
+			case ROW_MARK_COPY:
+				{
+					RangeTblEntry *rte = rt_fetch(rc->prti, rtable);
+
+					/*
+					 * PGXCTODO: We still need to determine the rowtype
+					 * in case relation involved here is a view (see inherit.sql).
+					 */
+					if (!OidIsValid(rte->relid))
+						ereport(ERROR,
+								(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+								 errmsg("Cannot generate remote query plan"),
+								 errdetail("This relation rowtype cannot be fetched")));
+
+					/*
+					 * This is the complete copy of a row, so it is necessary
+					 * to set parameter as a rowtype
+					 */
+					count++;
+					newtypes[count - 1] = get_rel_type_id(rte->relid);
+				}
+				break;
+
+			case ROW_MARK_REFERENCE:
+				/* Here we have a ctid for sure */
+				count++;
+				newtypes[count - 1] = TIDOID;
+
+				if (rc->isParent)
+				{
+					/* For a parent table, tableoid is also necessary */
+					count++;
+					/* Set parameter type */
+					newtypes[count - 1] = OIDOID;
+				}
+				break;
+
+			/* Ignore other entries */
+			case ROW_MARK_SHARE:
+			case ROW_MARK_EXCLUSIVE:
+			default:
+				break;
+		}
+	}
+
+	/* This should not happen */
+	if (count != totparams)
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_EXCEPTION),
+				 errmsg("Error when generating remote query plan")));
+
+	return newtypes;
+}
+
+/*
+ * make_dummy_remote_rte
+ * Build a minimal range table entry of kind RTE_REMOTE_DUMMY carrying only
+ * the given relation name and alias; all other fields keep the zeroed
+ * defaults produced by makeNode().
+ */
+static RangeTblEntry *
+make_dummy_remote_rte(char *relname, Alias *alias)
+{
+	RangeTblEntry *dummy_rte = makeNode(RangeTblEntry);
+	dummy_rte->rtekind = RTE_REMOTE_DUMMY;
+
+	/* use a dummy relname... */
+	dummy_rte->relname = relname;
+	dummy_rte->eref = alias;
+
+	return dummy_rte;
 }
+#endif /* XCP */
#endif /* PGXC */
diff --git a/src/backend/optimizer/plan/planagg.c b/src/backend/optimizer/plan/planagg.c
index be52d16ff0..c0394f787c 100644
--- a/src/backend/optimizer/plan/planagg.c
+++ b/src/backend/optimizer/plan/planagg.c
@@ -17,6 +17,11 @@
* scan all the rows anyway.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -519,6 +524,16 @@ make_agg_subplan(PlannerInfo *root, MinMaxAggInfo *mminfo)
plan->targetlist = subparse->targetList;
+#ifdef XCP
+ /* Set plan distribution */
+ if (mminfo->path->distribution)
+ {
+ plan = (Plan *) make_remotesubplan(subroot, plan, NULL,
+ mminfo->path->distribution,
+ mminfo->path->pathkeys);
+ }
+#endif
+
plan = (Plan *) make_limit(plan,
subparse->limitOffset,
subparse->limitCount,
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 5ff4f501c2..b8e8f6fc11 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -3,6 +3,11 @@
* planner.c
* The query optimizer external interface.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -41,7 +46,7 @@
#ifdef PGXC
#include "commands/prepare.h"
#include "pgxc/pgxc.h"
-#include "optimizer/pgxcplan.h"
+#include "pgxc/planner.h"
#endif
@@ -104,9 +109,18 @@ static void get_column_info_for_window(PlannerInfo *root, WindowClause *wc,
int *ordNumCols,
AttrNumber **ordColIdx,
Oid **ordOperators);
+#ifdef XCP
+static Plan *grouping_distribution(PlannerInfo *root, Plan *plan,
+ int numGroupCols, AttrNumber *groupColIdx,
+ List *current_pathkeys, Distribution **distribution);
+static bool equal_distributions(PlannerInfo *root, Distribution *dst1,
+ Distribution *dst2);
+#endif
#ifdef PGXC
+#ifndef XCP
static void separate_rowmarks(PlannerInfo *root);
#endif
+#endif
/*****************************************************************************
*
@@ -130,6 +144,7 @@ planner(Query *parse, int cursorOptions, ParamListInfo boundParams)
result = (*planner_hook) (parse, cursorOptions, boundParams);
else
#ifdef PGXC
+#ifndef XCP
/*
* A Coordinator receiving a query from another Coordinator
* is not allowed to go into PGXC planner.
@@ -137,7 +152,8 @@ planner(Query *parse, int cursorOptions, ParamListInfo boundParams)
if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
result = pgxc_planner(parse, cursorOptions, boundParams);
else
-#endif
+#endif /* XCP */
+#endif /* PGXC */
result = standard_planner(parse, cursorOptions, boundParams);
return result;
}
@@ -153,6 +169,12 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams)
ListCell *lp,
*lr;
+#ifdef XCP
+ if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && parse->utilityStmt &&
+ IsA(parse->utilityStmt, RemoteQuery))
+ return pgxc_direct_planner(parse, cursorOptions, boundParams);
+#endif
+
/* Cursor options may come from caller or from DECLARE CURSOR stmt */
if (parse->utilityStmt &&
IsA(parse->utilityStmt, DeclareCursorStmt))
@@ -211,6 +233,14 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams)
/* primary planning entry point (may recurse for subqueries) */
top_plan = subquery_planner(glob, parse, NULL,
false, tuple_fraction, &root);
+#ifdef XCP
+ if (root->distribution)
+ {
+ top_plan = (Plan *) make_remotesubplan(root, top_plan, NULL,
+ root->distribution,
+ root->query_pathkeys);
+ }
+#endif
/*
* If creating a plan for a scrollable cursor, make sure it can run
@@ -237,6 +267,35 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams)
lfirst(lp) = set_plan_references(subroot, subplan);
}
+#ifdef PGXC
+#ifndef XCP
+ /*
+ * PGXC should apply INSERT/UPDATE/DELETE to a Datanode. We are overriding
+ * normal Postgres behavior by modifying final plan or by adding a node on
+ * top of it.
+ * If the optimizer finds out that there is nothing to UPDATE/INSERT/DELETE
+ * in the table/s (say using constraint exclusion), it does not add modify
+ * table plan on the top. We should send queries to the remote nodes only
+ * when there is something to modify.
+ */
+ if (IS_PGXC_COORDINATOR && IsA(top_plan, ModifyTable))
+ switch (parse->commandType)
+ {
+ case CMD_INSERT:
+ top_plan = create_remoteinsert_plan(root, top_plan);
+ break;
+ case CMD_UPDATE:
+ top_plan = create_remoteupdate_plan(root, top_plan);
+ break;
+ case CMD_DELETE:
+ top_plan = create_remotedelete_plan(root, top_plan);
+ break;
+ default:
+ break;
+ }
+#endif /* XCP */
+#endif
+
/* build the PlannedStmt result */
result = makeNode(PlannedStmt);
@@ -256,6 +315,11 @@ standard_planner(Query *parse, int cursorOptions, ParamListInfo boundParams)
result->relationOids = glob->relationOids;
result->invalItems = glob->invalItems;
result->nParamExec = list_length(glob->paramlist);
+#ifdef XCP
+ result->distributionType = LOCATOR_TYPE_NONE;
+ result->distributionKey = InvalidAttrNumber;
+ result->distributionNodes = NULL;
+#endif
return result;
}
@@ -316,8 +380,10 @@ subquery_planner(PlannerGlobal *glob, Query *parse,
root->hasInheritedTarget = false;
#ifdef PGXC
+#ifndef XCP
root->rs_alias_index = 1;
-#endif
+#endif /* XCP */
+#endif /* PGXC */
root->hasRecursion = hasRecursion;
if (hasRecursion)
root->wt_param_id = SS_assign_special_param(root);
@@ -397,6 +463,7 @@ subquery_planner(PlannerGlobal *glob, Query *parse,
preprocess_rowmarks(root);
#ifdef PGXC
+#ifndef XCP
/*
* In Coordinators we separate row marks in two groups
* one comprises of row marks of types ROW_MARK_EXCLUSIVE & ROW_MARK_SHARE
@@ -415,6 +482,7 @@ subquery_planner(PlannerGlobal *glob, Query *parse,
*/
separate_rowmarks(root);
#endif
+#endif
/*
* Expand any rangetable entries that are inheritance sets into "append
* relations". This can add entries to the rangetable, but they must be
@@ -584,6 +652,13 @@ subquery_planner(PlannerGlobal *glob, Query *parse,
else
rowMarks = root->rowMarks;
+#ifdef XCP
+ if (root->query_level > 1)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("INSERT/UPDATE/DELETE is not supported in subquery")));
+#endif
+
plan = (Plan *) make_modifytable(parse->commandType,
parse->canSetTag,
list_make1_int(parse->resultRelation),
@@ -591,9 +666,6 @@ subquery_planner(PlannerGlobal *glob, Query *parse,
returningLists,
rowMarks,
SS_assign_special_param(root));
-#ifdef PGXC
- plan = pgxc_make_modifytable(root, plan);
-#endif
}
}
@@ -610,6 +682,57 @@ subquery_planner(PlannerGlobal *glob, Query *parse,
if (subroot)
*subroot = root;
+ /*
+ * XCPTODO
+ * Temporarily block WITH RECURSIVE for most cases
+ * until we can fix. Allow for pg_catalog tables and replicated tables.
+ */
+ if (root->hasRecursion)
+ {
+ int idx;
+ bool recursiveOk = true;
+
+ /* seems to start at 1... */
+ for (idx = 1; idx < root->simple_rel_array_size - 1; idx++)
+ {
+ RangeTblEntry *rte;
+
+ rte = root->simple_rte_array[idx];
+
+ if (!rte || rte->rtekind == RTE_JOIN)
+ {
+ continue;
+ }
+ else if (rte->rtekind == RTE_RELATION)
+ {
+ char loc_type;
+
+ loc_type = GetRelationLocType(rte->relid);
+
+ /* skip pg_catalog */
+ if (loc_type == LOCATOR_TYPE_NONE)
+ continue;
+
+ /* If replicated, allow */
+ if (IsLocatorReplicated(loc_type))
+ {
+ continue;
+ }
+ else
+ {
+ recursiveOk = false;
+ break;
+ }
+ }
+ else
+ {
+ recursiveOk = false;
+ break;
+ }
+ }
+ if (!recursiveOk)
+ elog(ERROR, "WITH RECURSIVE currently not supported on distributed tables.");
+ }
return plan;
}
@@ -761,9 +884,6 @@ inheritance_planner(PlannerInfo *root)
List *returningLists = NIL;
List *rowMarks;
ListCell *lc;
-#ifdef PGXC
- ModifyTable *mtplan;
-#endif
/*
* We generate a modified instance of the original Query for each target
@@ -882,6 +1002,39 @@ inheritance_planner(PlannerInfo *root)
if (is_dummy_plan(subplan))
continue;
+#ifdef XCP
+ /*
+	 * All subplans should have the same distribution, except possibly for
+	 * the node restriction. At the moment this is always the case but if this
+ * is changed we should handle inheritance differently.
+ * Effectively we want to push the modify table down to data nodes, if
+ * it is running against distributed inherited tables. To achieve this
+ * we are building up distribution of the query from distributions of
+ * the subplans.
+ * If subplans are restricted to different nodes we should union these
+ * restrictions, if at least one subplan is not restricted we should
+ * not restrict parent plan.
+ * After returning a plan from the function valid root->distribution
+ * value will force proper RemoteSubplan node on top of it.
+ */
+ if (root->distribution == NULL)
+ root->distribution = subroot.distribution;
+ else if (!bms_is_empty(root->distribution->restrictNodes))
+ {
+ if (bms_is_empty(subroot.distribution->restrictNodes))
+ {
+ bms_free(root->distribution->restrictNodes);
+ root->distribution->restrictNodes = NULL;
+ }
+ else
+ {
+ root->distribution->restrictNodes = bms_join(
+ root->distribution->restrictNodes,
+ subroot.distribution->restrictNodes);
+ subroot.distribution->restrictNodes = NULL;
+ }
+ }
+#endif
subplans = lappend(subplans, subplan);
/*
@@ -964,20 +1117,13 @@ inheritance_planner(PlannerInfo *root)
rowMarks = root->rowMarks;
/* And last, tack on a ModifyTable node to do the UPDATE/DELETE work */
-#ifdef PGXC
- mtplan = make_modifytable(parse->commandType,
-#else
return (Plan *) make_modifytable(parse->commandType,
-#endif
parse->canSetTag,
resultRelations,
subplans,
returningLists,
rowMarks,
SS_assign_special_param(root));
-#ifdef PGXC
- return pgxc_make_modifytable(root, (Plan *)mtplan);
-#endif
}
/*--------------------
@@ -1012,6 +1158,9 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
double dNumGroups = 0;
bool use_hashed_distinct = false;
bool tested_hashed_distinct = false;
+#ifdef XCP
+ Distribution *distribution = NULL; /* distribution of the result_plan */
+#endif
/* Tweak caller-supplied tuple_fraction if have LIMIT/OFFSET */
if (parse->limitCount || parse->limitOffset)
@@ -1361,6 +1510,9 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
result_plan = create_plan(root, best_path);
current_pathkeys = best_path->pathkeys;
+#ifdef XCP
+ distribution = best_path->distribution;
+#endif
/* Detect if we'll need an explicit sort for grouping */
if (parse->groupClause && !use_hashed_grouping &&
@@ -1402,16 +1554,15 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
* the desired tlist.
*/
result_plan->targetlist = sub_tlist;
-#ifdef PGXC
- /*
- * If the Join tree is completely shippable, adjust the
- * target list of the query according to the new targetlist
- * set above. For now do this only for SELECT statements.
- */
- if (IsA(result_plan, RemoteQuery) && parse->commandType == CMD_SELECT)
- pgxc_rqplan_adjust_tlist((RemoteQuery *)result_plan);
-#endif /* PGXC */
}
+#ifdef XCP
+ /*
+ * RemoteSubplan is conditionally projection capable - it is
+ * pushing projection to the data nodes
+ */
+ if (IsA(result_plan, RemoteSubplan))
+ result_plan->lefttree->targetlist = sub_tlist;
+#endif
/*
* Also, account for the cost of evaluation of the sub_tlist.
@@ -1456,6 +1607,12 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
*/
if (use_hashed_grouping)
{
+#ifdef XCP
+ result_plan = grouping_distribution(root, result_plan,
+ numGroupCols, groupColIdx,
+ current_pathkeys,
+ &distribution);
+#endif
/* Hashed aggregate plan --- no sort needed */
result_plan = (Plan *) make_agg(root,
tlist,
@@ -1467,6 +1624,18 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
extract_grouping_ops(parse->groupClause),
numGroups,
result_plan);
+#ifdef PGXC
+#ifndef XCP
+ /*
+ * Grouping will certainly not increase the number of rows
+ * coordinator fetches from datanode, in fact it's expected to
+ * reduce the number drastically. Hence, try pushing GROUP BY
+ * clauses and aggregates to the datanode, thus saving bandwidth.
+ */
+ if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
+ result_plan = create_remoteagg_plan(root, result_plan);
+#endif /* XCP */
+#endif /* PGXC */
/* Hashed aggregation produces randomly-ordered results */
current_pathkeys = NIL;
}
@@ -1500,6 +1669,12 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
current_pathkeys = NIL;
}
+#ifdef XCP
+ result_plan = grouping_distribution(root, result_plan,
+ numGroupCols, groupColIdx,
+ current_pathkeys,
+ &distribution);
+#endif
result_plan = (Plan *) make_agg(root,
tlist,
(List *) parse->havingQual,
@@ -1530,6 +1705,12 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
current_pathkeys = root->group_pathkeys;
}
+#ifdef XCP
+ result_plan = grouping_distribution(root, result_plan,
+ numGroupCols, groupColIdx,
+ current_pathkeys,
+ &distribution);
+#endif
result_plan = (Plan *) make_group(root,
tlist,
(List *) parse->havingQual,
@@ -1553,12 +1734,18 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
* this routine to avoid having to generate the plan in the
* first place.
*/
+#ifdef XCP
+ result_plan = grouping_distribution(root, result_plan, 0, NULL,
+ current_pathkeys,
+ &distribution);
+#endif
result_plan = (Plan *) make_result(root,
tlist,
parse->havingQual,
NULL);
}
#ifdef PGXC
+#ifndef XCP
/*
* Grouping will certainly not increase the number of rows
* Coordinator fetches from Datanode, in fact it's expected to
@@ -1567,6 +1754,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
*/
if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
result_plan = create_remotegrouping_plan(root, result_plan);
+#endif /* XCP */
#endif /* PGXC */
} /* end of non-minmax-aggregate case */
@@ -1626,6 +1814,21 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
window_tlist = add_volatile_sort_exprs(window_tlist, tlist,
activeWindows);
result_plan->targetlist = (List *) copyObject(window_tlist);
+#ifdef XCP
+ /*
+ * We cannot guarantee a correct result of the windowing function
+ * if aggregation is pushed down to Datanodes. So if current plan
+ * produces a distributed result set we should bring it to
+ * coordinator.
+ */
+ if (distribution)
+ {
+ result_plan = (Plan *)
+ make_remotesubplan(root, result_plan, NULL,
+ distribution, current_pathkeys);
+ distribution = NULL;
+ }
+#endif
foreach(l, activeWindows)
{
@@ -1667,6 +1870,30 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
result_plan = (Plan *) sort_plan;
current_pathkeys = window_pathkeys;
}
+#ifdef XCP
+ /*
+ * In our code, Sort may be pushed down to the Datanodes,
+ * and therefore we may get the sort_plan is not really a
+ * Sort node. In this case we should get sort columns from
+ * the top RemoteSubplan
+ */
+ if (!IsA(sort_plan, Sort))
+ {
+ RemoteSubplan *pushdown;
+ pushdown = find_push_down_plan(sort_plan, true);
+ Assert(pushdown && pushdown->sort);
+ get_column_info_for_window(root, wc, tlist,
+ pushdown->sort->numCols,
+ pushdown->sort->sortColIdx,
+ &partNumCols,
+ &partColIdx,
+ &partOperators,
+ &ordNumCols,
+ &ordColIdx,
+ &ordOperators);
+ }
+ else
+#endif
/* In either case, extract the per-column information */
get_column_info_for_window(root, wc, tlist,
sort_plan->numCols,
@@ -1766,6 +1993,14 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
if (use_hashed_distinct)
{
+#ifdef XCP
+ result_plan = grouping_distribution(root, result_plan,
+ list_length(parse->distinctClause),
+ extract_grouping_cols(parse->distinctClause,
+ result_plan->targetlist),
+ current_pathkeys,
+ &distribution);
+#endif
/* Hashed aggregate plan --- no sort needed */
result_plan = (Plan *) make_agg(root,
result_plan->targetlist,
@@ -1822,6 +2057,14 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
-1.0);
}
+#ifdef XCP
+ result_plan = grouping_distribution(root, result_plan,
+ list_length(parse->distinctClause),
+ extract_grouping_cols(parse->distinctClause,
+ result_plan->targetlist),
+ current_pathkeys,
+ &distribution);
+#endif
result_plan = (Plan *) make_unique(result_plan,
parse->distinctClause);
result_plan->plan_rows = dNumDistinctRows;
@@ -1841,11 +2084,6 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
result_plan,
root->sort_pathkeys,
limit_tuples);
-#ifdef PGXC
- if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
- result_plan = (Plan *) create_remotesort_plan(root,
- result_plan);
-#endif /* PGXC */
current_pathkeys = root->sort_pathkeys;
}
}
@@ -1874,16 +2112,21 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
*/
if (parse->limitCount || parse->limitOffset)
{
+#ifdef XCP
+ /* We should put Limit on top of distributed results */
+ if (distribution)
+ {
+ result_plan = (Plan *)
+ make_remotesubplan(root, result_plan, NULL,
+ distribution, current_pathkeys);
+ distribution = NULL;
+ }
+#endif
result_plan = (Plan *) make_limit(result_plan,
parse->limitOffset,
parse->limitCount,
offset_est,
count_est);
-#ifdef PGXC
- /* See if we can push LIMIT or OFFSET clauses to Datanodes */
- if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
- result_plan = (Plan *) create_remotelimit_plan(root, result_plan);
-#endif /* PGXC */
}
/*
@@ -1892,6 +2135,155 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
*/
root->query_pathkeys = current_pathkeys;
+#ifdef XCP
+ /*
+ * Adjust query distribution if requested
+ */
+ if (root->distribution)
+ {
+ if (equal_distributions(root, root->distribution, distribution))
+ {
+ if (IsLocatorReplicated(distribution->distributionType) &&
+ contain_volatile_functions((Node *) result_plan->targetlist))
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ errmsg("can not update replicated table with result of volatile function")));
+ /*
+ * Source tuple will be consumed on the same node where it is
+ * produced, so if it is known that some node does not yield tuples
+ * we do not want to send subquery for execution on these nodes
+ * at all.
+ * So copy the restriction to the external distribution.
+ * XXX Is that ever possible if external restriction is already
+ * defined? If yes we probably should use intersection of the sets,
+ * and if resulting set is empty create dummy plan and set it as
+ * the result_plan. Need to think this over
+ */
+ root->distribution->restrictNodes =
+ bms_copy(distribution->restrictNodes);
+ }
+ else
+ {
+ RemoteSubplan *distributePlan;
+ /*
+ * If the planned statement is either UPDATE or DELETE different
+ * distributions here mean the ModifyTable node will be placed on
+ * top of RemoteSubquery. UPDATE and DELETE versions of ModifyTable
+ * use TID of incoming tuple to apply the changes, but the
+ * RemoteSubquery node supplies RemoteTuples, without such field.
+ * Therefore we can not execute such plan.
+ * Most common case is when UPDATE statement modifies the
+ * distribution column. Also incorrect distributed plan is possible
+ * if planning a complex UPDATE or DELETE statement involving table
+ * join.
+ * We output different error messages in UPDATE and DELETE cases
+ * mostly for compatibility with Postgres-XC. It is hard to determine
+ * here, if such plan is because updated partitioning key or poorly
+ * planned join, so in case of UPDATE we assume the first case as
+ * more probable; for DELETE, only the second case is possible.
+ * The error message may be misleading, if that is UPDATE and join,
+ * but hope we will target distributed update problem soon.
+ * There are two ways of fixing that:
+ * 1. Improve distribution planner to never consider to redistribute
+ * target table. So if planner finds that it has no choice, it would
+ * throw error somewhere else. So here we only be catching cases of
+ * updating distribution columns.
+ * 2. Modify executor and allow distribution column updates. However
+ * there are a lot of issues behind the scene when implementing that
+ * approach.
+ */
+ if (parse->commandType == CMD_UPDATE)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ errmsg("could not plan this distributed update"),
+ errdetail("correlated UPDATE or updating distribution column currently not supported in Postgres-XL.")));
+ if (parse->commandType == CMD_DELETE)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ errmsg("could not plan this distributed delete"),
+ errdetail("correlated or complex DELETE is currently not supported in Postgres-XL.")));
+
+ /*
+ * Redistribute result according to requested distribution.
+ */
+ if ((distributePlan = find_push_down_plan(result_plan, true)))
+ {
+ Bitmapset *tmpset;
+ int nodenum;
+
+ distributePlan->distributionType = root->distribution->distributionType;
+ distributePlan->distributionKey = InvalidAttrNumber;
+ if (root->distribution->distributionExpr)
+ {
+ ListCell *lc;
+
+ /* Find distribution expression in the target list */
+ foreach(lc, distributePlan->scan.plan.targetlist)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(lc);
+
+ if (equal(tle->expr, root->distribution->distributionExpr))
+ {
+ distributePlan->distributionKey = tle->resno;
+ break;
+ }
+ }
+
+ if (distributePlan->distributionKey == InvalidAttrNumber)
+ {
+ Plan *lefttree = distributePlan->scan.plan.lefttree;
+ Plan *plan;
+ TargetEntry *newtle;
+
+ /* The expression is not found, need to add junk */
+ newtle = makeTargetEntry((Expr *) root->distribution->distributionExpr,
+ list_length(lefttree->targetlist) + 1,
+ NULL,
+ true);
+
+ if (is_projection_capable_plan(lefttree))
+ {
+ /* Ok to modify subplan's target list */
+ lefttree->targetlist = lappend(lefttree->targetlist,
+ newtle);
+ }
+ else
+ {
+ /* Use Result node to calculate expression */
+ List *newtlist = list_copy(lefttree->targetlist);
+ newtlist = lappend(newtlist, newtle);
+ lefttree = (Plan *) make_result(root, newtlist, NULL, lefttree);
+ distributePlan->scan.plan.lefttree = lefttree;
+ }
+ /* Update all the hierarchy */
+ for (plan = result_plan; plan != lefttree; plan = plan->lefttree)
+ plan->targetlist = lefttree->targetlist;
+ }
+ }
+ tmpset = bms_copy(root->distribution->nodes);
+ distributePlan->distributionNodes = NIL;
+ while ((nodenum = bms_first_member(tmpset)) >= 0)
+ distributePlan->distributionNodes = lappend_int(
+ distributePlan->distributionNodes, nodenum);
+ bms_free(tmpset);
+ }
+ else
+ result_plan = (Plan *) make_remotesubplan(root,
+ result_plan,
+ root->distribution,
+ distribution,
+ NULL);
+ }
+ }
+ else
+ {
+ /*
+ * Inform caller about distribution of the subplan
+ */
+ root->distribution = distribution;
+ }
+#endif
+
return result_plan;
}
@@ -2086,6 +2478,7 @@ preprocess_rowmarks(PlannerInfo *root)
}
#ifdef PGXC
+#ifndef XCP
/*
* separate_rowmarks - In XC Coordinators are supposed to skip handling
* of type ROW_MARK_EXCLUSIVE & ROW_MARK_SHARE.
@@ -2120,7 +2513,7 @@ separate_rowmarks(PlannerInfo *root)
root->rowMarks = rml_2;
root->xc_rowMarks = rml_1;
}
-
+#endif /*XCP*/
#endif /*PGXC*/
/*
@@ -3400,3 +3793,85 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid)
return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost);
}
+
+
+#ifdef XCP
+/*
+ * Grouping preserves distribution if distribution key is the
+ * first grouping key or if distribution is replicated.
+ * In these cases aggregation is fully pushed down to nodes.
+ * Otherwise we need 2-phase aggregation so put remote subplan
+ * on top of the result_plan. When adding result agg on top of
+ * RemoteSubplan first aggregation phase will be pushed down
+ * automatically.
+ */
+static Plan *
+grouping_distribution(PlannerInfo *root, Plan *plan,
+ int numGroupCols, AttrNumber *groupColIdx,
+ List *current_pathkeys, Distribution **distribution)
+{
+ if (*distribution &&
+ !IsLocatorReplicated((*distribution)->distributionType) &&
+ (numGroupCols == 0 ||
+ (*distribution)->distributionExpr == NULL ||
+ !equal(((TargetEntry *)list_nth(plan->targetlist, groupColIdx[0]-1))->expr,
+ (*distribution)->distributionExpr)))
+ {
+ Plan *result_plan;
+ result_plan = (Plan *) make_remotesubplan(root, plan, NULL,
+ *distribution,
+ current_pathkeys);
+ *distribution = NULL;
+ return result_plan;
+ }
+ return plan;
+}
+
+
+/*
+ * Check if two distributions are equal.
+ * Distributions are considered equal if they are of the same type, on the same
+ * nodes and if they have distribution expressions defined they are equal
+ * (either the same expressions or they are member of the same equivalence
+ * class)
+ */
+static bool
+equal_distributions(PlannerInfo *root, Distribution *dst1,
+ Distribution *dst2)
+{
+ /* fast path */
+ if (dst1 == dst2)
+ return true;
+ if (dst1 == NULL || dst2 == NULL)
+ return false;
+
+ /* Conditions that easier to check go first */
+ if (dst1->distributionType != dst2->distributionType)
+ return false;
+
+ if (!bms_equal(dst1->nodes, dst2->nodes))
+ return false;
+
+ if (equal(dst1->distributionExpr, dst2->distributionExpr))
+ return true;
+
+ /*
+ * For more thorough expression check we need to ensure they both are
+ * defined
+ */
+ if (dst1->distributionExpr == NULL || dst2->distributionExpr == NULL)
+ return false;
+
+ /*
+ * More thorough check, but allows some important cases, like if
+ * distribution column is not updated (implicit set distcol=distcol) or
+ * set distcol = CONST, ... WHERE distcol = CONST - pattern used by many
+ * applications
+ */
+ if (exprs_known_equal(root, dst1->distributionExpr, dst2->distributionExpr))
+ return true;
+
+ /* The restrictNodes field does not matter for distribution equality */
+ return false;
+}
+#endif
diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c
index 5187d27ae9..691b6d0909 100644
--- a/src/backend/optimizer/plan/setrefs.c
+++ b/src/backend/optimizer/plan/setrefs.c
@@ -4,6 +4,11 @@
* Post-processing of a completed plan tree: fix references to subplan
* vars, compute regproc values for operators, etc
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -26,8 +31,7 @@
#include "utils/lsyscache.h"
#include "utils/syscache.h"
#ifdef PGXC
-#include "pgxc/pgxc.h"
-#include "optimizer/pgxcplan.h"
+#include "pgxc/planner.h"
#endif
@@ -69,6 +73,9 @@ typedef struct
indexed_tlist *subplan_itlist;
Index newvarno;
int rtoffset;
+#ifdef XCP
+ bool agg_master;
+#endif
} fix_upper_expr_context;
typedef struct
@@ -77,7 +84,6 @@ typedef struct
indexed_tlist *base_itlist;
int rtoffset;
Index relid;
- bool return_non_base_vars; /* Should we reject or return vars not found in base_itlist */
} fix_remote_expr_context;
/*
@@ -127,11 +133,20 @@ static List *fix_join_expr(PlannerInfo *root,
Index acceptable_rel, int rtoffset);
static Node *fix_join_expr_mutator(Node *node,
fix_join_expr_context *context);
+#ifdef XCP
+static Node *fix_upper_expr(PlannerInfo *root,
+ Node *node,
+ indexed_tlist *subplan_itlist,
+ Index newvarno,
+ int rtoffset,
+ bool agg_master);
+#else
static Node *fix_upper_expr(PlannerInfo *root,
Node *node,
indexed_tlist *subplan_itlist,
Index newvarno,
int rtoffset);
+#endif
static Node *fix_upper_expr_mutator(Node *node,
fix_upper_expr_context *context);
static List *set_returning_clause_references(PlannerInfo *root,
@@ -144,18 +159,20 @@ static bool extract_query_dependencies_walker(Node *node,
PlannerInfo *context);
#ifdef PGXC
+#ifndef XCP
/* References for remote plans */
static List * fix_remote_expr(PlannerInfo *root,
List *clauses,
indexed_tlist *base_itlist,
Index newrelid,
- int rtoffset,
- bool return_non_base_vars);
+ int rtoffset);
static Node *fix_remote_expr_mutator(Node *node,
fix_remote_expr_context *context);
static void set_remote_references(PlannerInfo *root, RemoteQuery *rscan, int rtoffset);
-static void pgxc_set_agg_references(PlannerInfo *root, Agg *aggplan);
-static List *set_remote_returning_refs(PlannerInfo *root, List *rlist, Plan *topplan, Index relid, int rtoffset);
+#endif
+#endif
+#ifdef XCP
+static void set_remotesubplan_references(PlannerInfo *root, Plan *plan, int rtoffset);
#endif
@@ -454,6 +471,7 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
}
break;
#ifdef PGXC
+#ifndef XCP
case T_RemoteQuery:
{
RemoteQuery *splan = (RemoteQuery *) plan;
@@ -474,6 +492,7 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
}
break;
#endif
+#endif
case T_ForeignScan:
{
ForeignScan *splan = (ForeignScan *) plan;
@@ -487,6 +506,11 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
fix_scan_list(root, splan->fdw_exprs, rtoffset);
}
break;
+#ifdef XCP
+ case T_RemoteSubplan:
+ set_remotesubplan_references(root, plan, rtoffset);
+ break;
+#endif /* XCP */
case T_NestLoop:
case T_MergeJoin:
case T_HashJoin:
@@ -555,11 +579,6 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
}
break;
case T_Agg:
-#ifdef PGXC
- /* If the lower plan is RemoteQuery plan, adjust the aggregates */
- pgxc_set_agg_references(root, (Agg *)plan);
- /* Fall through */
-#endif /* PGXC */
case T_Group:
set_upper_references(root, plan, rtoffset);
break;
@@ -605,12 +624,6 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
case T_ModifyTable:
{
ModifyTable *splan = (ModifyTable *) plan;
-#ifdef PGXC
- int n = 0;
- List *firstRetList; /* First returning list required for
- * setting up visible plan target list
- */
-#endif
Assert(splan->plan.targetlist == NIL);
Assert(splan->plan.qual == NIL);
@@ -635,48 +648,7 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
List *rlist = (List *) lfirst(lcrl);
Index resultrel = lfirst_int(lcrr);
Plan *subplan = (Plan *) lfirst(lcp);
-#ifdef PGXC
- RemoteQuery *rq = NULL;
-
- if (n == 0)
- {
- /*
- * Set up first returning list before we change
- * var references to point to RTE_REMOTE_DUMMY
- */
- firstRetList = set_returning_clause_references(root,
- rlist,
- subplan,
- resultrel,
- rtoffset);
- /* Restore the returning list changed by the above call */
- rlist = (List *) lfirst(lcrl);
- }
-
- if (splan->remote_plans)
- rq = (RemoteQuery *)list_nth(splan->remote_plans, n);
- n++;
-
- if(rq != NULL && IS_PGXC_COORDINATOR && !IsConnFromCoord())
- {
- /*
- * Set references of returning clause by adjusting
- * varno/varattno according to target list in
- * remote query node
- */
- rlist = set_remote_returning_refs(root,
- rlist,
- (Plan *)rq,
- rq->scan.scanrelid,
- rtoffset);
- /*
- * The next call to set_returning_clause_references
- * should skip the vars already taken care of by
- * the above call to set_remote_returning_refs
- */
- resultrel = rq->scan.scanrelid;
- }
-#endif
+
rlist = set_returning_clause_references(root,
rlist,
subplan,
@@ -686,16 +658,6 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
}
splan->returningLists = newRL;
-#ifdef PGXC
- /*
- * In XC we do not need to set the target list as the
- * first RETURNING list from the finalized list because
- * it can contain vars referring to RTE_REMOTE_DUMMY.
- * We therefore create a list before fixing
- * remote returning references and use that here.
- */
- splan->plan.targetlist = copyObject(firstRetList);
-#else
/*
* Set up the visible plan targetlist as being the same as
* the first RETURNING list. This is for the use of
@@ -705,7 +667,6 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
* twice on identical targetlists.
*/
splan->plan.targetlist = copyObject(linitial(newRL));
-#endif
}
foreach(l, splan->resultRelations)
@@ -736,34 +697,6 @@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset)
root->glob->resultRelations =
list_concat(root->glob->resultRelations,
list_copy(splan->resultRelations));
-
-#ifdef PGXC
- /* Adjust references of remote query nodes in ModifyTable node */
- if(IS_PGXC_COORDINATOR && !IsConnFromCoord())
- {
- ListCell *elt;
- RemoteQuery *rq;
-
- foreach(elt, splan->remote_plans)
- {
- rq = (RemoteQuery *) lfirst(elt);
- /*
- * If base_tlist is set, it means that we have a reduced remote
- * query plan. So need to set the var references accordingly.
- */
- if (rq->base_tlist)
- set_remote_references(root, rq, rtoffset);
- rq->scan.plan.targetlist = fix_scan_list(root,
- rq->scan.plan.targetlist,
- rtoffset);
- rq->scan.plan.qual = fix_scan_list(root,
- rq->scan.plan.qual,
- rtoffset);
- rq->base_tlist = fix_scan_list(root, rq->base_tlist, rtoffset);
- rq->scan.scanrelid += rtoffset;
- }
- }
-#endif
}
break;
case T_Append:
@@ -876,6 +809,22 @@ set_indexonlyscan_references(PlannerInfo *root,
index_itlist = build_tlist_index(plan->indextlist);
plan->scan.scanrelid += rtoffset;
+#ifdef XCP
+ plan->scan.plan.targetlist = (List *)
+ fix_upper_expr(root,
+ (Node *) plan->scan.plan.targetlist,
+ index_itlist,
+ INDEX_VAR,
+ rtoffset,
+ false);
+ plan->scan.plan.qual = (List *)
+ fix_upper_expr(root,
+ (Node *) plan->scan.plan.qual,
+ index_itlist,
+ INDEX_VAR,
+ rtoffset,
+ false);
+#else
plan->scan.plan.targetlist = (List *)
fix_upper_expr(root,
(Node *) plan->scan.plan.targetlist,
@@ -888,6 +837,7 @@ set_indexonlyscan_references(PlannerInfo *root,
index_itlist,
INDEX_VAR,
rtoffset);
+#endif
/* indexqual is already transformed to reference index columns */
plan->indexqual = fix_scan_list(root, plan->indexqual, rtoffset);
/* indexorderby is already transformed to reference index columns */
@@ -1260,11 +1210,20 @@ set_join_references(PlannerInfo *root, Join *join, int rtoffset)
{
NestLoopParam *nlp = (NestLoopParam *) lfirst(lc);
+#ifdef XCP
+ nlp->paramval = (Var *) fix_upper_expr(root,
+ (Node *) nlp->paramval,
+ outer_itlist,
+ OUTER_VAR,
+ rtoffset,
+ false);
+#else
nlp->paramval = (Var *) fix_upper_expr(root,
(Node *) nlp->paramval,
outer_itlist,
OUTER_VAR,
rtoffset);
+#endif
/* Check we replaced any PlaceHolderVar with simple Var */
if (!(IsA(nlp->paramval, Var) &&
nlp->paramval->varno == OUTER_VAR))
@@ -1323,6 +1282,12 @@ set_upper_references(PlannerInfo *root, Plan *plan, int rtoffset)
indexed_tlist *subplan_itlist;
List *output_targetlist;
ListCell *l;
+#ifdef XCP
+ bool agg_master;
+
+ agg_master = (IsA(plan, Agg) &&
+ ((Agg *) plan)->aggdistribution == AGG_MASTER);
+#endif
subplan_itlist = build_tlist_index(subplan->targetlist);
@@ -1341,18 +1306,36 @@ set_upper_references(PlannerInfo *root, Plan *plan, int rtoffset)
subplan_itlist,
OUTER_VAR);
if (!newexpr)
+#ifdef XCP
+ newexpr = fix_upper_expr(root,
+ (Node *) tle->expr,
+ subplan_itlist,
+ OUTER_VAR,
+ rtoffset,
+ agg_master);
+#else
newexpr = fix_upper_expr(root,
(Node *) tle->expr,
subplan_itlist,
OUTER_VAR,
rtoffset);
+#endif
}
else
+#ifdef XCP
+ newexpr = fix_upper_expr(root,
+ (Node *) tle->expr,
+ subplan_itlist,
+ OUTER_VAR,
+ rtoffset,
+ agg_master);
+#else
newexpr = fix_upper_expr(root,
(Node *) tle->expr,
subplan_itlist,
OUTER_VAR,
rtoffset);
+#endif
tle = flatCopyTargetEntry(tle);
tle->expr = (Expr *) newexpr;
output_targetlist = lappend(output_targetlist, tle);
@@ -1360,12 +1343,20 @@ set_upper_references(PlannerInfo *root, Plan *plan, int rtoffset)
plan->targetlist = output_targetlist;
plan->qual = (List *)
+#ifdef XCP
+ fix_upper_expr(root,
+ (Node *) plan->qual,
+ subplan_itlist,
+ OUTER_VAR,
+ rtoffset,
+ agg_master);
+#else
fix_upper_expr(root,
(Node *) plan->qual,
subplan_itlist,
OUTER_VAR,
rtoffset);
-
+#endif
pfree(subplan_itlist);
}
@@ -1594,6 +1585,34 @@ search_indexed_tlist_for_non_var(Node *node,
return NULL; /* no match */
}
+#ifdef PGXC
+#ifndef XCP
+/*
+ * search_tlist_for_var --- find a Var in the provided tlist. This does a
+ * basic scan through the list. So not very efficient...
+ *
+ * If no match, return NULL.
+ *
+ */
+Var *
+search_tlist_for_var(Var *var, List *jtlist)
+{
+ Index varno = var->varno;
+ AttrNumber varattno = var->varattno;
+ ListCell *l;
+
+ foreach(l, jtlist)
+ {
+ Var *listvar = (Var *) lfirst(l);
+
+ if (listvar->varno == varno && listvar->varattno == varattno)
+ return var;
+ }
+ return NULL; /* no match */
+}
+#endif
+#endif
+
/*
* search_indexed_tlist_for_sortgroupref --- find a sort/group expression
* (which is assumed not to be just a Var)
@@ -1798,12 +1817,22 @@ fix_join_expr_mutator(Node *node, fix_join_expr_context *context)
* varno = newvarno, varattno = resno of corresponding targetlist element.
* The original tree is not modified.
*/
+#ifdef XCP
+static Node *
+fix_upper_expr(PlannerInfo *root,
+ Node *node,
+ indexed_tlist *subplan_itlist,
+ Index newvarno,
+ int rtoffset,
+ bool agg_master)
+#else
static Node *
fix_upper_expr(PlannerInfo *root,
Node *node,
indexed_tlist *subplan_itlist,
Index newvarno,
int rtoffset)
+#endif
{
fix_upper_expr_context context;
@@ -1811,6 +1840,9 @@ fix_upper_expr(PlannerInfo *root,
context.subplan_itlist = subplan_itlist;
context.newvarno = newvarno;
context.rtoffset = rtoffset;
+#ifdef XCP
+ context.agg_master = agg_master;
+#endif
return fix_upper_expr_mutator(node, &context);
}
@@ -1855,6 +1887,16 @@ fix_upper_expr_mutator(Node *node, fix_upper_expr_context *context)
newvar = search_indexed_tlist_for_non_var(node,
context->subplan_itlist,
context->newvarno);
+#ifdef XCP
+ if (newvar && context->agg_master && IsA(node, Aggref))
+ {
+ TargetEntry *newtle;
+ Aggref *newnode = copyObject(node);
+ newtle = makeTargetEntry((Expr *) newvar, 1, NULL, false);
+ newnode->args = list_make1(newtle);
+ return (Node *) newnode;
+ }
+#endif
if (newvar)
return (Node *) newvar;
}
@@ -2104,6 +2146,10 @@ extract_query_dependencies_walker(Node *node, PlannerInfo *context)
return expression_tree_walker(node, extract_query_dependencies_walker,
(void *) context);
}
+
+
+#ifdef PGXC
+#ifndef XCP
/*
* fix_remote_expr
* Create a new set of targetlist entries or qual clauses by
@@ -2115,9 +2161,6 @@ extract_query_dependencies_walker(Node *node, PlannerInfo *context)
* 'clauses' is the targetlist or list of clauses
* 'base_itlist' is the indexed target list of the base referenced relations
*
- * 'return_non_base_vars' lets the caller decide whether to reject
- * or return vars not found in base_itlist
- *
* Returns the new expression tree. The original clause structure is
* not modified.
*/
@@ -2126,8 +2169,7 @@ fix_remote_expr(PlannerInfo *root,
List *clauses,
indexed_tlist *base_itlist,
Index newrelid,
- int rtoffset,
- bool return_non_base_vars)
+ int rtoffset)
{
fix_remote_expr_context context;
@@ -2135,7 +2177,6 @@ fix_remote_expr(PlannerInfo *root,
context.base_itlist = base_itlist;
context.relid = newrelid;
context.rtoffset = rtoffset;
- context.return_non_base_vars = return_non_base_vars;
return (List *) fix_remote_expr_mutator((Node *) clauses, &context);
}
@@ -2160,10 +2201,6 @@ fix_remote_expr_mutator(Node *node, fix_remote_expr_context *context)
if (newvar)
return (Node *) newvar;
- /* If it's not found in base_itlist, return it if required */
- if (context->return_non_base_vars && var->varno != context->relid)
- return (Node *) var;
-
/* No reference found for Var */
elog(ERROR, "variable not found in base remote scan target lists");
}
@@ -2204,172 +2241,93 @@ set_remote_references(PlannerInfo *root, RemoteQuery *rscan, int rtoffset)
rscan->scan.plan.targetlist,
base_itlist,
rscan->scan.scanrelid,
- rtoffset,
- false);
+ rtoffset);
rscan->scan.plan.qual = fix_remote_expr(root ,
rscan->scan.plan.qual,
base_itlist,
rscan->scan.scanrelid,
- rtoffset,
- false);
+ rtoffset);
pfree(base_itlist);
}
-/*
- * set_remote_returning_refs
- *
- * Fix references of remote returning list to point
- * to reference target list values from the base
- * relation target lists
- */
-
-static List *
-set_remote_returning_refs(PlannerInfo *root,
- List *rlist,
- Plan *topplan,
- Index relid,
- int rtoffset)
+Node *
+pgxc_fix_scan_expr(PlannerInfo *root, Node *node, int rtoffset)
{
- indexed_tlist *base_itlist;
-
- base_itlist = build_tlist_index(topplan->targetlist);
-
- rlist = fix_remote_expr(root,
- rlist,
- base_itlist,
- relid,
- rtoffset,
- true);
-
- pfree(base_itlist);
-
- return rlist;
+ return fix_scan_expr(root, node, rtoffset);
}
+#endif /* XCP */
+#endif /* PGXC */
-#ifdef PGXC
+
+#ifdef XCP
/*
- * For Agg plans, if the lower scan plan is a RemoteQuery node, adjust the
- * Aggref nodes to pull the transition results from the datanodes. We do while
- * setting planner references so that the upper nodes will find the nodes that
- * they expect in Agg plans.
+ * set_remotesubplan_references
+ * Usually a RemoteSubplan node just translates its target list, so it is
+ * enough to invoke fix_scan_list here. One exception is if the
+ * RemoteSubplan is set on top of ModifyTable. In this case target lists of both
+ * these plan nodes are NIL. If the subplan is not returning we want to leave
+ * target list NIL, if yes, we should make up target list as a list of simple
+ * references to entries from the first returning list.
+ * The qual of RemoteSubplan is always NULL.
*/
-void
-pgxc_set_agg_references(PlannerInfo *root, Agg *aggplan)
+static void
+set_remotesubplan_references(PlannerInfo *root, Plan *plan, int rtoffset)
{
- RemoteQuery *rqplan = (RemoteQuery *)aggplan->plan.lefttree;
- Sort *srtplan;
- List *aggs_n_vars;
- ListCell *lcell;
- List *nodes_to_modify;
- List *rq_nodes_to_modify;
- List *srt_nodes_to_modify;
-
- /* Lower plan tree can be Sort->RemoteQuery or RemoteQuery */
- if (IsA(rqplan, Sort))
+ if (plan->targetlist == NIL)
{
- srtplan = (Sort *)rqplan;
- rqplan = (RemoteQuery *)srtplan->plan.lefttree;
- }
- else
- srtplan = NULL;
-
- if (!IsA(rqplan, RemoteQuery))
- return;
-
- Assert(IS_PGXC_COORDINATOR && !IsConnFromCoord());
- /*
- * If there are not transition results expected from lower plans, nothing to
- * be done here.
- */
- if (!aggplan->skip_trans)
- return;
-
- /* Gather all the aggregates from all the targetlists that need fixing */
- nodes_to_modify = list_copy(aggplan->plan.targetlist);
- nodes_to_modify = list_concat(nodes_to_modify, aggplan->plan.qual);
- aggs_n_vars = pull_var_clause((Node *)nodes_to_modify, PVC_INCLUDE_AGGREGATES,
- PVC_RECURSE_PLACEHOLDERS);
- rq_nodes_to_modify = NIL;
- srt_nodes_to_modify = NIL;
- /*
- * For every aggregate, find corresponding aggregate in the lower plan and
- * modify it correctly.
- */
- foreach (lcell, aggs_n_vars)
- {
- Aggref *aggref = lfirst(lcell);
- TargetEntry *tle;
- Aggref *rq_aggref;
- Aggref *srt_aggref;
- Aggref *arg_aggref; /* Aggref to be set as Argument to the
- * aggref in the Agg plan */
-
- /* Only Aggref expressions need modifications */
- if (!IsA(aggref, Aggref))
+ ModifyTable *mt = (ModifyTable *) plan->lefttree;
+ if (IsA(mt, ModifyTable) && mt->returningLists)
{
- Assert(IsA(aggref, Var));
- continue;
- }
+ List *returningList;
+ List *output_targetlist;
+ ListCell *l;
- tle = tlist_member((Node *)aggref, rqplan->scan.plan.targetlist);
- if (!tle)
- elog(ERROR, "Could not find the Aggref node");
- rq_aggref = (Aggref *)tle->expr;
- Assert(equal(rq_aggref, aggref));
- /*
- * Remember the Aggref nodes of which we need to modify. This is done so
- * that, if there multiple copies of same aggregate, we will match all
- * of them
- */
- rq_nodes_to_modify = list_append_unique(rq_nodes_to_modify, rq_aggref);
- arg_aggref = rq_aggref;
+ returningList = (List *) linitial(mt->returningLists);
+ output_targetlist = NIL;
+ foreach(l, returningList)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(l);
+ Var *newvar;
+
+ newvar = makeVar(OUTER_VAR,
+ tle->resno,
+ exprType((Node *) tle->expr),
+ exprTypmod((Node *) tle->expr),
+ exprCollation((Node *) tle->expr),
+ 0);
+ if (IsA(tle->expr, Var))
+ {
+ newvar->varnoold = ((Var *) tle->expr)->varno + rtoffset;
+ newvar->varoattno = ((Var *) tle->expr)->varattno;
+ }
+ else
+ {
+ newvar->varnoold = 0; /* wasn't ever a plain Var */
+ newvar->varoattno = 0;
+ }
- /*
- * If there is a Sort plan, get corresponding expression from there as
- * well and remember it to be modified.
- */
- if (srtplan)
- {
- tle = tlist_member((Node *)rq_aggref, srtplan->plan.targetlist);
- if (!tle)
- elog(ERROR, "Could not find the Aggref node");
- srt_aggref = (Aggref *)tle->expr;
- Assert(equal(srt_aggref, rq_aggref));
- srt_nodes_to_modify = list_append_unique(srt_nodes_to_modify,
- srt_aggref);
- arg_aggref = srt_aggref;
+ tle = flatCopyTargetEntry(tle);
+ tle->expr = (Expr *) newvar;
+ output_targetlist = lappend(output_targetlist, tle);
+ }
+ plan->targetlist = output_targetlist;
}
-
- /*
- * The transition result from the datanodes acts as an input to the
- * Aggref node on coordinator.
- */
- aggref->args = list_make1(makeTargetEntry((Expr *)arg_aggref, 1, NULL,
- false));
- }
-
- /* Modify the transition types now */
- foreach (lcell, rq_nodes_to_modify)
- {
- Aggref *rq_aggref = lfirst(lcell);
- Assert(IsA(rq_aggref, Aggref));
- rq_aggref->aggtype = rq_aggref->aggtrantype;
}
- foreach (lcell, srt_nodes_to_modify)
+ else
{
- Aggref *srt_aggref = lfirst(lcell);
- Assert(IsA(srt_aggref, Aggref));
- srt_aggref->aggtype = srt_aggref->aggtrantype;
+ /*
+ * The RemoteSubplan may look like a subject for a dummy tlist.
+ * It works in most cases. However it may be a subplan of a ModifyTable
+ * running against a relation with dropped columns. Sanity check assumes
+ * that subplan will return a NULL constant as a value for the dropped
+ * column, however set_dummy_tlist_references would replace it with a
+ * Var. We cannot determine the parent plan here, so just process it as
+ * a scan. Executor will ignore this anyway.
+ */
+ plan->targetlist = fix_scan_list(root, plan->targetlist, rtoffset);
}
-
- /*
- * We have modified the targetlist of the RemoteQuery plan below the Agg
- * plan. Adjust its targetlist as well.
- */
- pgxc_rqplan_adjust_tlist(rqplan);
-
- return;
+ Assert(plan->qual == NULL);
}
-#endif /* PGXC */
+#endif
diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c
index 2328c4b66e..ea5363ab07 100644
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -3,6 +3,11 @@
* subselect.c
* Planning routines for subselects and parameters.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -496,6 +501,22 @@ make_subplan(PlannerInfo *root, Query *orig_subquery, SubLinkType subLinkType,
root,
false, tuple_fraction,
&subroot);
+#ifdef XCP
+ if (subroot->distribution)
+ {
+ plan = (Plan *) make_remotesubplan(subroot,
+ plan,
+ NULL,
+ subroot->distribution,
+ subroot->query_pathkeys);
+ /*
+ * SS_finalize_plan has already been run on the subplan,
+ * so we have to copy parameter info to wrapper plan node.
+ */
+ plan->extParam = bms_copy(plan->lefttree->extParam);
+ plan->allParam = bms_copy(plan->lefttree->allParam);
+ }
+#endif
/* And convert to SubPlan or InitPlan format. */
result = build_subplan(root, plan, subroot,
@@ -1079,6 +1100,22 @@ SS_process_ctes(PlannerInfo *root)
root,
cte->cterecursive, 0.0,
&subroot);
+#ifdef XCP
+ if (subroot->distribution)
+ {
+ plan = (Plan *) make_remotesubplan(subroot,
+ plan,
+ NULL,
+ subroot->distribution,
+ subroot->query_pathkeys);
+ /*
+ * SS_finalize_plan has already been run on the subplan,
+ * so we have to copy parameter info to wrapper plan node.
+ */
+ plan->extParam = bms_copy(plan->lefttree->extParam);
+ plan->allParam = bms_copy(plan->lefttree->allParam);
+ }
+#endif
/*
* Make a SubPlan node for it. This is just enough unlike
@@ -2242,6 +2279,11 @@ finalize_plan(PlannerInfo *root, Plan *plan, Bitmapset *valid_params,
break;
#endif
+#ifdef XCP
+ case T_RemoteSubplan:
+ break;
+#endif
+
case T_Append:
{
ListCell *l;
diff --git a/src/backend/optimizer/prep/preptlist.c b/src/backend/optimizer/prep/preptlist.c
index 1af4e7fe93..e355f349f2 100644
--- a/src/backend/optimizer/prep/preptlist.c
+++ b/src/backend/optimizer/prep/preptlist.c
@@ -13,6 +13,11 @@
* between here and there is a bit arbitrary and historical.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -28,6 +33,10 @@
#include "access/sysattr.h"
#include "catalog/pg_type.h"
#include "nodes/makefuncs.h"
+#ifdef XCP
+#include "nodes/nodeFuncs.h"
+#include "optimizer/clauses.h"
+#endif
#include "optimizer/prep.h"
#include "optimizer/tlist.h"
#include "parser/parsetree.h"
@@ -75,6 +84,123 @@ preprocess_targetlist(PlannerInfo *root, List *tlist)
tlist = expand_targetlist(tlist, command_type,
result_relation, range_table);
+#ifdef XCP
+ /*
+ * If target relation is specified set distribution of the plan
+ */
+ if (result_relation)
+ {
+ Relation rel = heap_open(getrelid(result_relation, range_table),
+ NoLock);
+ RelationLocInfo *rel_loc_info = rel->rd_locator_info;
+
+ /* Is target table distributed ? */
+ if (rel_loc_info)
+ {
+ Distribution *distribution = makeNode(Distribution);
+ ListCell *lc;
+
+ distribution->distributionType = rel_loc_info->locatorType;
+ foreach(lc, rel_loc_info->nodeList)
+ distribution->nodes = bms_add_member(distribution->nodes,
+ lfirst_int(lc));
+ distribution->restrictNodes = NULL;
+ if (rel_loc_info->partAttrNum)
+ {
+ /*
+ * For INSERT and UPDATE plan tlist is matching the target table
+ * layout
+ */
+ if (command_type == CMD_INSERT || command_type == CMD_UPDATE)
+ {
+ TargetEntry *keyTle;
+ keyTle = (TargetEntry *) list_nth(tlist,
+ rel_loc_info->partAttrNum - 1);
+
+ distribution->distributionExpr = (Node *) keyTle->expr;
+
+ /*
+ * We can restrict the distribution if the expression
+ * is evaluated to a constant
+ */
+ if (command_type == CMD_INSERT)
+ {
+ Oid keytype;
+ Const *constExpr = NULL;
+
+ keytype = exprType(distribution->distributionExpr);
+ constExpr = (Const *) eval_const_expressions(root,
+ distribution->distributionExpr);
+ if (IsA(constExpr, Const) &&
+ constExpr->consttype == keytype)
+ {
+ List *nodeList = NIL;
+ Bitmapset *tmpset = bms_copy(distribution->nodes);
+ Bitmapset *restrictinfo = NULL;
+ Locator *locator;
+ int *nodenums;
+ int i, count;
+
+ while((i = bms_first_member(tmpset)) >= 0)
+ nodeList = lappend_int(nodeList, i);
+ bms_free(tmpset);
+
+ locator = createLocator(distribution->distributionType,
+ RELATION_ACCESS_INSERT,
+ keytype,
+ LOCATOR_LIST_LIST,
+ 0,
+ (void *) nodeList,
+ (void **) &nodenums,
+ false);
+ count = GET_NODES(locator, constExpr->constvalue,
+ constExpr->constisnull, NULL);
+
+ for (i = 0; i < count; i++)
+ restrictinfo = bms_add_member(restrictinfo, nodenums[i]);
+ distribution->restrictNodes = restrictinfo;
+ list_free(nodeList);
+ freeLocator(locator);
+ }
+ }
+ }
+
+ /*
+ * For delete we need to add the partitioning key of the target
+ * table to the tlist, so distribution can be correctly handled
+ * through all the planning process.
+ */
+ if (command_type == CMD_DELETE)
+ {
+ Form_pg_attribute att_tup;
+ TargetEntry *tle;
+ Var *var;
+
+ att_tup = rel->rd_att->attrs[rel_loc_info->partAttrNum - 1];
+ var = makeVar(result_relation, rel_loc_info->partAttrNum,
+ att_tup->atttypid, att_tup->atttypmod,
+ att_tup->attcollation, 0);
+
+ tle = makeTargetEntry((Expr *) var,
+ list_length(tlist) + 1,
+ pstrdup(NameStr(att_tup->attname)),
+ true);
+ tlist = lappend(tlist, tle);
+ distribution->distributionExpr = (Node *) var;
+ }
+ }
+ else
+ distribution->distributionExpr = NULL;
+
+ root->distribution = distribution;
+ }
+ else
+ root->distribution = NULL;
+
+ heap_close(rel, NoLock);
+ }
+#endif
+
/*
* Add necessary junk columns for rowmarked rels. These values are needed
* for locking of rels selected FOR UPDATE/SHARE, and to do EvalPlanQual
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 6475633ae7..a77b86d0e8 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -17,6 +17,11 @@
* append relations, and thenceforth share code with the UNION ALL case.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -243,6 +248,16 @@ recurse_set_operations(Node *setOp, PlannerInfo *root,
root,
false, tuple_fraction,
&subroot);
+#ifdef XCP
+ if (subroot->distribution)
+ {
+ subplan = (Plan *) make_remotesubplan(subroot,
+ subplan,
+ NULL,
+ subroot->distribution,
+ subroot->query_pathkeys);
+ }
+#endif
/* Save subroot and subplan in RelOptInfo for setrefs.c */
rel->subplan = subplan;
diff --git a/src/backend/optimizer/util/Makefile b/src/backend/optimizer/util/Makefile
index 37244ad0be..3b2d16b635 100644
--- a/src/backend/optimizer/util/Makefile
+++ b/src/backend/optimizer/util/Makefile
@@ -13,6 +13,6 @@ top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
OBJS = clauses.o joininfo.o pathnode.o placeholder.o plancat.o predtest.o \
- relnode.o restrictinfo.o tlist.o var.o pgxcship.o
+ relnode.o restrictinfo.o tlist.o var.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index ef3a50d82f..cfda133805 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -3,6 +3,11 @@
* pathnode.c
* Routines to manipulate pathlists and create path nodes
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -26,12 +31,15 @@
#include "optimizer/tlist.h"
#include "parser/parsetree.h"
#include "utils/lsyscache.h"
-#include "utils/syscache.h"
#include "utils/selfuncs.h"
-#ifdef PGXC
-#include "commands/tablecmds.h"
-#include "optimizer/restrictinfo.h"
-#endif /* PGXC */
+#ifdef XCP
+#include "access/heapam.h"
+#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
+#include "pgxc/locator.h"
+#include "pgxc/nodemgr.h"
+#include "utils/rel.h"
+#endif
typedef enum
@@ -46,7 +54,15 @@ static void add_parameterized_path(RelOptInfo *parent_rel, Path *new_path);
static List *translate_sub_tlist(List *tlist, int relid);
static bool query_is_distinct_for(Query *query, List *colnos, List *opids);
static Oid distinct_col_search(int colno, List *colnos, List *opids);
-
+#ifdef XCP
+static void restrict_distribution(PlannerInfo *root, RestrictInfo *ri,
+ Path *pathnode);
+static Path *redistribute_path(Path *subpath, char distributionType,
+ Bitmapset *nodes, Bitmapset *restrictNodes,
+ Node* distributionExpr);
+static void set_scanpath_distribution(PlannerInfo *root, RelOptInfo *rel, Path *pathnode);
+static List *set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode);
+#endif
/*****************************************************************************
* MISC. PATH UTILITIES
@@ -727,6 +743,926 @@ add_parameterized_path(RelOptInfo *parent_rel, Path *new_path)
/*****************************************************************************
* PATH NODE CREATION ROUTINES
*****************************************************************************/
+#ifdef XCP
+/*
+ * restrict_distribution
+ * Analyze the RestrictInfo and decide if it is possible to restrict
+ * distribution nodes
+ */
+static void
+restrict_distribution(PlannerInfo *root, RestrictInfo *ri,
+ Path *pathnode)
+{
+ Distribution *distribution = pathnode->distribution;
+ Oid keytype;
+ Const *constExpr = NULL;
+ bool found_key = false;
+
+ /*
+ * Can not restrict - not distributed or key is not defined
+ */
+ if (distribution == NULL ||
+ distribution->distributionExpr == NULL)
+ return;
+
+ /*
+ * We do not support OR'ed conditions yet
+ */
+ if (ri->orclause)
+ return;
+
+ keytype = exprType(distribution->distributionExpr);
+ if (ri->left_ec)
+ {
+ EquivalenceClass *ec = ri->left_ec;
+ ListCell *lc;
+ foreach(lc, ec->ec_members)
+ {
+ EquivalenceMember *em = (EquivalenceMember *) lfirst(lc);
+ if (equal(em->em_expr, distribution->distributionExpr))
+ found_key = true;
+ else if (bms_is_empty(em->em_relids))
+ {
+ Expr *cexpr = (Expr *) eval_const_expressions(root,
+ (Node *) em->em_expr);
+ if (IsA(cexpr, Const) &&
+ ((Const *) cexpr)->consttype == keytype)
+ constExpr = (Const *) cexpr;
+ }
+ }
+ }
+ if (ri->right_ec)
+ {
+ EquivalenceClass *ec = ri->right_ec;
+ ListCell *lc;
+ foreach(lc, ec->ec_members)
+ {
+ EquivalenceMember *em = (EquivalenceMember *) lfirst(lc);
+ if (equal(em->em_expr, distribution->distributionExpr))
+ found_key = true;
+ else if (bms_is_empty(em->em_relids))
+ {
+ Expr *cexpr = (Expr *) eval_const_expressions(root,
+ (Node *) em->em_expr);
+ if (IsA(cexpr, Const) &&
+ ((Const *) cexpr)->consttype == keytype)
+ constExpr = (Const *) cexpr;
+ }
+ }
+ }
+ if (IsA(ri->clause, OpExpr))
+ {
+ OpExpr *opexpr = (OpExpr *) ri->clause;
+ if (opexpr->args->length == 2 &&
+ op_mergejoinable(opexpr->opno, exprType(linitial(opexpr->args))))
+ {
+ Expr *arg1 = (Expr *) linitial(opexpr->args);
+ Expr *arg2 = (Expr *) lsecond(opexpr->args);
+ Expr *other = NULL;
+ if (equal(arg1, distribution->distributionExpr))
+ other = arg2;
+ else if (equal(arg2, distribution->distributionExpr))
+ other = arg1;
+ if (other)
+ {
+ found_key = true;
+ other = (Expr *) eval_const_expressions(root, (Node *) other);
+ if (IsA(other, Const) &&
+ ((Const *) other)->consttype == keytype)
+ constExpr = (Const *) other;
+ }
+ }
+ }
+ if (found_key && constExpr)
+ {
+ List *nodeList = NIL;
+ Bitmapset *tmpset = bms_copy(distribution->nodes);
+ Bitmapset *restrictinfo = NULL;
+ Locator *locator;
+ int *nodenums;
+ int i, count;
+
+ while((i = bms_first_member(tmpset)) >= 0)
+ nodeList = lappend_int(nodeList, i);
+ bms_free(tmpset);
+
+ locator = createLocator(distribution->distributionType,
+ RELATION_ACCESS_READ,
+ keytype,
+ LOCATOR_LIST_LIST,
+ 0,
+ (void *) nodeList,
+ (void **) &nodenums,
+ false);
+ count = GET_NODES(locator, constExpr->constvalue,
+ constExpr->constisnull, NULL);
+
+ for (i = 0; i < count; i++)
+ restrictinfo = bms_add_member(restrictinfo, nodenums[i]);
+ if (distribution->restrictNodes)
+ distribution->restrictNodes = bms_intersect(distribution->restrictNodes,
+ restrictinfo);
+ else
+ distribution->restrictNodes = restrictinfo;
+ list_free(nodeList);
+ freeLocator(locator);
+ }
+}
+
+/*
+ * set_scanpath_distribution
+ * Assign distribution to the path which is a base relation scan.
+ */
+static void
+set_scanpath_distribution(PlannerInfo *root, RelOptInfo *rel, Path *pathnode)
+{
+ RangeTblEntry *rte;
+ RelationLocInfo *rel_loc_info;
+
+ rte = planner_rt_fetch(rel->relid, root);
+ rel_loc_info = GetRelationLocInfo(rte->relid);
+ if (rel_loc_info)
+ {
+ ListCell *lc;
+ Distribution *distribution = makeNode(Distribution);
+ distribution->distributionType = rel_loc_info->locatorType;
+ foreach(lc, rel_loc_info->nodeList)
+ distribution->nodes = bms_add_member(distribution->nodes,
+ lfirst_int(lc));
+ distribution->restrictNodes = NULL;
+ /*
+ * Distribution expression of the base relation is Var representing
+ * respective attribute.
+ */
+ distribution->distributionExpr = NULL;
+ if (rel_loc_info->partAttrNum)
+ {
+ Var *var = NULL;
+ ListCell *lc;
+
+ /* Look if the Var is already in the target list */
+ foreach (lc, rel->reltargetlist)
+ {
+ var = (Var *) lfirst(lc);
+ if (IsA(var, Var) && var->varno == rel->relid &&
+ var->varattno == rel_loc_info->partAttrNum)
+ break;
+ }
+ /* If not found we should look up the attribute and make the Var */
+ if (!lc)
+ {
+ Relation relation = heap_open(rte->relid, NoLock);
+ TupleDesc tdesc = RelationGetDescr(relation);
+ Form_pg_attribute att_tup;
+
+ att_tup = tdesc->attrs[rel_loc_info->partAttrNum - 1];
+ var = makeVar(rel->relid, rel_loc_info->partAttrNum,
+ att_tup->atttypid, att_tup->atttypmod,
+ att_tup->attcollation, 0);
+
+
+ heap_close(relation, NoLock);
+ }
+
+ distribution->distributionExpr = (Node *) var;
+ }
+ pathnode->distribution = distribution;
+ }
+}
+
+
+/*
+ * Set a RemoteSubPath on top of the specified node and set specified
+ * distribution to it
+ */
+static Path *
+redistribute_path(Path *subpath, char distributionType,
+ Bitmapset *nodes, Bitmapset *restrictNodes,
+ Node* distributionExpr)
+{
+ Distribution *distribution = NULL;
+ RelOptInfo *rel = subpath->parent;
+ RemoteSubPath *pathnode;
+
+ if (distributionType != LOCATOR_TYPE_NONE)
+ {
+ distribution = makeNode(Distribution);
+ distribution->distributionType = distributionType;
+ distribution->nodes = nodes;
+ distribution->restrictNodes = restrictNodes;
+ distribution->distributionExpr = distributionExpr;
+ }
+
+ /*
+ * If inner path node is a MaterialPath pull it up to store tuples on
+ * the destination nodes and avoid sending them over the network.
+ */
+ if (IsA(subpath, MaterialPath))
+ {
+ MaterialPath *mpath = (MaterialPath *) subpath;
+ /* If subpath is already a RemoteSubPath, just replace distribution */
+ if (IsA(mpath->subpath, RemoteSubPath))
+ {
+ pathnode = (RemoteSubPath *) mpath->subpath;
+ }
+ else
+ {
+ pathnode = makeNode(RemoteSubPath);
+ pathnode->path.pathtype = T_RemoteSubplan;
+ pathnode->path.parent = rel;
+ pathnode->path.param_info = subpath->param_info;
+ pathnode->path.pathkeys = subpath->pathkeys;
+ pathnode->subpath = mpath->subpath;
+ mpath->subpath = (Path *) pathnode;
+ }
+ subpath = pathnode->subpath;
+ pathnode->path.distribution = distribution;
+ mpath->path.distribution = (Distribution *) copyObject(distribution);
+ /* (re)calculate costs */
+ cost_remote_subplan((Path *) pathnode, subpath->startup_cost,
+ subpath->total_cost, subpath->rows, rel->width,
+ IsLocatorReplicated(distributionType) ?
+ bms_num_members(nodes) : 1);
+ mpath->subpath = (Path *) pathnode;
+ cost_material(&mpath->path,
+ pathnode->path.startup_cost,
+ pathnode->path.total_cost,
+ pathnode->path.rows,
+ rel->width);
+ return (Path *) mpath;
+ }
+ else
+ {
+ pathnode = makeNode(RemoteSubPath);
+ pathnode->path.pathtype = T_RemoteSubplan;
+ pathnode->path.parent = rel;
+ pathnode->path.param_info = subpath->param_info;
+ pathnode->path.pathkeys = subpath->pathkeys;
+ pathnode->subpath = subpath;
+ pathnode->path.distribution = distribution;
+ cost_remote_subplan((Path *) pathnode, subpath->startup_cost,
+ subpath->total_cost, subpath->rows, rel->width,
+ IsLocatorReplicated(distributionType) ?
+ bms_num_members(nodes) : 1);
+ return (Path *) pathnode;
+ }
+}
+
+
+static JoinPath *
+flatCopyJoinPath(JoinPath *pathnode)
+{
+ JoinPath *newnode;
+ size_t size = 0;
+ switch(nodeTag(pathnode))
+ {
+ case T_NestPath:
+ size = sizeof(NestPath);
+ break;
+ case T_MergePath:
+ size = sizeof(MergePath);
+ break;
+ case T_HashPath:
+ size = sizeof(HashPath);
+ break;
+ default:
+ elog(ERROR, "unrecognized node type: %d", (int) nodeTag(pathnode));
+ break;
+ }
+ newnode = (JoinPath *) palloc(size);
+ memcpy(newnode, pathnode, size);
+ return newnode;
+}
+
+
+/*
+ * Analyze join parameters and set distribution of the join node.
+ * If there are possible alternate distributions the respective paths are
+ * returned as a list so caller can cost all of them and choose cheapest to
+ * continue.
+ */
+static List *
+set_joinpath_distribution(PlannerInfo *root, JoinPath *pathnode)
+{
+ Distribution *innerd = pathnode->innerjoinpath->distribution;
+ Distribution *outerd = pathnode->outerjoinpath->distribution;
+ Distribution *targetd;
+ List *alternate = NIL;
+
+ /* Catalog join */
+ if (innerd == NULL && outerd == NULL)
+ return NIL;
+
+ /*
+ * If both subpaths are distributed by replication, the resulting
+ * distribution will be replicated on smallest common set of nodes.
+ * Catalog tables are the same on all nodes, so treat them as replicated
+ * on all nodes.
+ */
+ if ((!innerd || IsLocatorReplicated(innerd->distributionType)) &&
+ (!outerd || IsLocatorReplicated(outerd->distributionType)))
+ {
+ /* Determine common nodes */
+ Bitmapset *common;
+
+ if (innerd == NULL)
+ common = bms_copy(outerd->nodes);
+ else if (outerd == NULL)
+ common = bms_copy(innerd->nodes);
+ else
+ common = bms_intersect(innerd->nodes, outerd->nodes);
+ if (bms_is_empty(common))
+ goto not_allowed_join;
+
+ /*
+ * Join result is replicated on common nodes. Running query on any
+ * of them produce correct result.
+ */
+ targetd = makeNode(Distribution);
+ targetd->distributionType = LOCATOR_TYPE_REPLICATED;
+ targetd->nodes = common;
+ targetd->restrictNodes = NULL;
+ pathnode->path.distribution = targetd;
+ return alternate;
+ }
+
+ /*
+ * Check if we have inner replicated
+ * The "both replicated" case is already checked, so if innerd
+ * is replicated, then outerd is not replicated and it is not NULL.
+ * This case is not acceptable for some join types. If outer relation is
+ * nullable data nodes will produce joined rows with NULLs for cases when
+ * matching row exists, but on other data node.
+ */
+ if ((!innerd || IsLocatorReplicated(innerd->distributionType)) &&
+ (pathnode->jointype == JOIN_INNER ||
+ pathnode->jointype == JOIN_LEFT ||
+ pathnode->jointype == JOIN_SEMI ||
+ pathnode->jointype == JOIN_ANTI))
+ {
+ /* We need the inner relation to be defined on all nodes where the outer is */
+ if (innerd && !bms_is_subset(outerd->nodes, innerd->nodes))
+ goto not_allowed_join;
+
+ targetd = makeNode(Distribution);
+ targetd->distributionType = outerd->distributionType;
+ targetd->nodes = bms_copy(outerd->nodes);
+ targetd->restrictNodes = bms_copy(outerd->restrictNodes);
+ targetd->distributionExpr = outerd->distributionExpr;
+ pathnode->path.distribution = targetd;
+ return alternate;
+ }
+
+
+ /*
+ * Check if we have outer replicated
+ * The "both replicated" case is already checked, so if outerd
+ * is replicated, then innerd is not replicated and it is not NULL.
+ * This case is not acceptable for some join types. If inner relation is
+ * nullable data nodes will produce joined rows with NULLs for cases when
+ * matching row exists, but on other data node.
+ */
+ if ((!outerd || IsLocatorReplicated(outerd->distributionType)) &&
+ (pathnode->jointype == JOIN_INNER ||
+ pathnode->jointype == JOIN_RIGHT))
+ {
+ /* We need the outer relation to be defined on all nodes where the inner is */
+ if (outerd && !bms_is_subset(innerd->nodes, outerd->nodes))
+ goto not_allowed_join;
+
+ targetd = makeNode(Distribution);
+ targetd->distributionType = innerd->distributionType;
+ targetd->nodes = bms_copy(innerd->nodes);
+ targetd->restrictNodes = bms_copy(innerd->restrictNodes);
+ targetd->distributionExpr = innerd->distributionExpr;
+ pathnode->path.distribution = targetd;
+ return alternate;
+ }
+
+
+ /*
+ * This join is still allowed if inner and outer paths have
+ * equivalent distribution and joined along the distribution keys.
+ */
+ if (innerd && outerd &&
+ innerd->distributionType == outerd->distributionType &&
+ innerd->distributionExpr &&
+ outerd->distributionExpr &&
+ bms_equal(innerd->nodes, outerd->nodes))
+ {
+ ListCell *lc;
+
+ /*
+ * Make sure distribution functions are the same, for now they depend
+ * on data type
+ */
+ if (exprType((Node *) innerd->distributionExpr) != exprType((Node *) outerd->distributionExpr))
+ goto not_allowed_join;
+
+ /*
+ * Planner already did necessary work and if there is a join
+ * condition like left.key=right.key the key expressions
+ * will be members of the same equivalence class, and both
+ * sides of the corresponding RestrictInfo will refer that
+ * Equivalence Class.
+ * Try to figure out if such restriction exists.
+ */
+ foreach(lc, pathnode->joinrestrictinfo)
+ {
+ RestrictInfo *ri = (RestrictInfo *) lfirst(lc);
+ ListCell *emc;
+ bool found_outer, found_inner;
+
+ /*
+ * Restriction operator is not equality operator ?
+ */
+ if (ri->left_ec == NULL || ri->right_ec == NULL)
+ continue;
+
+ /*
+ * A restriction with OR may be compatible if all OR'ed
+ * conditions are compatible. For the moment we do not
+ * check this and skip the restriction. The case where multiple
+ * OR'ed conditions are compatible is rare and probably
+ * not worth handling at all.
+ */
+ if (ri->orclause)
+ continue;
+
+ found_outer = false;
+ found_inner = false;
+
+ /*
+ * If parts belong to the same equivalence member check
+ * if both distribution keys are members of the class.
+ */
+ if (ri->left_ec == ri->right_ec)
+ {
+ foreach(emc, ri->left_ec->ec_members)
+ {
+ EquivalenceMember *em = (EquivalenceMember *) lfirst(emc);
+ Expr *var = (Expr *)em->em_expr;
+ if (!found_outer)
+ found_outer = equal(var, outerd->distributionExpr);
+
+ if (!found_inner)
+ found_inner = equal(var, innerd->distributionExpr);
+ }
+ if (found_outer && found_inner)
+ {
+ ListCell *tlc, *emc;
+
+ targetd = makeNode(Distribution);
+ targetd->distributionType = innerd->distributionType;
+ targetd->nodes = bms_copy(innerd->nodes);
+ targetd->restrictNodes = bms_copy(innerd->restrictNodes);
+ targetd->distributionExpr = NULL;
+ pathnode->path.distribution = targetd;
+
+ /*
+ * Each member of the equivalence class may be a
+ * distribution expression, but we prefer some from the
+ * target list.
+ */
+ foreach(tlc, pathnode->path.parent->reltargetlist)
+ {
+ Expr *var = (Expr *) lfirst(tlc);
+ foreach(emc, ri->left_ec->ec_members)
+ {
+ EquivalenceMember *em;
+ Expr *emvar;
+
+ em = (EquivalenceMember *) lfirst(emc);
+ emvar = (Expr *)em->em_expr;
+ if (equal(var, emvar))
+ {
+ targetd->distributionExpr = (Node *) var;
+ return alternate;
+ }
+ }
+ }
+ /* Not found, take any */
+ targetd->distributionExpr = innerd->distributionExpr;
+ return alternate;
+ }
+ }
+ /*
+ * Check clause, if both arguments are distribution keys and
+ * operator is an equality operator
+ */
+ else
+ {
+ OpExpr *op_exp;
+ Expr *arg1,
+ *arg2;
+
+ op_exp = (OpExpr *) ri->clause;
+ if (!IsA(op_exp, OpExpr) || list_length(op_exp->args) != 2)
+ continue;
+
+ arg1 = (Expr *) linitial(op_exp->args);
+ arg2 = (Expr *) lsecond(op_exp->args);
+
+ found_outer = equal(arg1, outerd->distributionExpr) || equal(arg2, outerd->distributionExpr);
+ found_inner = equal(arg1, innerd->distributionExpr) || equal(arg2, innerd->distributionExpr);
+
+ if (found_outer && found_inner)
+ {
+ targetd = makeNode(Distribution);
+ targetd->distributionType = innerd->distributionType;
+ targetd->nodes = bms_copy(innerd->nodes);
+ targetd->restrictNodes = bms_copy(innerd->restrictNodes);
+ pathnode->path.distribution = targetd;
+
+ /*
+ * In case of outer join distribution key should not refer
+ * distribution key of nullable part.
+ */
+ if (pathnode->jointype == JOIN_FULL)
+ /* both parts are nullable */
+ targetd->distributionExpr = NULL;
+ else if (pathnode->jointype == JOIN_RIGHT)
+ targetd->distributionExpr = innerd->distributionExpr;
+ else
+ targetd->distributionExpr = outerd->distributionExpr;
+
+ return alternate;
+ }
+ }
+ }
+ }
+
+ /*
+ * If we could not determine the distribution, redistribute the subpaths.
+ */
+not_allowed_join:
+ /*
+ * If redistribution is required, sometimes the cheapest path would be if
+ * one of the subplans is replicated. If replication of any or all subplans
+ * is possible, return resulting plans as alternates. Try to distribute all
+ * by hash as the main variant.
+ */
+
+ /* These join types allow replicated inner */
+ if (outerd &&
+ (pathnode->jointype == JOIN_INNER ||
+ pathnode->jointype == JOIN_LEFT ||
+ pathnode->jointype == JOIN_SEMI ||
+ pathnode->jointype == JOIN_ANTI))
+ {
+ /*
+ * Since we discard all alternate paths except one it is OK if they all
+ * reference the same objects
+ */
+ JoinPath *altpath = flatCopyJoinPath(pathnode);
+ /* Redistribute inner subquery */
+ altpath->innerjoinpath = redistribute_path(
+ altpath->innerjoinpath,
+ LOCATOR_TYPE_REPLICATED,
+ bms_copy(outerd->nodes),
+ bms_copy(outerd->restrictNodes),
+ NULL);
+ targetd = makeNode(Distribution);
+ targetd->distributionType = outerd->distributionType;
+ targetd->nodes = bms_copy(outerd->nodes);
+ targetd->restrictNodes = bms_copy(outerd->restrictNodes);
+ targetd->distributionExpr = outerd->distributionExpr;
+ altpath->path.distribution = targetd;
+ alternate = lappend(alternate, altpath);
+ }
+
+ /* These join types allow replicated outer */
+ if (innerd &&
+ (pathnode->jointype == JOIN_INNER ||
+ pathnode->jointype == JOIN_RIGHT))
+ {
+ /*
+ * Since we discard all alternate paths except one it is OK if they all
+ * reference the same objects
+ */
+ JoinPath *altpath = flatCopyJoinPath(pathnode);
+ /* Redistribute inner subquery */
+ altpath->outerjoinpath = redistribute_path(
+ altpath->outerjoinpath,
+ LOCATOR_TYPE_REPLICATED,
+ bms_copy(innerd->nodes),
+ bms_copy(innerd->restrictNodes),
+ NULL);
+ targetd = makeNode(Distribution);
+ targetd->distributionType = innerd->distributionType;
+ targetd->nodes = bms_copy(innerd->nodes);
+ targetd->restrictNodes = bms_copy(innerd->restrictNodes);
+ targetd->distributionExpr = innerd->distributionExpr;
+ altpath->path.distribution = targetd;
+ alternate = lappend(alternate, altpath);
+ }
+
+ /*
+ * Redistribute subplans to make them compatible.
+ * If any of the subplans is a coordinator subplan skip this stuff and do
+ * coordinator join.
+ */
+ if (innerd && outerd)
+ {
+ RestrictInfo *preferred = NULL;
+ Expr *new_inner_key = NULL;
+ Expr *new_outer_key = NULL;
+ char distType = LOCATOR_TYPE_NONE;
+ ListCell *lc;
+
+ /*
+ * Look through the join restrictions to find one that is a hashable
+ * operator on two arguments. Choose the best restriction according to
+ * the following criteria:
+ * 1. one argument is already a partitioning key of one subplan.
+ * 2. restriction is cheaper to calculate
+ */
+ foreach(lc, pathnode->joinrestrictinfo)
+ {
+ RestrictInfo *ri = (RestrictInfo *) lfirst(lc);
+
+ /* can not handle ORed conditions */
+ if (ri->orclause)
+ continue;
+
+ if (IsA(ri->clause, OpExpr))
+ {
+ OpExpr *expr = (OpExpr *) ri->clause;
+ if (list_length(expr->args) == 2 &&
+ op_hashjoinable(expr->opno, exprType(linitial(expr->args))))
+ {
+ Expr *left = (Expr *) linitial(expr->args);
+ Expr *right = (Expr *) lsecond(expr->args);
+ Oid leftType = exprType((Node *) left);
+ Oid rightType = exprType((Node *) right);
+ Relids inner_rels = pathnode->innerjoinpath->parent->relids;
+ Relids outer_rels = pathnode->outerjoinpath->parent->relids;
+ QualCost cost;
+
+ /*
+ * Check if both parts are of the same data type and choose
+ * distribution type to redistribute.
+ * XXX We may want more sophisticated algorithm to choose
+ * the best condition to redistribute parts along.
+ * For now use simple but reliable approach.
+ */
+ if (leftType != rightType)
+ continue;
+ /*
+ * Evaluation cost will be needed to choose preferred
+ * distribution
+ */
+ cost_qual_eval_node(&cost, (Node *) ri, root);
+
+ if (outerd->distributionExpr)
+ {
+ /*
+ * If left side is distribution key of outer subquery
+ * and right expression refers only inner subquery
+ */
+ if (equal(outerd->distributionExpr, left) &&
+ bms_is_subset(ri->right_relids, inner_rels))
+ {
+ if (!preferred || /* no preferred restriction yet found */
+ (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */
+ (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */
+ {
+ /* set new preferred restriction */
+ preferred = ri;
+ new_inner_key = right;
+ new_outer_key = NULL; /* no need to change */
+ distType = outerd->distributionType;
+ }
+ continue;
+ }
+ /*
+ * If right side is distribution key of outer subquery
+ * and left expression refers only inner subquery
+ */
+ if (equal(outerd->distributionExpr, right) &&
+ bms_is_subset(ri->left_relids, inner_rels))
+ {
+ if (!preferred || /* no preferred restriction yet found */
+ (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */
+ (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */
+ {
+ /* set new preferred restriction */
+ preferred = ri;
+ new_inner_key = left;
+ new_outer_key = NULL; /* no need to change */
+ distType = outerd->distributionType;
+ }
+ continue;
+ }
+ }
+ if (innerd->distributionExpr)
+ {
+ /*
+ * If left side is distribution key of inner subquery
+ * and right expression refers only outer subquery
+ */
+ if (equal(innerd->distributionExpr, left) &&
+ bms_is_subset(ri->right_relids, outer_rels))
+ {
+ if (!preferred || /* no preferred restriction yet found */
+ (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */
+ (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */
+ {
+ /* set new preferred restriction */
+ preferred = ri;
+ new_inner_key = NULL; /* no need to change */
+ new_outer_key = right;
+ distType = innerd->distributionType;
+ }
+ continue;
+ }
+ /*
+ * If right side is distribution key of inner subquery
+ * and left expression refers only outer subquery
+ */
+ if (equal(innerd->distributionExpr, right) &&
+ bms_is_subset(ri->left_relids, outer_rels))
+ {
+ if (!preferred || /* no preferred restriction yet found */
+ (new_inner_key && new_outer_key) || /* preferred restriction require redistribution of both parts */
+ (cost.per_tuple < preferred->eval_cost.per_tuple)) /* current restriction is cheaper */
+ {
+ /* set new preferred restriction */
+ preferred = ri;
+ new_inner_key = NULL; /* no need to change */
+ new_outer_key = left;
+ distType = innerd->distributionType;
+ }
+ continue;
+ }
+ }
+ /*
+ * Current restriction requires redistribution of both parts.
+ * If the preferred restriction requires redistribution of only one,
+ * keep it.
+ */
+ if (preferred &&
+ (new_inner_key == NULL || new_outer_key == NULL))
+ continue;
+
+ /*
+ * Skip this condition if the data type of the expressions
+ * does not allow either HASH or MODULO distribution.
+ * HASH distribution is preferable.
+ */
+ if (IsTypeHashDistributable(leftType))
+ distType = LOCATOR_TYPE_HASH;
+ else if (IsTypeModuloDistributable(leftType))
+ distType = LOCATOR_TYPE_MODULO;
+ else
+ continue;
+ /*
+ * If this restriction is the first found, or is cheaper to
+ * calculate than the preferred one, try to store it as the new
+ * preferred restriction to redistribute along.
+ */
+ if (preferred == NULL ||
+ (cost.per_tuple < preferred->eval_cost.per_tuple))
+ {
+ /*
+ * Left expression depends only on outer subpath and
+ * right expression depends only on inner subpath, so
+ * we can redistribute both and make left expression the
+ * distribution key of outer subplan and right
+ * expression the distribution key of inner subplan
+ */
+ if (bms_is_subset(ri->left_relids, outer_rels) &&
+ bms_is_subset(ri->right_relids, inner_rels))
+ {
+ preferred = ri;
+ new_outer_key = left;
+ new_inner_key = right;
+ }
+ /*
+ * Left expression depends only on inner subpath and
+ * right expression depends only on outer subpath, so
+ * we can redistribute both and make left expression the
+ * distribution key of inner subplan and right
+ * expression the distribution key of outer subplan
+ */
+ if (bms_is_subset(ri->left_relids, inner_rels) &&
+ bms_is_subset(ri->right_relids, outer_rels))
+ {
+ preferred = ri;
+ new_inner_key = left;
+ new_outer_key = right;
+ }
+ }
+ }
+ }
+ }
+ /* If we have suitable restriction we can repartition accordingly */
+ if (preferred)
+ {
+ Bitmapset *nodes = NULL;
+ Bitmapset *restrictNodes = NULL;
+
+ /* If we redistribute both parts do join on all nodes ... */
+ if (new_inner_key && new_outer_key)
+ {
+ int i;
+ for (i = 0; i < NumDataNodes; i++)
+ nodes = bms_add_member(nodes, i);
+ }
+ /*
+ * ... if we do only one of them redistribute it on the same nodes
+ * as other.
+ */
+ else if (new_inner_key)
+ {
+ nodes = bms_copy(outerd->nodes);
+ restrictNodes = bms_copy(outerd->restrictNodes);
+ }
+ else /*if (new_outer_key)*/
+ {
+ nodes = bms_copy(innerd->nodes);
+ restrictNodes = bms_copy(innerd->restrictNodes);
+ }
+
+ /*
+ * Redistribute join by hash, and, if jointype allows, create
+ * alternate path where inner subplan is distributed by replication
+ */
+ if (new_inner_key)
+ {
+ /* Redistribute inner subquery */
+ pathnode->innerjoinpath = redistribute_path(
+ pathnode->innerjoinpath,
+ distType,
+ nodes,
+ restrictNodes,
+ (Node *) new_inner_key);
+ }
+ /*
+ * Redistribute join by hash, and, if jointype allows, create
+ * alternate path where outer subplan is distributed by replication
+ */
+ if (new_outer_key)
+ {
+ /* Redistribute outer subquery */
+ pathnode->outerjoinpath = redistribute_path(
+ pathnode->outerjoinpath,
+ distType,
+ nodes,
+ restrictNodes,
+ (Node *) new_outer_key);
+ }
+ targetd = makeNode(Distribution);
+ targetd->distributionType = distType;
+ targetd->nodes = nodes;
+ targetd->restrictNodes = NULL;
+ pathnode->path.distribution = targetd;
+ /*
+ * In case of an outer join the distribution key should not refer to
+ * the distribution key of the nullable part.
+ * NB: we should not refer to innerd and outerd here, subpaths are
+ * redistributed already
+ */
+ if (pathnode->jointype == JOIN_FULL)
+ /* both parts are nullable */
+ targetd->distributionExpr = NULL;
+ else if (pathnode->jointype == JOIN_RIGHT)
+ targetd->distributionExpr =
+ pathnode->innerjoinpath->distribution->distributionExpr;
+ else
+ targetd->distributionExpr =
+ pathnode->outerjoinpath->distribution->distributionExpr;
+
+ return alternate;
+ }
+ }
+
+ /*
+ * Build cartesian product, if no hashable restriction is found.
+ * Perform coordinator join in such cases. If this join would be a part of
+ * larger join, it will be handled as replicated.
+ * To do that leave join distribution NULL and place a RemoteSubPath node on
+ * top of each subpath to provide access to joined result sets.
+ * Do not redistribute paths that already have NULL distribution, this is
+ * possible if performing outer join on a coordinator and a datanode
+ * relations.
+ */
+ if (innerd)
+ pathnode->innerjoinpath = redistribute_path(pathnode->innerjoinpath,
+ LOCATOR_TYPE_NONE,
+ NULL,
+ NULL,
+ NULL);
+ if (outerd)
+ pathnode->outerjoinpath = redistribute_path(pathnode->outerjoinpath,
+ LOCATOR_TYPE_NONE,
+ NULL,
+ NULL,
+ NULL);
+ return alternate;
+}
+#endif
+
/*
* create_seqscan_path
@@ -744,6 +1680,19 @@ create_seqscan_path(PlannerInfo *root, RelOptInfo *rel, Relids required_outer)
required_outer);
pathnode->pathkeys = NIL; /* seqscan has unordered result */
+#ifdef XCP
+ set_scanpath_distribution(root, rel, pathnode);
+ if (rel->baserestrictinfo)
+ {
+ ListCell *lc;
+ foreach (lc, rel->baserestrictinfo)
+ {
+ RestrictInfo *ri = (RestrictInfo *) lfirst(lc);
+ restrict_distribution(root, ri, pathnode);
+ }
+ }
+#endif
+
cost_seqscan(pathnode, root, rel, pathnode->param_info);
return pathnode;
@@ -810,6 +1759,18 @@ create_index_path(PlannerInfo *root,
pathnode->indexorderbycols = indexorderbycols;
pathnode->indexscandir = indexscandir;
+#ifdef XCP
+ set_scanpath_distribution(root, rel, (Path *) pathnode);
+ if (indexclauses)
+ {
+ ListCell *lc;
+ foreach (lc, indexclauses)
+ {
+ RestrictInfo *ri = (RestrictInfo *) lfirst(lc);
+ restrict_distribution(root, ri, (Path *) pathnode);
+ }
+ }
+#endif
cost_index(pathnode, root, loop_count);
return pathnode;
@@ -844,6 +1805,19 @@ create_bitmap_heap_path(PlannerInfo *root,
pathnode->bitmapqual = bitmapqual;
+#ifdef XCP
+ set_scanpath_distribution(root, rel, (Path *) pathnode);
+ if (rel->baserestrictinfo)
+ {
+ ListCell *lc;
+ foreach (lc, rel->baserestrictinfo)
+ {
+ RestrictInfo *ri = (RestrictInfo *) lfirst(lc);
+ restrict_distribution(root, ri, (Path *) pathnode);
+ }
+ }
+#endif
+
cost_bitmap_heap_scan(&pathnode->path, root, rel,
pathnode->path.param_info,
bitmapqual, loop_count);
@@ -869,6 +1843,10 @@ create_bitmap_and_path(PlannerInfo *root,
pathnode->bitmapquals = bitmapquals;
+#ifdef XCP
+ set_scanpath_distribution(root, rel, (Path *) pathnode);
+#endif
+
/* this sets bitmapselectivity as well as the regular cost fields: */
cost_bitmap_and_node(pathnode, root);
@@ -893,6 +1871,10 @@ create_bitmap_or_path(PlannerInfo *root,
pathnode->bitmapquals = bitmapquals;
+#ifdef XCP
+ set_scanpath_distribution(root, rel, (Path *) pathnode);
+#endif
+
/* this sets bitmapselectivity as well as the regular cost fields: */
cost_bitmap_or_node(pathnode, root);
@@ -915,6 +1897,13 @@ create_tidscan_path(PlannerInfo *root, RelOptInfo *rel, List *tidquals)
pathnode->tidquals = tidquals;
+#ifdef XCP
+ set_scanpath_distribution(root, rel, (Path *) pathnode);
+ /* We may need to pass info about target node to support */
+ if (pathnode->path.distribution)
+ elog(ERROR, "could not perform TID scan on remote relation");
+#endif
+
cost_tidscan(&pathnode->path, root, rel, tidquals);
return pathnode;
@@ -932,6 +1921,10 @@ create_append_path(RelOptInfo *rel, List *subpaths, Relids required_outer)
{
AppendPath *pathnode = makeNode(AppendPath);
ListCell *l;
+#ifdef XCP
+ Distribution *distribution;
+ Path *subpath;
+#endif
pathnode->path.pathtype = T_Append;
pathnode->path.parent = rel;
@@ -939,6 +1932,65 @@ create_append_path(RelOptInfo *rel, List *subpaths, Relids required_outer)
required_outer);
pathnode->path.pathkeys = NIL; /* result is always considered
* unsorted */
+#ifdef XCP
+ /*
+ * Append path is used to implement scans of inherited tables and some
+ * "set" operations, like UNION ALL. While all inherited tables should
+ * have the same distribution, UNION'ed queries may have different.
+ * When paths being appended have the same distribution it is OK to push
+ * Append down to the data nodes. If not, perform "coordinator" Append.
+ */
+
+ /* Special case of the dummy relation, if the subpaths list is empty */
+ if (subpaths)
+ {
+ /* Take distribution of the first node */
+ l = list_head(subpaths);
+ subpath = (Path *) lfirst(l);
+ distribution = copyObject(subpath->distribution);
+ /*
+ * Check remaining subpaths, if all distributions equal to the first set
+ * it as a distribution of the Append path; otherwise make up coordinator
+ * Append
+ */
+ while ((l = lnext(l)))
+ {
+ subpath = (Path *) lfirst(l);
+
+ if (equal(distribution, subpath->distribution))
+ {
+ /*
+ * Both distribution and subpath->distribution may be NULL at
+ * this point, or they both are not null.
+ */
+ if (distribution && subpath->distribution->restrictNodes)
+ distribution->restrictNodes = bms_union(
+ distribution->restrictNodes,
+ subpath->distribution->restrictNodes);
+ }
+ else
+ {
+ break;
+ }
+ }
+ if (l)
+ {
+ List *newsubpaths = NIL;
+ foreach(l, subpaths)
+ {
+ subpath = (Path *) lfirst(l);
+ if (subpath->distribution)
+ subpath = redistribute_path(subpath, LOCATOR_TYPE_NONE,
+ NULL, NULL, NULL);
+ newsubpaths = lappend(newsubpaths, subpath);
+ }
+ subpaths = newsubpaths;
+ pathnode->path.distribution = NULL;
+ }
+ else
+ pathnode->path.distribution = distribution;
+ }
+#endif
pathnode->subpaths = subpaths;
/*
@@ -985,9 +2037,70 @@ create_merge_append_path(PlannerInfo *root,
Cost input_startup_cost;
Cost input_total_cost;
ListCell *l;
+#ifdef XCP
+ Distribution *distribution = NULL;
+ Path *subpath;
+#endif
pathnode->path.pathtype = T_MergeAppend;
pathnode->path.parent = rel;
+#ifdef XCP
+ /*
+ * It is safe to push down MergeAppend if all subpath distributions
+ * are the same and these distributions are Replicated or distribution key
+ * is the expression of the first pathkey.
+ */
+ /* Take distribution of the first node */
+ l = list_head(subpaths);
+ subpath = (Path *) lfirst(l);
+ distribution = copyObject(subpath->distribution);
+ /*
+ * Verify if it is safe to push down MergeAppend with this distribution.
+ * TODO implement check of the second condition (distribution key is the
+ * first pathkey)
+ */
+ if (distribution == NULL || IsLocatorReplicated(distribution->distributionType))
+ {
+ /*
+ * Check remaining subpaths, if all distributions equal to the first set
+ * it as a distribution of the Append path; otherwise make up coordinator
+ * Append
+ */
+ while ((l = lnext(l)))
+ {
+ subpath = (Path *) lfirst(l);
+
+ if (distribution && equal(distribution, subpath->distribution))
+ {
+ if (subpath->distribution->restrictNodes)
+ distribution->restrictNodes = bms_union(
+ distribution->restrictNodes,
+ subpath->distribution->restrictNodes);
+ }
+ else
+ {
+ break;
+ }
+ }
+ }
+ if (l)
+ {
+ List *newsubpaths = NIL;
+ foreach(l, subpaths)
+ {
+ subpath = (Path *) lfirst(l);
+ if (subpath->distribution)
+ subpath = redistribute_path(subpath, LOCATOR_TYPE_NONE,
+ NULL, NULL, NULL);
+ newsubpaths = lappend(newsubpaths, subpath);
+ }
+ subpaths = newsubpaths;
+ pathnode->path.distribution = NULL;
+ }
+ else
+ pathnode->path.distribution = distribution;
+#endif
+
pathnode->path.param_info = get_appendrel_parampathinfo(rel,
required_outer);
pathnode->path.pathkeys = pathkeys;
@@ -1101,6 +2214,10 @@ create_material_path(RelOptInfo *rel, Path *subpath)
pathnode->subpath = subpath;
+#ifdef XCP
+ pathnode->path.distribution = (Distribution *) copyObject(subpath->distribution);
+#endif
+
cost_material(&pathnode->path,
subpath->startup_cost,
subpath->total_cost,
@@ -1297,6 +2414,32 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
if (contain_volatile_functions((Node *) uniq_exprs))
goto no_unique_path;
+#ifdef XCP
+ /*
+ * We may only guarantee uniqueness if the subplan is either replicated
+ * or it is partitioned and one of the unique expressions equals the
+ * distribution expression.
+ */
+ if (subpath->distribution &&
+ !IsLocatorReplicated(subpath->distribution->distributionType))
+ {
+ /* Punt if no distribution key */
+ if (subpath->distribution->distributionExpr == NULL)
+ goto no_unique_path;
+
+ foreach(lc, uniq_exprs)
+ {
+ void *expr = lfirst(lc);
+ if (equal(expr, subpath->distribution->distributionExpr))
+ break;
+ }
+
+ /* XXX we may try and repartition if no matching expression */
+ if (!lc)
+ goto no_unique_path;
+ }
+#endif
+
/*
* If we get here, we can unique-ify using at least one of sorting and
* hashing. Start building the result Path object.
@@ -1317,6 +2460,11 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
pathnode->in_operators = in_operators;
pathnode->uniq_exprs = uniq_exprs;
+#ifdef XCP
+ /* distribution is the same as in the subpath */
+ pathnode->path.distribution = (Distribution *) copyObject(subpath->distribution);
+#endif
+
/*
* If the input is a relation and it has a unique index that proves the
* uniq_exprs are unique, then we don't need to do anything. Note that
@@ -1640,8 +2788,14 @@ distinct_col_search(int colno, List *colnos, List *opids)
* returning the pathnode.
*/
Path *
+#ifdef XCP
+create_subqueryscan_path(PlannerInfo *root, RelOptInfo *rel,
+ List *pathkeys, Relids required_outer,
+ Distribution *distribution)
+#else
create_subqueryscan_path(PlannerInfo *root, RelOptInfo *rel,
List *pathkeys, Relids required_outer)
+#endif
{
Path *pathnode = makeNode(Path);
@@ -1650,6 +2804,9 @@ create_subqueryscan_path(PlannerInfo *root, RelOptInfo *rel,
pathnode->param_info = get_baserel_parampathinfo(root, rel,
required_outer);
pathnode->pathkeys = pathkeys;
+#ifdef XCP
+ pathnode->distribution = distribution;
+#endif
cost_subqueryscan(pathnode, root, rel, pathnode->param_info);
@@ -1737,6 +2894,33 @@ create_worktablescan_path(PlannerInfo *root, RelOptInfo *rel)
return pathnode;
}
+
+#ifdef PGXC
+#ifndef XCP
+/*
+ * create_remotequery_path
+ * Creates a path corresponding to a scan of a remote query,
+ * returning the pathnode.
+ *
+ * Compiled only for PGXC builds without XCP: the XCP planner tracks
+ * path distributions instead of building RemoteQuery paths.
+ *
+ * The resulting path is never parameterized and carries no pathkeys,
+ * since rows arriving from remote nodes are unordered.
+ */
+Path *
+create_remotequery_path(PlannerInfo *root, RelOptInfo *rel)
+{
+ Path *pathnode = makeNode(Path);
+
+ pathnode->pathtype = T_RemoteQuery;
+ pathnode->parent = rel;
+ pathnode->param_info = NULL; /* never parameterized at present */
+ pathnode->pathkeys = NIL; /* result is always unordered */
+
+ /*
+ * PGXCTODO - set cost properly. For now approximate the remote scan
+ * with local sequential-scan costing, which ignores network transfer
+ * costs.
+ */
+ cost_seqscan(pathnode, root, rel, pathnode->param_info);
+
+ return pathnode;
+}
+#endif /* XCP */
+#endif /* PGXC */
+
+
/*
* create_foreignscan_path
* Creates a path corresponding to a scan of a foreign table,
@@ -1856,6 +3040,10 @@ create_nestloop_path(PlannerInfo *root,
Relids required_outer)
{
NestPath *pathnode = makeNode(NestPath);
+#ifdef XCP
+ List *alternate;
+ ListCell *lc;
+#endif
Relids inner_req_outer = PATH_REQ_OUTER(inner_path);
/*
@@ -1900,8 +3088,24 @@ create_nestloop_path(PlannerInfo *root,
pathnode->innerjoinpath = inner_path;
pathnode->joinrestrictinfo = restrict_clauses;
+#ifdef XCP
+ alternate = set_joinpath_distribution(root, pathnode);
+#endif
final_cost_nestloop(root, pathnode, workspace, sjinfo, semifactors);
+#ifdef XCP
+ /*
+ * Also calculate costs of all alternates and return cheapest path
+ */
+ foreach(lc, alternate)
+ {
+ NestPath *altpath = (NestPath *) lfirst(lc);
+ final_cost_nestloop(root, altpath, workspace, sjinfo, semifactors);
+ if (altpath->path.total_cost < pathnode->path.total_cost)
+ pathnode = altpath;
+ }
+#endif
+
return pathnode;
}
@@ -1940,6 +3144,10 @@ create_mergejoin_path(PlannerInfo *root,
List *innersortkeys)
{
MergePath *pathnode = makeNode(MergePath);
+#ifdef XCP
+ List *alternate;
+ ListCell *lc;
+#endif
pathnode->jpath.path.pathtype = T_MergeJoin;
pathnode->jpath.path.parent = joinrel;
@@ -1959,10 +3167,25 @@ create_mergejoin_path(PlannerInfo *root,
pathnode->path_mergeclauses = mergeclauses;
pathnode->outersortkeys = outersortkeys;
pathnode->innersortkeys = innersortkeys;
+#ifdef XCP
+ alternate = set_joinpath_distribution(root, (JoinPath *) pathnode);
+#endif
/* pathnode->materialize_inner will be set by final_cost_mergejoin */
-
final_cost_mergejoin(root, pathnode, workspace, sjinfo);
+#ifdef XCP
+ /*
+ * Also calculate costs of all alternates and return cheapest path
+ */
+ foreach(lc, alternate)
+ {
+ MergePath *altpath = (MergePath *) lfirst(lc);
+ final_cost_mergejoin(root, altpath, workspace, sjinfo);
+ if (altpath->jpath.path.total_cost < pathnode->jpath.path.total_cost)
+ pathnode = altpath;
+ }
+#endif
+
return pathnode;
}
@@ -1996,6 +3219,10 @@ create_hashjoin_path(PlannerInfo *root,
List *hashclauses)
{
HashPath *pathnode = makeNode(HashPath);
+#ifdef XCP
+ List *alternate;
+ ListCell *lc;
+#endif
pathnode->jpath.path.pathtype = T_HashJoin;
pathnode->jpath.path.parent = joinrel;
@@ -2025,10 +3252,25 @@ create_hashjoin_path(PlannerInfo *root,
pathnode->jpath.innerjoinpath = inner_path;
pathnode->jpath.joinrestrictinfo = restrict_clauses;
pathnode->path_hashclauses = hashclauses;
+#ifdef XCP
+ alternate = set_joinpath_distribution(root, (JoinPath *) pathnode);
+#endif
/* final_cost_hashjoin will fill in pathnode->num_batches */
-
final_cost_hashjoin(root, pathnode, workspace, sjinfo, semifactors);
+#ifdef XCP
+ /*
+ * Calculate costs of all alternates and return cheapest path
+ */
+ foreach(lc, alternate)
+ {
+ HashPath *altpath = (HashPath *) lfirst(lc);
+ final_cost_hashjoin(root, altpath, workspace, sjinfo, semifactors);
+ if (altpath->jpath.path.total_cost < pathnode->jpath.path.total_cost)
+ pathnode = altpath;
+ }
+#endif
+
return pathnode;
}
@@ -2093,8 +3335,13 @@ reparameterize_path(PlannerInfo *root, Path *path,
loop_count);
}
case T_SubqueryScan:
+#ifdef XCP
+ return create_subqueryscan_path(root, rel, path->pathkeys,
+ required_outer, path->distribution);
+#else
return create_subqueryscan_path(root, rel, path->pathkeys,
required_outer);
+#endif
default:
break;
}
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 54fe5732da..bc7e8a6096 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -4,6 +4,11 @@
* routines for accessing the system catalogs
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -338,6 +343,16 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
*/
if (info->indpred == NIL)
{
+#ifdef XCP
+ /*
+ * If parent relation is distributed the local storage manager
+ * does not have actual information about index size.
+ * We have to get relation statistics instead.
+ */
+ if (IS_PGXC_COORDINATOR && relation->rd_locator_info != NULL)
+ info->pages = indexRelation->rd_rel->relpages;
+ else
+#endif
info->pages = RelationGetNumberOfBlocks(indexRelation);
info->tuples = rel->tuples;
}
@@ -396,7 +411,8 @@ estimate_rel_size(Relation rel, int32 *attr_widths,
{
case RELKIND_RELATION:
#ifdef PGXC
- /*
+#ifndef XCP
+ /*
* This is a remote table... we have no idea how many pages/rows
* we may get from a scan of this table. However, we should set the
* costs in such a manner that cheapest paths should pick up the
@@ -419,8 +435,21 @@ estimate_rel_size(Relation rel, int32 *attr_widths,
break;
}
#endif
+#endif
case RELKIND_INDEX:
case RELKIND_TOASTVALUE:
+#ifdef XCP
+ if (IS_PGXC_COORDINATOR && rel->rd_locator_info != NULL)
+ {
+ /*
+ * Remote table does not store rows locally, so storage manager
+ * does not know how many pages are there, we rely on relation
+ * statistics.
+ */
+ curpages = rel->rd_rel->relpages;
+ }
+ else
+#endif
/* it has storage, ok to call the smgr */
curpages = RelationGetNumberOfBlocks(rel);
diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c
index dce0b9330e..85feefdfce 100644
--- a/src/backend/parser/analyze.c
+++ b/src/backend/parser/analyze.c
@@ -14,6 +14,11 @@
* contain optimizable statements, which we should transform.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -25,6 +30,11 @@
#include "postgres.h"
#include "access/sysattr.h"
+#ifdef XCP
+#include "catalog/pg_namespace.h"
+#include "catalog/namespace.h"
+#include "utils/builtins.h"
+#endif
#ifdef PGXC
#include "catalog/pg_inherits.h"
#include "catalog/pg_inherits_fn.h"
@@ -54,7 +64,7 @@
#include "pgxc/pgxcnode.h"
#include "access/gtm.h"
#include "utils/lsyscache.h"
-#include "optimizer/pgxcplan.h"
+#include "pgxc/planner.h"
#include "tcop/tcopprot.h"
#include "nodes/nodes.h"
#include "pgxc/poolmgr.h"
@@ -90,14 +100,19 @@ static Query *transformCreateTableAsStmt(ParseState *pstate,
CreateTableAsStmt *stmt);
#ifdef PGXC
static Query *transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt);
+#ifndef XCP
static bool IsExecDirectUtilityStmt(Node *node);
static bool is_relation_child(RangeTblEntry *child_rte, List *rtable);
static bool is_rel_child_of_rel(RangeTblEntry *child_rte, RangeTblEntry *parent_rte);
#endif
+#endif
static void transformLockingClause(ParseState *pstate, Query *qry,
LockingClause *lc, bool pushedDown);
+#ifdef XCP
+static void ParseAnalyze_rtable_walk(List *rtable);
+#endif
/*
* parse_analyze
@@ -549,8 +564,10 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt)
ParseState *sub_pstate = make_parsestate(pstate);
Query *selectQuery;
#ifdef PGXC
+#ifndef XCP
RangeTblEntry *target_rte;
#endif
+#endif
/*
* Process the source SELECT.
@@ -584,6 +601,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt)
makeAlias("*SELECT*", NIL),
false);
#ifdef PGXC
+#ifndef XCP
/*
* For an INSERT SELECT involving INSERT on a child after scanning
* the parent, set flag to send command ID communication to remote
@@ -599,6 +617,7 @@ transformInsertStmt(ParseState *pstate, InsertStmt *stmt)
}
}
#endif
+#endif
rtr = makeNode(RangeTblRef);
/* assume new rte is at end */
rtr->rtindex = list_length(pstate->p_rtable);
@@ -2350,7 +2369,9 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt)
List *raw_parsetree_list;
ListCell *raw_parsetree_item;
char *nodename;
+#ifndef XCP
Oid nodeoid;
+#endif
int nodeIndex;
char nodetype;
@@ -2370,6 +2391,15 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt)
/* There is a single element here */
nodename = strVal(linitial(nodelist));
+#ifdef XCP
+ nodetype = PGXC_NODE_NONE;
+ nodeIndex = PGXCNodeGetNodeIdFromName(nodename, &nodetype);
+ if (nodetype == PGXC_NODE_NONE)
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("PGXC Node %s: object not defined",
+ nodename)));
+#else
nodeoid = get_pgxc_nodeoid(nodename);
if (!OidIsValid(nodeoid))
@@ -2381,6 +2411,7 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt)
/* Get node type and index */
nodetype = get_pgxc_nodetype(nodeoid);
nodeIndex = PGXCNodeGetNodeId(nodeoid, get_pgxc_nodetype(nodeoid));
+#endif
/* Check if node is requested is the self-node or not */
if (nodetype == PGXC_NODE_COORDINATOR && nodeIndex == PGXCNodeId - 1)
@@ -2405,13 +2436,16 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt)
result = parse_analyze(parsetree, query, NULL, 0);
}
+#ifndef XCP
/* Needed by planner */
result->sql_statement = pstrdup(query);
+#endif
/* Default list of parameters to set */
step->sql_statement = NULL;
step->exec_nodes = makeNode(ExecNodes);
step->combine_type = COMBINE_TYPE_NONE;
+ step->sort = NULL;
step->read_only = true;
step->force_autocommit = false;
step->cursor = NULL;
@@ -2423,7 +2457,17 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt)
else
step->exec_type = EXEC_ON_DATANODES;
+ step->reduce_level = 0;
step->base_tlist = NIL;
+ step->outer_alias = NULL;
+ step->inner_alias = NULL;
+ step->outer_reduce_level = 0;
+ step->inner_reduce_level = 0;
+ step->outer_relids = NULL;
+ step->inner_relids = NULL;
+ step->inner_statement = NULL;
+ step->outer_statement = NULL;
+ step->join_condition = NULL;
/* Change the list of nodes that will be executed for the query and others */
step->force_autocommit = false;
@@ -2463,14 +2507,15 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt)
}
}
+#ifndef XCP
/*
* Features not yet supported
* DML can be launched without errors but this could compromise data
* consistency, so block it.
*/
- if (!xc_maintenance_mode && (step->exec_direct_type == EXEC_DIRECT_DELETE
- || step->exec_direct_type == EXEC_DIRECT_UPDATE
- || step->exec_direct_type == EXEC_DIRECT_INSERT))
+ if (step->exec_direct_type == EXEC_DIRECT_DELETE
+ || step->exec_direct_type == EXEC_DIRECT_UPDATE
+ || step->exec_direct_type == EXEC_DIRECT_INSERT)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("EXECUTE DIRECT cannot execute DML queries")));
@@ -2488,18 +2533,22 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt)
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("EXECUTE DIRECT cannot execute locally this utility query")));
}
+#endif
/* Build Execute Node list, there is a unique node for the time being */
step->exec_nodes->nodeList = lappend_int(step->exec_nodes->nodeList, nodeIndex);
/* Associate newly-created RemoteQuery node to the returned Query result */
+#ifndef XCP
result->is_local = is_local;
+#endif
if (!is_local)
result->utilityStmt = (Node *) step;
return result;
}
+#ifndef XCP
/*
* Check if given node is authorized to go through EXECUTE DURECT
*/
@@ -2615,6 +2664,7 @@ is_rel_child_of_rel(RangeTblEntry *child_rte, RangeTblEntry *parent_rte)
}
#endif
+#endif
/*
* Check for features that are not supported together with FOR UPDATE/SHARE.
@@ -2848,3 +2898,76 @@ applyLockingClause(Query *qry, Index rtindex,
rc->pushedDown = pushedDown;
qry->rowMarks = lappend(qry->rowMarks, rc);
}
+
+#ifdef XCP
+/*
+ * Check if the query contains references to any pg_catalog tables that should
+ * be remapped to storm_catalog. The list is obtained from the
+ * storm_catalog_remap_string GUC. Also do this only for normal users
+ */
+void
+ParseAnalyze_callback(ParseState *pstate, Query *query)
+{
+ ParseAnalyze_rtable_walk(query->rtable);
+}
+
+static void
+ParseAnalyze_rtable_walk(List *rtable)
+{
+ ListCell *item;
+ StringInfoData buf;
+
+ if (!IsUnderPostmaster || superuser())
+ return;
+
+ initStringInfo(&buf);
+ foreach(item, rtable)
+ {
+ RangeTblEntry *rte = (RangeTblEntry *) lfirst(item);
+
+ resetStringInfo(&buf);
+ if (rte->rtekind == RTE_FUNCTION &&
+ get_func_namespace(((FuncExpr *) rte->funcexpr)->funcid) ==
+ PG_CATALOG_NAMESPACE)
+ {
+ Oid funcid = InvalidOid;
+
+ FuncExpr *funcexpr = (FuncExpr *) rte->funcexpr;
+ const char *funcname = get_func_name(funcexpr->funcid);
+
+ /* Check if the funcname is in storm_catalog_remap_string */
+ appendStringInfoString(&buf, funcname);
+ appendStringInfoChar(&buf, ',');
+
+ elog(DEBUG2, "the constructed name is %s", buf.data);
+
+ /*
+ * The unqualified function name should be satisfied from the
+ * storm_catalog appropriately. Just provide a warning for now if
+ * it is not..
+ */
+ if (strstr(storm_catalog_remap_string, buf.data))
+ {
+ Oid *argtypes = NULL;
+ int nargs;
+
+ get_func_signature(funcexpr->funcid, &argtypes, &nargs);
+ funcid = get_funcid(funcname, buildoidvector(argtypes, nargs),
+ STORM_CATALOG_NAMESPACE);
+ }
+ else
+ continue;
+
+ if (get_func_namespace(funcid) != STORM_CATALOG_NAMESPACE)
+ ereport(WARNING,
+ (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+ errmsg("Entry (%s) present in storm_catalog_remap_string "
+ "but object not picked from STORM_CATALOG", funcname)));
+ else /* change the funcid to the storm_catalog one */
+ funcexpr->funcid = funcid;
+ }
+ else if (rte->rtekind == RTE_SUBQUERY) /* recurse for subqueries */
+ ParseAnalyze_rtable_walk(rte->subquery->rtable);
+ }
+}
+#endif
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 718de74092..d8d64c4d21 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -234,7 +234,7 @@ static void processCASbits(int cas_bits, int location, const char *constrType,
DeallocateStmt PrepareStmt ExecuteStmt
DropOwnedStmt ReassignOwnedStmt
AlterTSConfigurationStmt AlterTSDictionaryStmt
- BarrierStmt AlterNodeStmt CreateNodeStmt DropNodeStmt
+ BarrierStmt PauseStmt AlterNodeStmt CreateNodeStmt DropNodeStmt
CreateNodeGroupStmt DropNodeGroupStmt
%type <node> select_no_parens select_with_parens select_clause
@@ -367,6 +367,7 @@ static void processCASbits(int cas_bits, int location, const char *constrType,
%type <defelt> opt_binary opt_oids copy_delimiter
%type <str> DirectStmt CleanConnDbName CleanConnUserName
+%type <boolean> OptCluster
/* PGXC_END */
%type <boolean> copy_from
@@ -558,7 +559,7 @@ static void processCASbits(int cas_bits, int location, const char *constrType,
OBJECT_P OF OFF OFFSET OIDS ON ONLY OPERATOR OPTION OPTIONS OR
ORDER OUT_P OUTER_P OVER OVERLAPS OVERLAY OWNED OWNER
- PARSER PARTIAL PARTITION PASSING PASSWORD PLACING PLANS POSITION
+ PARSER PARTIAL PARTITION PASSING PASSWORD PAUSE PLACING PLANS POSITION
/* PGXC_BEGIN */
PRECEDING PRECISION PREFERRED PRESERVE PREPARE PREPARED PRIMARY
/* PGXC_END */
@@ -582,7 +583,7 @@ static void processCASbits(int cas_bits, int location, const char *constrType,
TRUNCATE TRUSTED TYPE_P TYPES_P
UNBOUNDED UNCOMMITTED UNENCRYPTED UNION UNIQUE UNKNOWN UNLISTEN UNLOGGED
- UNTIL UPDATE USER USING
+ UNPAUSE UNTIL UPDATE USER USING
VACUUM VALID VALIDATE VALIDATOR VALUE_P VALUES VARCHAR VARIADIC VARYING
VERBOSE VERSION_P VIEW VOLATILE
@@ -793,6 +794,7 @@ stmt :
| LoadStmt
| LockStmt
| NotifyStmt
+ | PauseStmt
| PrepareStmt
| ReassignOwnedStmt
| ReindexStmt
@@ -8428,6 +8430,20 @@ opt_name_list:
/* PGXC_BEGIN */
+PauseStmt: PAUSE CLUSTER
+ {
+ PauseClusterStmt *n = makeNode(PauseClusterStmt);
+ n->pause = true;
+ $$ = (Node *)n;
+ }
+ | UNPAUSE CLUSTER
+ {
+ PauseClusterStmt *n = makeNode(PauseClusterStmt);
+ n->pause = false;
+ $$ = (Node *)n;
+ }
+ ;
+
BarrierStmt: CREATE BARRIER opt_barrier_id
{
BarrierStmt *n = makeNode(BarrierStmt);
@@ -8489,7 +8505,7 @@ pgxcnode_list:
/*****************************************************************************
*
* QUERY:
- * ALTER NODE nodename WITH
+ * ALTER [CLUSTER] NODE nodename WITH
* (
* [ TYPE = ('datanode' | 'coordinator'), ]
* [ HOST = 'hostname', ]
@@ -8498,13 +8514,17 @@ pgxcnode_list:
* [ PREFERRED [ = boolean ], ]
* )
*
+ * If CLUSTER is mentioned, the command is executed on all nodes.
+ * PS: We need to add this option on all other pertinent NODE ddl
+ * operations too!
*****************************************************************************/
-AlterNodeStmt: ALTER NODE pgxcnode_name OptWith
+AlterNodeStmt: ALTER OptCluster NODE pgxcnode_name OptWith
{
AlterNodeStmt *n = makeNode(AlterNodeStmt);
- n->node_name = $3;
- n->options = $4;
+ n->cluster = $2;
+ n->node_name = $4;
+ n->options = $5;
$$ = (Node *)n;
}
;
@@ -8555,6 +8575,10 @@ DropNodeGroupStmt: DROP NODE GROUP_P pgxcgroup_name
}
;
+OptCluster: CLUSTER { $$ = TRUE; }
+ | /* EMPTY */ { $$ = FALSE; }
+ ;
+
/* PGXC_END */
/*****************************************************************************
@@ -12793,6 +12817,9 @@ unreserved_keyword:
| PARTITION
| PASSING
| PASSWORD
+/* PGXC_BEGIN */
+ | PAUSE
+/* PGXC_END */
| PLANS
| PRECEDING
/* PGXC_BEGIN */
@@ -12874,6 +12901,9 @@ unreserved_keyword:
| UNKNOWN
| UNLISTEN
| UNLOGGED
+/* PGXC_BEGIN */
+ | UNPAUSE
+/* PGXC_END */
| UNTIL
| UPDATE
| VACUUM
diff --git a/src/backend/parser/parse_agg.c b/src/backend/parser/parse_agg.c
index 652d423787..670d98c3a2 100644
--- a/src/backend/parser/parse_agg.c
+++ b/src/backend/parser/parse_agg.c
@@ -3,6 +3,11 @@
* parse_agg.c
* handle aggregates and window functions in parser
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -79,8 +84,10 @@ transformAggregateCall(ParseState *pstate, Aggref *agg,
int min_varlevel;
ListCell *lc;
#ifdef PGXC
+#ifndef XCP
HeapTuple aggTuple;
Form_pg_aggregate aggform;
+#endif /* XCP */
#endif /* PGXC */
/*
@@ -192,6 +199,7 @@ transformAggregateCall(ParseState *pstate, Aggref *agg,
pstate = pstate->parentParseState;
pstate->p_hasAggs = true;
#ifdef PGXC
+#ifndef XCP
/*
* Return data type of PGXC Datanode's aggregate should always return the
* result of transition function, that is expected by collection function
@@ -213,6 +221,7 @@ transformAggregateCall(ParseState *pstate, Aggref *agg,
ReleaseSysCache(aggTuple);
#endif
+#endif
}
/*
@@ -751,11 +760,20 @@ void
build_aggregate_fnexprs(Oid *agg_input_types,
int agg_num_inputs,
Oid agg_state_type,
+#ifdef XCP
+ Oid agg_collect_type,
+#endif
Oid agg_result_type,
Oid agg_input_collation,
Oid transfn_oid,
+#ifdef XCP
+ Oid collectfn_oid,
+#endif
Oid finalfn_oid,
Expr **transfnexpr,
+#ifdef XCP
+ Expr **collectfnexpr,
+#endif
Expr **finalfnexpr)
{
Param *argp;
@@ -797,6 +815,40 @@ build_aggregate_fnexprs(Oid *agg_input_types,
agg_input_collation,
COERCE_DONTCARE);
+#ifdef XCP
+ /* see if we have a collect function */
+ if (OidIsValid(collectfn_oid))
+ {
+ Param *argp2;
+ /*
+ * Build expr tree for collect function
+ */
+ argp = makeNode(Param);
+ argp->paramkind = PARAM_EXEC;
+ argp->paramid = -1;
+ argp->paramtype = agg_collect_type;
+ argp->paramtypmod = -1;
+ argp->location = -1;
+
+ argp2 = makeNode(Param);
+ argp2->paramkind = PARAM_EXEC;
+ argp2->paramid = -1;
+ argp2->paramtype = agg_state_type;
+ argp2->paramtypmod = -1;
+ argp2->location = -1;
+ args = list_make2(argp, argp2);
+
+ *collectfnexpr = (Expr *) makeFuncExpr(collectfn_oid,
+ agg_collect_type,
+ args,
+ InvalidOid,
+ agg_input_collation,
+ COERCE_DONTCARE);
+ }
+ else
+ *collectfnexpr = NULL;
+#endif
+
/* see if we have a final function */
if (!OidIsValid(finalfn_oid))
{
@@ -810,6 +862,15 @@ build_aggregate_fnexprs(Oid *agg_input_types,
argp = makeNode(Param);
argp->paramkind = PARAM_EXEC;
argp->paramid = -1;
+ /*
+ * When running Phase 2 of distributed aggregation we may have only
+ * transient and final functions defined.
+ */
+#ifdef XCP
+ if (OidIsValid(agg_collect_type))
+ argp->paramtype = agg_collect_type;
+ else
+#endif
argp->paramtype = agg_state_type;
argp->paramtypmod = -1;
argp->paramcollid = agg_input_collation;
diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c
index a55f0d5548..4b4cc2cae6 100644
--- a/src/backend/parser/parse_relation.c
+++ b/src/backend/parser/parse_relation.c
@@ -3,6 +3,11 @@
* parse_relation.c
* parser support routines dealing with relations
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -30,6 +35,13 @@
#include "utils/lsyscache.h"
#include "utils/rel.h"
#include "utils/syscache.h"
+#ifdef XCP
+#include "utils/guc.h"
+#include "catalog/pg_statistic.h"
+#include "catalog/pg_namespace.h"
+#include "pgxc/pgxc.h"
+#include "miscadmin.h"
+#endif
static RangeTblEntry *scanNameSpaceForRefname(ParseState *pstate,
@@ -591,6 +603,25 @@ markRTEForSelectPriv(ParseState *pstate, RangeTblEntry *rte,
if (rte->rtekind == RTE_RELATION)
{
+#ifdef XCP
+ /*
+ * Ugly workaround against permission check error when non-privileged
+ * user executes ANALYZE command.
+ * To update local statistics coordinator queries pg_statistic tables on
+ * datanodes, but these are not selectable by PUBLIC. It would be better
+ * to define view, but pg_statistic contains fields of anyarray pseudotype
+ * which is not allowed in view.
+ * So we just disable check for SELECT permission if query referring the
+ * pg_statistic table is parsed on datanodes. That might be a security hole,
+ * but fortunately any user query against pg_statistic would be parsed on
+ * coordinator, and permission check would take place; the only way to
+ * have arbitrary query parsed on datanode is EXECUTE DIRECT, it is only
+ * available for superuser.
+ */
+ if (IS_PGXC_DATANODE && rte->relid == StatisticRelationId)
+ rte->requiredPerms = 0;
+ else
+#endif
/* Make sure the rel as a whole is marked for SELECT access */
rte->requiredPerms |= ACL_SELECT;
/* Must offset the attnum to fit in a bitmapset */
@@ -902,11 +933,56 @@ addRangeTableEntry(ParseState *pstate,
lockmode = isLockedRefname(pstate, refname) ? RowShareLock : AccessShareLock;
rel = parserOpenTable(pstate, relation, lockmode);
rte->relid = RelationGetRelid(rel);
+
+#ifdef XCP
+ if (IsUnderPostmaster && !superuser() &&
+ get_rel_namespace(rte->relid) == PG_CATALOG_NAMESPACE)
+ {
+ Oid relid = InvalidOid;
+ const char *relname = get_rel_name(rte->relid);
+ StringInfoData buf;
+
+ /* Check if the relname is in storm_catalog_remap_string */
+ initStringInfo(&buf);
+ appendStringInfoString(&buf, relname);
+ appendStringInfoChar(&buf, ',');
+
+ elog(DEBUG2, "the constructed name is %s", buf.data);
+
+ /*
+ * The unqualified relation name should be satisfied from the
+ * storm_catalog appropriately. Just provide a warning for now if
+ * it is not..
+ */
+ if (strstr(storm_catalog_remap_string, buf.data))
+ {
+ relid = RelnameGetRelid((const char *)relname);
+
+ if (get_rel_namespace(relid) != STORM_CATALOG_NAMESPACE)
+ ereport(WARNING,
+ (errcode(ERRCODE_INVALID_TABLE_DEFINITION),
+ errmsg("Entry (%s) present in storm_catalog_remap_string "
+ "but object not picked from STORM_CATALOG",relname)));
+ else
+ {
+
+ /* close the existing relation and open the new one */
+ heap_close(rel, NoLock);
+
+ rel = relation_open(relid, NoLock);
+ rte->relid = RelationGetRelid(rel);
+ }
+ }
+ }
+#endif
+
rte->relkind = rel->rd_rel->relkind;
#ifdef PGXC
+#ifndef XCP
rte->relname = RelationGetRelationName(rel);
#endif
+#endif
/*
* Build the list of effective column names using user-supplied aliases
@@ -935,6 +1011,25 @@ addRangeTableEntry(ParseState *pstate,
rte->inh = inh;
rte->inFromCl = inFromCl;
+#ifdef XCP
+ /*
+ * Ugly workaround against permission check error when non-privileged
+ * user executes ANALYZE command.
+ * To update local statistics coordinator queries pg_statistic tables on
+ * datanodes, but these are not selectable by PUBLIC. It would be better
+ * to define view, but pg_statistic contains fields of anyarray pseudotype
+ * which is not allowed in view.
+ * So we just disable check for SELECT permission if query referring the
+ * pg_statistic table is parsed on datanodes. That might be a security hole,
+ * but fortunately any user query against pg_statistic would be parsed on
+ * coordinator, and permission check would take place; the only way to
+ * have arbitrary query parsed on datanode is EXECUTE DIRECT, it is only
+ * available for superuser.
+ */
+ if (IS_PGXC_DATANODE && rte->relid == StatisticRelationId)
+ rte->requiredPerms = 0;
+ else
+#endif
rte->requiredPerms = ACL_SELECT;
rte->checkAsUser = InvalidOid; /* not set-uid by default, either */
rte->selectedCols = NULL;
@@ -972,8 +1067,10 @@ addRangeTableEntryForRelation(ParseState *pstate,
rte->relkind = rel->rd_rel->relkind;
#ifdef PGXC
+#ifndef XCP
rte->relname = RelationGetRelationName(rel);
#endif
+#endif
/*
* Build the list of effective column names using user-supplied aliases
@@ -1421,6 +1518,15 @@ addRangeTableEntryForCTE(ParseState *pstate,
errmsg("WITH query \"%s\" does not have a RETURNING clause",
cte->ctename),
parser_errposition(pstate, rv->location)));
+
+#ifdef PGXC
+#ifndef XCP
+ if (ctequery->returningList != NIL)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ (errmsg("RETURNING clause not yet supported"))));
+#endif
+#endif
}
rte->ctecoltypes = cte->ctecoltypes;
diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c
index b8ebf9b52d..ec00730eec 100644
--- a/src/backend/parser/parse_utilcmd.c
+++ b/src/backend/parser/parse_utilcmd.c
@@ -16,6 +16,11 @@
* a quick copyObject() call before manipulating the query tree.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -37,6 +42,9 @@
#include "catalog/pg_opclass.h"
#include "catalog/pg_operator.h"
#include "catalog/pg_type.h"
+#ifdef XCP
+#include "catalog/pgxc_node.h"
+#endif
#include "commands/comment.h"
#include "commands/defrem.h"
#include "commands/tablecmds.h"
@@ -53,10 +61,9 @@
#include "parser/parse_type.h"
#include "parser/parse_utilcmd.h"
#ifdef PGXC
-#include "optimizer/pgxcship.h"
#include "pgxc/locator.h"
#include "pgxc/pgxc.h"
-#include "optimizer/pgxcplan.h"
+#include "pgxc/planner.h"
#include "pgxc/execRemote.h"
#endif
#include "parser/parser.h"
@@ -68,6 +75,19 @@
#include "utils/syscache.h"
#include "utils/typcache.h"
+#ifdef XCP
+/*
+ * Sources to make decision about distribution column, in order of precedence
+ */
+typedef enum
+{
+ FBS_NONE, /* no fallback columns */
+ FBS_COLDEF, /* column definition, if no constraints defined */
+ FBS_UIDX, /* unique key definition, if no PK defined */
+ FBS_PKEY, /* primary key definition */
+ FBS_REPLICATE /* constraint definitions require to replicate table */
+} FallbackSrc;
+#endif
/* State shared by transformCreateStmt and its subroutines */
typedef struct
@@ -90,7 +110,12 @@ typedef struct
* the table */
IndexStmt *pkey; /* PRIMARY KEY index, if any */
#ifdef PGXC
- char *fallback_dist_col; /* suggested column to distribute on */
+#ifdef XCP
+ FallbackSrc fallback_source;
+ List *fallback_dist_cols;
+#else
+ char *fallback_dist_col; /* suggested column to distribute on */
+#endif
DistributeBy *distributeby; /* original distribute by column of CREATE TABLE */
PGXCSubCluster *subcluster; /* original subcluster option of CREATE TABLE */
#endif
@@ -110,6 +135,9 @@ typedef struct
List *grants; /* GRANT items */
} CreateSchemaStmtContext;
+#ifdef XCP
+bool loose_constraints = false;
+#endif
static void transformColumnDefinition(CreateStmtContext *cxt,
ColumnDef *column);
@@ -134,6 +162,13 @@ static void transformConstraintAttrs(CreateStmtContext *cxt,
List *constraintList);
static void transformColumnType(CreateStmtContext *cxt, ColumnDef *column);
static void setSchemaName(char *context_schema, char **stmt_schema_name);
+#ifdef PGXC
+static void checkLocalFKConstraints(CreateStmtContext *cxt);
+#endif
+#ifdef XCP
+static List *transformSubclusterNodes(PGXCSubCluster *subcluster);
+static PGXCSubCluster *makeSubCluster(List *nodelist);
+#endif
/*
* transformCreateStmt -
@@ -149,8 +184,14 @@ static void setSchemaName(char *context_schema, char **stmt_schema_name);
* then expand those into multiple IndexStmt blocks.
* - thomas 1997-12-02
*/
+#ifdef XCP
+List *
+transformCreateStmt(CreateStmt *stmt, const char *queryString,
+ bool autodistribute)
+#else
List *
transformCreateStmt(CreateStmt *stmt, const char *queryString)
+#endif
{
ParseState *pstate;
CreateStmtContext cxt;
@@ -223,8 +264,14 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString)
cxt.pkey = NULL;
cxt.hasoids = interpretOidsOption(stmt->options);
#ifdef PGXC
+#ifdef XCP
+ cxt.fallback_source = FBS_NONE;
+ cxt.fallback_dist_cols = NIL;
+#else
cxt.fallback_dist_col = NULL;
+#endif
cxt.distributeby = stmt->distributeby;
+ cxt.subcluster = stmt->subcluster;
#endif
Assert(!stmt->ofTypename || !stmt->inhRelations); /* grammar enforces */
@@ -295,6 +342,90 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString)
* If the user did not specify any distribution clause and there is no
* inherits clause, try and use PK or unique index
*/
+#ifdef XCP
+ if (IS_PGXC_COORDINATOR && autodistribute && !stmt->distributeby)
+ {
+ /* always apply suggested subcluster */
+ stmt->subcluster = copyObject(cxt.subcluster);
+ if (cxt.distributeby)
+ {
+ stmt->distributeby = copyObject(cxt.distributeby);
+ return result;
+ }
+ /*
+ * If constraints require replicated table set it replicated
+ */
+ stmt->distributeby = makeNode(DistributeBy);
+ if (cxt.fallback_source == FBS_REPLICATE)
+ {
+ stmt->distributeby->disttype = DISTTYPE_REPLICATION;
+ stmt->distributeby->colname = NULL;
+ }
+ /*
+ * If there are parent tables inherit distribution of the first parent
+ */
+ else if (cxt.fallback_source < FBS_UIDX && stmt->inhRelations)
+ {
+ RangeVar *inh = (RangeVar *) linitial(stmt->inhRelations);
+ Relation rel;
+
+ Assert(IsA(inh, RangeVar));
+ rel = heap_openrv(inh, AccessShareLock);
+ if (rel->rd_rel->relkind != RELKIND_RELATION)
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("inherited relation \"%s\" is not a table",
+ inh->relname)));
+
+ if (rel->rd_locator_info)
+ {
+ switch (rel->rd_locator_info->locatorType)
+ {
+ case LOCATOR_TYPE_HASH:
+ stmt->distributeby->disttype = DISTTYPE_HASH;
+ stmt->distributeby->colname =
+ pstrdup(rel->rd_locator_info->partAttrName);
+ break;
+ case LOCATOR_TYPE_MODULO:
+ stmt->distributeby->disttype = DISTTYPE_MODULO;
+ stmt->distributeby->colname =
+ pstrdup(rel->rd_locator_info->partAttrName);
+ break;
+ case LOCATOR_TYPE_REPLICATED:
+ stmt->distributeby->disttype = DISTTYPE_REPLICATION;
+ break;
+ case LOCATOR_TYPE_RROBIN:
+ default:
+ stmt->distributeby->disttype = DISTTYPE_ROUNDROBIN;
+ break;
+ }
+ /*
+ * Use defined node, if nothing defined get from the parent
+ */
+ if (stmt->subcluster == NULL)
+ stmt->subcluster = makeSubCluster(rel->rd_locator_info->nodeList);
+ }
+ heap_close(rel, NoLock);
+ }
+ /*
+ * If there are columns suitable for hash distribution distribute on
+ * first of them.
+ */
+ else if (cxt.fallback_dist_cols)
+ {
+ stmt->distributeby->disttype = DISTTYPE_HASH;
+ stmt->distributeby->colname = (char *) linitial(cxt.fallback_dist_cols);
+ }
+ /*
+ * If none of above applies distribute by round robin
+ */
+ else
+ {
+ stmt->distributeby->disttype = DISTTYPE_ROUNDROBIN;
+ stmt->distributeby->colname = NULL;
+ }
+ }
+#else
if (!stmt->distributeby && !stmt->inhRelations && cxt.fallback_dist_col)
{
stmt->distributeby = (DistributeBy *) palloc0(sizeof(DistributeBy));
@@ -302,6 +433,7 @@ transformCreateStmt(CreateStmt *stmt, const char *queryString)
stmt->distributeby->colname = cxt.fallback_dist_col;
}
#endif
+#endif
return result;
}
@@ -689,6 +821,7 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla
cancel_parser_errposition_callback(&pcbstate);
#ifdef PGXC
+#ifndef XCP
/*
* Check if relation is temporary and assign correct flag.
* This will override transaction direct commit as no 2PC
@@ -696,6 +829,7 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla
*/
if (IsTempTable(RelationGetRelid(relation)))
ExecSetTempObjectIncluded();
+#endif
/*
* Block the creation of tables using views in their LIKE clause.
@@ -710,7 +844,11 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla
if (relation->rd_rel->relkind == RELKIND_VIEW)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+#ifdef XCP
+ errmsg("Postgres-XL does not support VIEW in LIKE clauses"),
+#else
errmsg("Postgres-XC does not support VIEW in LIKE clauses"),
+#endif
errdetail("The feature is not currently supported")));
#endif
@@ -779,6 +917,21 @@ transformTableLikeClause(CreateStmtContext *cxt, TableLikeClause *table_like_cla
*/
cxt->columns = lappend(cxt->columns, def);
+#ifdef XCP
+ /*
+ * If the distribution is not defined yet by a priority source add it
+ * to the list of possible fallbacks
+ */
+ if (IS_PGXC_COORDINATOR && cxt->distributeby == NULL && !cxt->isalter &&
+ cxt->fallback_source <= FBS_COLDEF &&
+ IsTypeHashDistributable(attribute->atttypid))
+ {
+ cxt->fallback_dist_cols = lappend(cxt->fallback_dist_cols,
+ pstrdup(attributeName));
+ cxt->fallback_source = FBS_COLDEF;
+ }
+#endif
+
/*
* Copy default, if present and the default has been requested
*/
@@ -1473,6 +1626,12 @@ static IndexStmt *
transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
{
IndexStmt *index;
+#ifdef PGXC
+ bool isLocalSafe = false;
+#endif
+#ifdef XCP
+ List *fallback_cols = NIL;
+#endif
ListCell *lc;
index = makeNode(IndexStmt);
@@ -1735,6 +1894,24 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
if (strcmp(column->colname, key) == 0)
{
found = true;
+
+#ifdef PGXC
+#ifndef XCP
+ /*
+ * Only allow locally enforceable constraints.
+ * See if it is a distribution column
+ * If not set, set it to first column in index.
+ * If primary key, we prefer that over a unique constraint.
+ */
+ if (IS_PGXC_COORDINATOR && !isLocalSafe)
+ {
+ if (cxt->distributeby)
+ isLocalSafe = CheckLocalIndexColumn (
+ ConvertToLocatorType(cxt->distributeby->disttype),
+ cxt->distributeby->colname, key);
+ }
+#endif
+#endif
break;
}
}
@@ -1781,6 +1958,25 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
if (strcmp(key, inhname) == 0)
{
found = true;
+#ifdef XCP
+ /*
+ * We should add the column to the fallback list now,
+ * so it could be found there, because inherited
+ * columns are not normally added.
+ * Do not modify the list if it is set from a priority
+ * source.
+ */
+ if (IS_PGXC_COORDINATOR &&
+ cxt->distributeby == NULL && !cxt->isalter &&
+ cxt->fallback_source <= FBS_COLDEF &&
+ IsTypeHashDistributable(inhattr->atttypid))
+ {
+ cxt->fallback_dist_cols =
+ lappend(cxt->fallback_dist_cols,
+ pstrdup(inhname));
+ cxt->fallback_source = FBS_COLDEF;
+ }
+#endif
/*
* We currently have no easy way to force an inherited
@@ -1833,14 +2029,64 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
#ifdef PGXC
if (IS_PGXC_COORDINATOR)
{
+#ifdef XCP
+ /*
+ * Check if index can be enforced locally
+ */
+ if (!isLocalSafe)
+ {
+ ListCell *lc;
+ /*
+ * If distribution is defined check current column against
+ * the distribution.
+ */
+ if (cxt->distributeby)
+ isLocalSafe = CheckLocalIndexColumn (
+ ConvertToLocatorType(cxt->distributeby->disttype),
+ cxt->distributeby->colname, key);
+ /*
+ * Similar, if altering existing table check against target
+ * table distribution
+ */
+ if (cxt->isalter)
+ isLocalSafe = cxt->rel->rd_locator_info == NULL ||
+ CheckLocalIndexColumn (
+ cxt->rel->rd_locator_info->locatorType,
+ cxt->rel->rd_locator_info->partAttrName,
+ key);
+
+ /*
+ * Check if it is possible to distribute table by this column
+ * If yes, save it, and replace the fallback list when done
+ */
+ foreach (lc, cxt->fallback_dist_cols)
+ {
+ char *col = (char *) lfirst(lc);
+
+ if (strcmp(key, col) == 0)
+ {
+ fallback_cols = lappend(fallback_cols, pstrdup(key));
+ break;
+ }
+ }
+ }
+#else
/*
* Set fallback distribution column.
* If not set, set it to first column in index.
* If primary key, we prefer that over a unique constraint.
*/
- if (index->indexParams == NIL &&
- (index->primary || !cxt->fallback_dist_col))
+ if (index->indexParams == NIL
+ && (index->primary || !cxt->fallback_dist_col))
+ {
cxt->fallback_dist_col = pstrdup(key);
+ }
+
+ /* Existing table, check if it is safe */
+ if (cxt->isalter && !cxt->distributeby && !isLocalSafe)
+ isLocalSafe = CheckLocalIndexColumn (
+ cxt->rel->rd_locator_info->locatorType, cxt->rel->rd_locator_info->partAttrName, key);
+#endif
}
#endif
@@ -1855,6 +2101,61 @@ transformIndexConstraint(Constraint *constraint, CreateStmtContext *cxt)
iparam->nulls_ordering = SORTBY_NULLS_DEFAULT;
index->indexParams = lappend(index->indexParams, iparam);
}
+#ifdef PGXC
+#ifdef XCP
+ if (IS_PGXC_COORDINATOR && !isLocalSafe)
+ {
+ if (cxt->distributeby || cxt->isalter)
+ {
+ /*
+ * Index is not safe for defined distribution; since for replicated
+ * distribution any index is safe and for round robin none, but
+ * this case bombs out immediately, so that is incompatible
+ * HASH or MODULO. Report the problem.
+ */
+ if (loose_constraints && cxt->isalter && index->unique)
+ ereport(WARNING,
+ (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+ errmsg("Unique index of partitioned table must contain the"
+ " hash distribution column.")));
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+ errmsg("Unique index of partitioned table must contain the"
+ " hash distribution column.")));
+ }
+ else
+ {
+ if (fallback_cols)
+ {
+ list_free_deep(cxt->fallback_dist_cols);
+ cxt->fallback_dist_cols = fallback_cols;
+ if (index->primary)
+ cxt->fallback_source = FBS_PKEY;
+ else if (cxt->fallback_source < FBS_PKEY)
+ cxt->fallback_source = FBS_UIDX;
+ }
+ else
+ {
+ if (cxt->fallback_dist_cols)
+ {
+ list_free_deep(cxt->fallback_dist_cols);
+ cxt->fallback_dist_cols = NIL;
+ }
+ cxt->fallback_source = FBS_REPLICATE;
+ }
+ }
+ }
+#else
+ if (IS_PGXC_COORDINATOR && cxt->distributeby
+ && (cxt->distributeby->disttype == DISTTYPE_HASH ||
+ cxt->distributeby->disttype == DISTTYPE_MODULO)
+ && !isLocalSafe)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+ errmsg("Unique index of partitioned table must contain the hash distribution column.")));
+#endif
+#endif
return index;
}
@@ -1872,6 +2173,25 @@ transformFKConstraints(CreateStmtContext *cxt,
if (cxt->fkconstraints == NIL)
return;
+#ifdef XCP
+ /*
+ * If the "loose_constraints" GUC is set, we wholesale avoid creating
+ * Foreign Keys. Another way is to identify only those unenforceable
+ * FK constraints and skip over those. However the query string sent to
+ * the datanodes still contains those FKs and messes up things later.
+ * This can be handled by re-generating the query string that should be
+ * passed onto the datanodes, but that's quite a lot of work.
+ *
+ * Also supporting some FKs and not some others is also debatable..
+ * So we go in for an all-or-nothing approach here
+ */
+ if (loose_constraints)
+ {
+ list_free_deep(cxt->fkconstraints);
+ cxt->fkconstraints = NIL;
+ return;
+ }
+#endif
/*
* If CREATE TABLE or adding a column with NULL default, we can safely
* skip validation of FK constraints, and nonetheless mark them valid.
@@ -1886,21 +2206,19 @@ transformFKConstraints(CreateStmtContext *cxt,
constraint->skip_validation = true;
constraint->initially_valid = true;
#ifdef PGXC
+#ifndef XCP
/*
* Set fallback distribution column.
* If not yet set, set it to first column in FK constraint
* if it references a partitioned table
*/
- if (IS_PGXC_COORDINATOR &&
- !cxt->fallback_dist_col &&
- list_length(constraint->pk_attrs) != 0)
+ if (IS_PGXC_COORDINATOR && !cxt->fallback_dist_col)
{
Oid pk_rel_id = RangeVarGetRelid(constraint->pktable, NoLock, false);
- AttrNumber attnum = get_attnum(pk_rel_id,
- strVal(list_nth(constraint->fk_attrs, 0)));
- /* Make sure key is done on a partitioned column */
- if (IsDistribColumn(pk_rel_id, attnum))
+ /* make sure it is a partitioned column */
+ if (list_length(constraint->pk_attrs) != 0
+ && IsHashColumnForRelId(pk_rel_id, strVal(list_nth(constraint->pk_attrs,0))))
{
/* take first column */
char *colstr = strdup(strVal(list_nth(constraint->fk_attrs,0)));
@@ -1908,9 +2226,16 @@ transformFKConstraints(CreateStmtContext *cxt,
}
}
#endif
+#endif
}
}
+#ifdef PGXC
+ /* Only allow constraints that are locally enforceable - no distributed ones */
+ if (IS_PGXC_COORDINATOR)
+ checkLocalFKConstraints(cxt);
+#endif
+
/*
* For CREATE TABLE or ALTER TABLE ADD COLUMN, gin up an ALTER TABLE ADD
* CONSTRAINT command to execute after the basic command is complete. (If
@@ -2434,7 +2759,12 @@ transformAlterTableStmt(AlterTableStmt *stmt, const char *queryString)
cxt.alist = NIL;
cxt.pkey = NULL;
#ifdef PGXC
+#ifdef XCP
+ cxt.fallback_source = FBS_NONE;
+ cxt.fallback_dist_cols = NIL;
+#else
cxt.fallback_dist_col = NULL;
+#endif
cxt.distributeby = NULL;
cxt.subcluster = NULL;
#endif
@@ -2721,6 +3051,20 @@ transformColumnType(CreateStmtContext *cxt, ColumnDef *column)
parser_errposition(cxt->pstate,
column->collClause->location)));
}
+#ifdef XCP
+ /*
+ * If the distribution is not defined yet by a priority source add it to the
+ * list of possible fallbacks
+ */
+ if (IS_PGXC_COORDINATOR && cxt->distributeby == NULL && !cxt->isalter &&
+ cxt->fallback_source <= FBS_COLDEF &&
+ IsTypeHashDistributable(HeapTupleGetOid(ctype)))
+ {
+ cxt->fallback_dist_cols = lappend(cxt->fallback_dist_cols,
+ pstrdup(column->colname));
+ cxt->fallback_source = FBS_COLDEF;
+ }
+#endif
ReleaseSysCache(ctype);
}
@@ -2866,3 +3210,600 @@ setSchemaName(char *context_schema, char **stmt_schema_name)
"different from the one being created (%s)",
*stmt_schema_name, context_schema)));
}
+
+#ifdef PGXC
+/*
+ * CheckLocalIndexColumn
+ *
+ * Checks whether or not the index can be safely enforced locally
+ *
+ * loctype      - locator (distribution) type of the indexed table
+ * partcolname  - name of the table's distribution column (may be NULL)
+ * indexcolname - name of the index column being checked
+ *
+ * Returns true when a unique index on indexcolname can be enforced
+ * independently on each node: always for replicated tables, and for
+ * hash/modulo distribution only when the index column IS the distribution
+ * column.  Raises an ERROR for round robin tables, where no unique index
+ * can ever be enforced locally.  Returns false for any other combination.
+ */
+bool
+CheckLocalIndexColumn (char loctype, char *partcolname, char *indexcolname)
+{
+#ifdef XCP
+	if (IsLocatorReplicated(loctype))
+#else
+	if (loctype == LOCATOR_TYPE_REPLICATED)
+#endif
+		/* always safe */
+		return true;
+	if (loctype == LOCATOR_TYPE_RROBIN)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+				 errmsg("Cannot locally enforce a unique index on round robin distributed table.")));
+	else if (loctype == LOCATOR_TYPE_HASH || loctype == LOCATOR_TYPE_MODULO)
+	{
+		/* safe only when the index covers exactly the distribution column */
+		if (partcolname && indexcolname && strcmp(partcolname, indexcolname) == 0)
+			return true;
+	}
+	return false;
+}
+
+
+#ifdef XCP
+/*
+ * Given relation, find the index of the attribute in the primary key,
+ * which is the distribution key. Returns -1 if table is not a Hash/Modulo
+ * distributed, does not have a primary key or distribution key is not in the
+ * primary key (last should not happen).
+ *
+ * rel - an open Relation (caller holds at least AccessShareLock)
+ *
+ * The return value is the 0-based position of the distribution column
+ * within the primary key's column list, suitable for list_nth() against a
+ * matching FK attribute list.
+ */
+static int
+find_relation_pk_dist_index(Relation rel)
+{
+	int 		result = -1;
+	List	   *indexoidlist;
+	ListCell   *indexoidscan;
+	int			partAttNum = InvalidAttrNumber;
+	bool		pk_found = false;
+
+	/* Distribution column attribute number, if the table is distributed */
+	if (rel->rd_locator_info)
+		partAttNum = rel->rd_locator_info->partAttrNum;
+
+	/* Not distributed by attribute value (e.g. replicated): nothing to find */
+	if (partAttNum == InvalidAttrNumber)
+		return -1;
+
+	/*
+	 * Look up the primary key
+	 */
+	indexoidlist = RelationGetIndexList(rel);
+
+	foreach(indexoidscan, indexoidlist)
+	{
+		Oid			indexoid = lfirst_oid(indexoidscan);
+		HeapTuple	indexTuple;
+		Form_pg_index indexForm;
+
+		indexTuple = SearchSysCache1(INDEXRELID,
+									 ObjectIdGetDatum(indexoid));
+		if (!HeapTupleIsValid(indexTuple))		/* should not happen */
+			elog(ERROR, "cache lookup failed for index %u", indexoid);
+		indexForm = ((Form_pg_index) GETSTRUCT(indexTuple));
+		if (indexForm->indisprimary)
+		{
+			int 		i;
+
+			pk_found = true;
+
+			/*
+			 * Loop over index attributes to find
+			 * the distribution key
+			 */
+			for (i = 0; i < indexForm->indnatts; i++)
+			{
+				if (indexForm->indkey.values[i] == partAttNum)
+				{
+					result = i;
+					break;
+				}
+			}
+		}
+		ReleaseSysCache(indexTuple);
+		/* A table has at most one primary key; stop once it was examined */
+		if (pk_found)
+			break;
+	}
+
+	list_free(indexoidlist);
+
+	return result;
+}
+#endif
+
+
+/*
+ * checkLocalFKConstraints
+ *
+ * Check that every foreign key constraint in cxt->fkconstraints can be
+ * enforced locally on each Datanode; raise an ERROR otherwise.
+ *
+ * Under XCP this may also, as a side effect, choose the distribution
+ * (cxt->distributeby) and/or the node subset (cxt->subcluster) for the
+ * table being created/altered so that the constraint becomes locally
+ * enforceable.
+ *
+ * Fixes over the original version:
+ * - GetRelationLocInfo() may return NULL (referenced table has no
+ *   distribution info); that case is now handled before the node-list
+ *   checks, which previously dereferenced rel_loc_info->nodeList.
+ * - The self-reference schema comparison no longer passes a NULL
+ *   cxt->relation->schemaname to strcmp(); an implicit schema on either
+ *   side is resolved from the current search path.
+ */
+static void
+checkLocalFKConstraints(CreateStmtContext *cxt)
+{
+	ListCell *fkclist;
+#ifdef XCP
+	List *nodelist = NIL;
+
+	if (cxt->subcluster)
+		nodelist = transformSubclusterNodes(cxt->subcluster);
+#endif
+	foreach(fkclist, cxt->fkconstraints)
+	{
+		Constraint *constraint;
+		Oid pk_rel_id;
+#ifdef XCP
+		RelationLocInfo *rel_loc_info;
+#else
+		char refloctype;
+		char *checkcolname = NULL;
+#endif
+		constraint = (Constraint *) lfirst(fkclist);
+
+		/*
+		 * If constraint references to the table itself, it is safe
+		 * Check if relation name is the same
+		 * XCTODO: NO! It is only safe if table is replicated
+		 * or distributed on primary key
+		 */
+		if (constraint->pktable &&
+			strcmp(constraint->pktable->relname,cxt->relation->relname) == 0)
+		{
+			/* Is namespace also the same ? */
+			char *fkcon_schemaname = constraint->pktable->schemaname;
+			char *rel_schemaname = cxt->relation->schemaname;
+
+			/* Neither side qualified: both resolve to the current schema */
+			if (!rel_schemaname && !fkcon_schemaname)
+				continue;
+
+			if (!fkcon_schemaname || !rel_schemaname)
+			{
+				/*
+				 * One schema name is implicit; resolve it against the
+				 * current search path so we compare non-NULL strings.
+				 */
+				List *search_path = fetch_search_path(false);
+				char *current_schema =
+					get_namespace_name(linitial_oid(search_path));
+
+				list_free(search_path);
+				if (!fkcon_schemaname)
+					fkcon_schemaname = current_schema;
+				if (!rel_schemaname)
+					rel_schemaname = current_schema;
+			}
+
+			/*
+			 * If schema name and relation name are the same, table
+			 * references to itself, so constraint is safe
+			 */
+			if (fkcon_schemaname && rel_schemaname &&
+				strcmp(fkcon_schemaname, rel_schemaname) == 0)
+#ifdef XCP
+			{
+				/* check if bad distribution is already defined */
+				if ((cxt->distributeby && cxt->distributeby->disttype != DISTTYPE_REPLICATION) ||
+					(cxt->isalter && cxt->rel->rd_locator_info != NULL && !IsLocatorReplicated(cxt->rel->rd_locator_info->locatorType)))
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("only replicated table can reference itself")));
+				/* Record that replication is required */
+				cxt->fallback_source = FBS_REPLICATE;
+				if (cxt->fallback_dist_cols)
+				{
+					list_free_deep(cxt->fallback_dist_cols);
+					cxt->fallback_dist_cols = NIL;
+				}
+				continue;
+			}
+#else
+				continue;
+#endif
+		}
+
+		pk_rel_id = RangeVarGetRelid(constraint->pktable, NoLock, false);
+#ifdef XCP
+		rel_loc_info = GetRelationLocInfo(pk_rel_id);
+
+		/*
+		 * No locator info: the referenced table is not distributed (e.g. a
+		 * catalog/local table), so the constraint can be enforced locally
+		 * and imposes no node restrictions.  Handling this first also keeps
+		 * the node-list checks below from dereferencing a NULL pointer.
+		 */
+		if (rel_loc_info == NULL)
+			continue;
+
+		/* If referenced table is replicated, the constraint is safe */
+		if (IsLocatorReplicated(rel_loc_info->locatorType))
+		{
+			List *common;
+
+			if (cxt->subcluster)
+			{
+				/*
+				 * Distribution nodes are defined, they must be a subset of
+				 * the referenced relation's nodes
+				 */
+				common = list_intersection_int(nodelist, rel_loc_info->nodeList);
+				if (list_length(common) < list_length(nodelist))
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("referenced table is not defined on all target nodes")));
+				list_free(common);
+			}
+			else
+			{
+				/* suggest distribution */
+				if (nodelist)
+				{
+					common = list_intersection_int(nodelist, rel_loc_info->nodeList);
+					if (list_length(common) == 0)
+						ereport(ERROR,
+								(errcode(ERRCODE_SYNTAX_ERROR),
+								 errmsg("referenced tables is defined on different nodes")));
+					list_free(nodelist);
+					nodelist = common;
+				}
+				else
+					nodelist = list_copy(rel_loc_info->nodeList);
+			}
+		}
+		else if (rel_loc_info->locatorType == LOCATOR_TYPE_RROBIN)
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("Cannot reference a round robin table in a foreign key constraint")));
+		}
+		else if (IsLocatorDistributedByValue(rel_loc_info->locatorType))
+		{
+			ListCell   *fklc;
+			ListCell   *pklc;
+			char		ltype;
+			char	   *lattr;
+			bool		found = false;
+			List	   *common;
+
+			/*
+			 * First check nodes, they must be the same as in
+			 * the referenced relation
+			 */
+			if (cxt->subcluster)
+			{
+				common = list_intersection_int(nodelist, rel_loc_info->nodeList);
+				if (list_length(common) != list_length(rel_loc_info->nodeList) ||
+						list_length(common) != list_length(nodelist))
+				{
+					if (list_length(common) == 0)
+						ereport(ERROR,
+								(errcode(ERRCODE_SYNTAX_ERROR),
+								 errmsg("referenced HASH/MODULO table must be defined on same nodes")));
+				}
+				list_free(common);
+			}
+			else
+			{
+				if (nodelist)
+				{
+					common = list_intersection_int(nodelist, rel_loc_info->nodeList);
+					if (list_length(common) != list_length(rel_loc_info->nodeList))
+						ereport(ERROR,
+								(errcode(ERRCODE_SYNTAX_ERROR),
+								 errmsg("referenced HASH/MODULO table must be defined on same nodes")));
+					list_free(nodelist);
+					nodelist = common;
+				}
+				else
+					nodelist = list_copy(rel_loc_info->nodeList);
+				/* Now define the subcluster */
+				cxt->subcluster = makeSubCluster(nodelist);
+			}
+
+			if (cxt->distributeby)
+			{
+				ltype = ConvertToLocatorType(cxt->distributeby->disttype);
+				lattr = cxt->distributeby->colname;
+			}
+			else if (cxt->isalter)
+			{
+				if (cxt->rel->rd_locator_info == NULL)
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("Hash/Modulo distribution column does not refer"
+									" to hash/modulo distribution column in referenced table.")));
+				ltype = cxt->rel->rd_locator_info->locatorType;
+				lattr = cxt->rel->rd_locator_info->partAttrName;
+			}
+			else
+			{
+				/*
+				 * Not defined distribution, but we can define now.
+				 * The distribution must be the same as in referenced table,
+				 * distribution keys must be matching fk/pk
+				 */
+				/*
+				 * Can not define distribution by value already
+				 */
+				if (cxt->fallback_source == FBS_REPLICATE)
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("Hash/Modulo distribution column does not refer"
+									" to hash/modulo distribution column in referenced table.")));
+				/* find the fk attribute matching the distribution column */
+				lattr = NULL;
+				if (list_length(constraint->pk_attrs) == 0)
+				{
+					/*
+					 * PK attribute list may be missing, so FK must reference
+					 * the primary table's primary key. The primary key may
+					 * consist of multiple attributes, one of them is a
+					 * distribution key. We should find the foreign attribute
+					 * referencing that primary attribute and set it as the
+					 * distribution key of the table.
+					 */
+					int			pk_attr_idx;
+					Relation	rel;
+
+					rel = relation_open(pk_rel_id, AccessShareLock);
+					pk_attr_idx = find_relation_pk_dist_index(rel);
+					relation_close(rel, AccessShareLock);
+
+					if (pk_attr_idx >= 0 &&
+							pk_attr_idx < list_length(constraint->fk_attrs))
+					{
+						lattr = strVal(list_nth(constraint->fk_attrs, pk_attr_idx));
+					}
+				}
+				else
+				{
+					/*
+					 * One of the primary attributes must be the primary
+					 * table's distribution key. We should find the foreign
+					 * attribute referencing that primary attribute and set it
+					 * as the distribution key of the table.
+					 */
+					forboth(fklc, constraint->fk_attrs,
+							pklc, constraint->pk_attrs)
+					{
+						if (strcmp(rel_loc_info->partAttrName,
+								   strVal(lfirst(pklc))) == 0)
+						{
+							lattr = strVal(lfirst(fklc));
+							break;
+						}
+					}
+				}
+				/* distribution column is not referenced? */
+				if (lattr == NULL)
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("Hash/Modulo distribution column does not refer"
+									" to hash/modulo distribution column in referenced table.")));
+				/* the matching fk attribute must be a distribution candidate */
+				foreach(fklc, cxt->fallback_dist_cols)
+				{
+					if (strcmp(lattr, (char *) lfirst(fklc)) == 0)
+					{
+						found = true;
+						break;
+					}
+				}
+				if (found)
+				{
+					/* adopt referenced table's distribution on that column */
+					list_free_deep(cxt->fallback_dist_cols);
+					cxt->fallback_dist_cols = NIL;
+					cxt->fallback_source = FBS_NONE;
+					cxt->distributeby = makeNode(DistributeBy);
+					switch (rel_loc_info->locatorType)
+					{
+						case LOCATOR_TYPE_HASH:
+							cxt->distributeby->disttype = DISTTYPE_HASH;
+							cxt->distributeby->colname = pstrdup(lattr);
+							break;
+						case LOCATOR_TYPE_MODULO:
+							cxt->distributeby->disttype = DISTTYPE_MODULO;
+							cxt->distributeby->colname = pstrdup(lattr);
+							break;
+						default:
+							/* can not happen ?*/
+							ereport(ERROR,
+									(errcode(ERRCODE_SYNTAX_ERROR),
+									 errmsg("Hash/Modulo distribution column does not refer"
+											" to hash/modulo distribution column in referenced table.")));
+					}
+				}
+				else /* dist attr is not found */
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("Hash/Modulo distribution column does not refer"
+									" to hash/modulo distribution column in referenced table.")));
+				continue;
+			}
+			/*
+			 * Here determine if already defined distribution is matching
+			 * to distribution of primary table.
+			 */
+			if (ltype != rel_loc_info->locatorType || lattr == NULL)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("Hash/Modulo distribution column does not refer"
+								" to hash/modulo distribution column in referenced table.")));
+			if (list_length(constraint->pk_attrs) == 0)
+			{
+				/*
+				 * PK attribute list may be missing, so FK must reference
+				 * the primary table's primary key. The primary key may
+				 * consist of multiple attributes, one of them is a
+				 * distribution key. We should find the foreign attribute
+				 * referencing that primary attribute and make sure it is a
+				 * distribution key of the table.
+				 */
+				int			pk_attr_idx;
+				Relation	rel;
+
+				rel = relation_open(pk_rel_id, AccessShareLock);
+				pk_attr_idx = find_relation_pk_dist_index(rel);
+				relation_close(rel, AccessShareLock);
+
+				/*
+				 * Two first conditions are just avoid assertion failure in
+				 * list_nth. First should never happen, because the primary key
+				 * of hash/modulo distributed table must contain distribution
+				 * key. Second may only happen if list of foreign columns is
+				 * shorter then the primary key. In that case statement would
+				 * probably fail later, but no harm if it fails here.
+				 */
+				if (pk_attr_idx >= 0 &&
+						pk_attr_idx < list_length(constraint->fk_attrs) &&
+						strcmp(lattr, strVal(list_nth(constraint->fk_attrs,
+													  pk_attr_idx))) == 0)
+				{
+					found = true;
+				}
+			}
+			else
+			{
+				forboth(fklc, constraint->fk_attrs, pklc, constraint->pk_attrs)
+				{
+					if (strcmp(lattr, strVal(lfirst(fklc))) == 0)
+					{
+						found = true;
+						if (strcmp(rel_loc_info->partAttrName,
+								   strVal(lfirst(pklc))) == 0)
+							break;
+						else
+							ereport(ERROR,
+									(errcode(ERRCODE_SYNTAX_ERROR),
+									 errmsg("Hash/Modulo distribution column does not refer"
+											" to hash/modulo distribution column in referenced table.")));
+					}
+				}
+			}
+			if (!found)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("Hash/Modulo distribution column does not refer"
+								" to hash/modulo distribution column in referenced table.")));
+		}
+		else /* Unsupported distribution */
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("Cannot reference a table with distribution type \"%c\"",
+							rel_loc_info->locatorType)));
+		}
+#else
+		refloctype = GetLocatorType(pk_rel_id);
+		/* If referenced table is replicated, the constraint is safe */
+		if (refloctype == LOCATOR_TYPE_REPLICATED)
+			continue;
+		else if (refloctype == LOCATOR_TYPE_RROBIN)
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("Cannot reference a round robin table in a foreign key constraint")));
+		}
+		/*
+		 * See if we are hash or modulo partitioned and the column appears in the
+		 * constraint, and it corresponds to the position in the referenced table.
+		 */
+		if (cxt->isalter)
+		{
+			if (cxt->rel->rd_locator_info->locatorType == LOCATOR_TYPE_HASH ||
+				cxt->rel->rd_locator_info->locatorType == LOCATOR_TYPE_MODULO)
+			{
+				checkcolname = cxt->rel->rd_locator_info->partAttrName;
+			}
+		}
+		else
+		{
+			if (cxt->distributeby)
+			{
+				if (cxt->distributeby->disttype == DISTTYPE_HASH ||
+					cxt->distributeby->disttype == DISTTYPE_MODULO)
+					checkcolname = cxt->distributeby->colname;
+			}
+			else
+			{
+				if (cxt->fallback_dist_col)
+					checkcolname = cxt->fallback_dist_col;
+			}
+		}
+		if (checkcolname)
+		{
+			int pos = 0;
+
+			ListCell *attritem;
+
+			foreach(attritem, constraint->fk_attrs)
+			{
+				char *attrname = (char *) strVal(lfirst(attritem));
+
+				if (strcmp(checkcolname, attrname) == 0)
+				{
+					/* Found the ordinal position in constraint */
+					break;
+				}
+				pos++;
+			}
+
+			if (pos >= list_length(constraint->fk_attrs))
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("Hash/Modulo distributed table must include distribution column in index")));
+
+			/*
+			 * The check to make sure that the referenced column in pk table is the same
+			 * as the one used to distribute it makes sense only when the user
+			 * supplies the name of the referenced colum while adding the constraint
+			 * because if the user did not specify it the system will choose the pk column
+			 * which will obviously be the one used to distribute it knowing the
+			 * existing constraints in XC
+			 * This is required to make sure that both
+			 * alter table dtab add foreign key (b) references rtab(a);
+			 * and
+			 * alter table dtab add foreign key (b) references rtab;
+			 * behave similarly
+			 */
+			if (constraint->pk_attrs != NULL)
+			{
+				/* Verify that the referenced table is partitioned at the same position in the index */
+				if (!IsDistColumnForRelId(pk_rel_id, strVal(list_nth(constraint->pk_attrs,pos))))
+					ereport(ERROR,
+							(errcode(ERRCODE_SYNTAX_ERROR),
+							 errmsg("Hash/Modulo distribution column does not refer to hash/modulo distribution column in referenced table.")));
+			}
+		}
+#endif
+	}
+#ifdef XCP
+	/*
+	 * If presence of a foreign constraint suggested a set of nodes, fix it here
+	 */
+	if (nodelist && cxt->subcluster == NULL)
+		cxt->subcluster = makeSubCluster(nodelist);
+#endif
+}
+#endif
+
+
+#ifdef XCP
+/*
+ * Convert SubCluster definition to a list of Datanode indexes, to compare to
+ * relation nodes
+ *
+ * subcluster - the TO NODE/TO GROUP clause being transformed
+ *
+ * Returns a freshly built integer list of Datanode indexes (as produced by
+ * PGXCNodeGetNodeId), the same representation used in RelationLocInfo's
+ * nodeList, so the two can be intersected/compared directly.
+ */
+static List *
+transformSubclusterNodes(PGXCSubCluster *subcluster)
+{
+	List	   *result = NIL;
+	Oid		   *nodeoids;
+	int			numnodes;
+	int			i;
+	char		nodetype = PGXC_NODE_DATANODE;
+
+	/* Resolve the subcluster clause to node OIDs, then map OIDs to indexes */
+	nodeoids = GetRelationDistributionNodes(subcluster, &numnodes);
+	for (i = 0; i < numnodes; i++)
+		result = lappend_int(result, PGXCNodeGetNodeId(nodeoids[i], &nodetype));
+
+	return result;
+}
+
+
+/*
+ * Create a SubCluster definition from a list of node indexes.
+ *
+ * nodelist - integer list of Datanode indexes (inverse of
+ *            transformSubclusterNodes)
+ *
+ * Returns a SUBCLUSTER_NODE PGXCSubCluster node whose members are the
+ * node names corresponding to the given indexes.
+ */
+static PGXCSubCluster *
+makeSubCluster(List *nodelist)
+{
+	PGXCSubCluster *result;
+	ListCell	   *lc;
+	result = makeNode(PGXCSubCluster);
+	result->clustertype = SUBCLUSTER_NODE;
+	foreach (lc, nodelist)
+	{
+		int 	nodeidx = lfirst_int(lc);
+		/* map node index back to OID, then to its catalog name */
+		char   *nodename = get_pgxc_nodename(
+				PGXCNodeGetNodeOid(nodeidx, PGXC_NODE_DATANODE));
+		result->members = lappend(result->members, makeString(nodename));
+	}
+	return result;
+}
+#endif
diff --git a/src/backend/pgxc/Makefile b/src/backend/pgxc/Makefile
index 1fe1c12d02..9786b3b16a 100644
--- a/src/backend/pgxc/Makefile
+++ b/src/backend/pgxc/Makefile
@@ -11,6 +11,6 @@ subdir = src/backend/pgxc
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-SUBDIRS = locator pool barrier nodemgr copy xc_maintenance_mode
+SUBDIRS = locator plan pool barrier nodemgr squeue cluster copy xc_maintenance_mode
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/cluster/Makefile b/src/backend/pgxc/cluster/Makefile
new file mode 100644
index 0000000000..85c1d493f0
--- /dev/null
+++ b/src/backend/pgxc/cluster/Makefile
@@ -0,0 +1,17 @@
+#-------------------------------------------------------------------------
+#
+# Makefile --
+# Makefile for cluster functionality
+#
+# IDENTIFICATION
+# $PostgreSQL$
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/pgxc/cluster
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = pause.o stormutils.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/cluster/pause.c b/src/backend/pgxc/cluster/pause.c
new file mode 100644
index 0000000000..ecac0950b0
--- /dev/null
+++ b/src/backend/pgxc/cluster/pause.c
@@ -0,0 +1,480 @@
+/*-------------------------------------------------------------------------
+ *
+ * pause.c
+ *
+ * Cluster Pause/Unpause handling
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifdef XCP
+#include "postgres.h"
+#include "pgxc/execRemote.h"
+#include "pgxc/pause.h"
+#include "pgxc/pgxc.h"
+#include "storage/spin.h"
+#include "miscadmin.h"
+
+/* globals */
+bool cluster_lock_held;
+bool cluster_ex_lock_held;
+
+static void HandleClusterPause(bool pause, bool initiator);
+static void ProcessClusterPauseRequest(bool pause);
+
+ClusterLockInfo *ClustLinfo = NULL;
+
+/*
+ * ProcessClusterPauseRequest:
+ *
+ * Carry out PAUSE/UNPAUSE request on a coordinator node
+ *
+ * pause - true for PAUSE CLUSTER, false for UNPAUSE CLUSTER
+ *
+ * Must only run on a coordinator that received the request from another
+ * coordinator (the initiator handles itself in HandleClusterPause).
+ * Swaps the locally held cluster lock mode: shared -> exclusive on PAUSE,
+ * exclusive -> shared on UNPAUSE, updating cluster_ex_lock_held to match.
+ */
+static void
+ProcessClusterPauseRequest(bool pause)
+{
+	char	*action = pause? "PAUSE":"UNPAUSE";
+
+	if (!IS_PGXC_COORDINATOR || !IsConnFromCoord())
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("The %s CLUSTER message is expected to "
+						"arrive at a coordinator from another coordinator",
+						action)));
+
+	elog(DEBUG2, "Received %s CLUSTER from a coordinator", action);
+
+	/*
+	 * If calling UNPAUSE, ensure that the cluster lock has already been held
+	 * in exclusive mode
+	 */
+	if (!pause && !cluster_ex_lock_held)
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("Received an UNPAUSE request when cluster not PAUSED!")));
+
+	/*
+	 * Enable/Disable local queries. We need to release the lock first
+	 *
+	 * TODO: Think of some timeout mechanism here, if the locking takes too
+	 * much time...
+	 */
+	ReleaseClusterLock(pause? false:true);
+	AcquireClusterLock(pause? true:false);
+
+	if (pause)
+		cluster_ex_lock_held = true;
+	else
+		cluster_ex_lock_held = false;
+
+	elog(DEBUG2, "%s queries at the coordinator", pause? "Paused":"Resumed");
+
+	return;
+}
+
+/*
+ * HandleClusterPause:
+ *
+ * Drive a PAUSE/UNPAUSE CLUSTER operation.
+ *
+ * pause     - true for PAUSE, false for UNPAUSE
+ * initiator - true when this coordinator received the command from a client
+ *             and must propagate it to all other coordinators; false when
+ *             the request arrived from another coordinator, in which case
+ *             only the local lock state is changed.
+ *
+ * Any errors will be reported via ereport.
+ */
+static void
+HandleClusterPause(bool pause, bool initiator)
+{
+	PGXCNodeAllHandles *coord_handles;
+	int conn;
+	int response;
+	char *action = pause? "PAUSE":"UNPAUSE";
+
+	elog(DEBUG2, "Preparing coordinators for %s CLUSTER", action);
+
+	/* Idempotence: PAUSE while already paused is a no-op NOTICE */
+	if (pause && cluster_ex_lock_held)
+	{
+		ereport(NOTICE, (errmsg("CLUSTER already PAUSED")));
+
+		/* Nothing to do */
+		return;
+	}
+
+	/* UNPAUSE without a preceding PAUSE is likewise a no-op NOTICE */
+	if (!pause && !cluster_ex_lock_held)
+	{
+		ereport(NOTICE, (errmsg("Issue PAUSE CLUSTER before calling UNPAUSE")));
+
+		/* Nothing to do */
+		return;
+	}
+
+	/*
+	 * If we are one of the participating coordinators, just do the action
+	 * locally and return
+	 */
+	if (!initiator)
+	{
+		ProcessClusterPauseRequest(pause);
+		return;
+	}
+
+	/*
+	 * Send a PAUSE/UNPAUSE CLUSTER message to all the coordinators. We should send an
+	 * asyncronous request, update the local ClusterLock and then wait for the remote
+	 * coordinators to respond back
+	 */
+
+	coord_handles = get_handles(NIL, GetAllCoordNodes(), true);
+
+	for (conn = 0; conn < coord_handles->co_conn_count; conn++)
+	{
+		PGXCNodeHandle *handle = coord_handles->coord_handles[conn];
+
+		if (pgxc_node_send_query(handle, pause? "PAUSE CLUSTER" : "UNPAUSE CLUSTER") != 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Failed to send %s CLUSTER request to some coordinator nodes",action)));
+	}
+
+	/*
+	 * Disable/Enable local queries. We need to release the SHARED mode first
+	 *
+	 * TODO: Start a timer to cancel the request in case of a timeout
+	 */
+	ReleaseClusterLock(pause? false:true);
+	AcquireClusterLock(pause? true:false);
+
+	if (pause)
+		cluster_ex_lock_held = true;
+	else
+		cluster_ex_lock_held = false;
+
+
+	elog(DEBUG2, "%s queries at the driving coordinator", pause? "Paused":"Resumed");
+
+	/*
+	 * Local queries are paused/enabled. Check status of the remote coordinators
+	 * now. We need a TRY/CATCH block here, so that if one of the coordinator
+	 * fails for some reason, we can try best-effort to salvage the situation
+	 * at others
+	 *
+	 * We hope that errors in the earlier loop generally do not occur (out of
+	 * memory and improper handles..) or we can have a similar TRY/CATCH block
+	 * there too
+	 *
+	 * To repeat: All the salvaging is best effort really...
+	 */
+	PG_TRY();
+	{
+		ResponseCombiner combiner;
+
+		InitResponseCombiner(&combiner, coord_handles->co_conn_count, COMBINE_TYPE_NONE);
+		for (conn = 0; conn < coord_handles->co_conn_count; conn++)
+		{
+			PGXCNodeHandle *handle;
+
+			handle = coord_handles->coord_handles[conn];
+
+			/* Drain each connection until its command completes or errors */
+			while (true)
+			{
+				if (pgxc_node_receive(1, &handle, NULL))
+					ereport(ERROR,
+							(errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("Failed to receive a response from the remote coordinator node")));
+
+				response = handle_response(handle, &combiner);
+				if (response == RESPONSE_EOF)
+					continue;
+				else if (response == RESPONSE_COMPLETE)
+					break;
+				else
+					ereport(ERROR,
+							(errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("%s CLUSTER command failed "
+									"with error %s", action, handle->error)));
+			}
+		}
+
+		/* Re-raise any error the combiner collected, preserving its SQLSTATE */
+		if (combiner.errorMessage)
+		{
+			char *code = combiner.errorCode;
+			if (combiner.errorDetail != NULL)
+				ereport(ERROR,
+						(errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+						 errmsg("%s", combiner.errorMessage), errdetail("%s", combiner.errorDetail) ));
+			else
+				ereport(ERROR,
+						(errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+						 errmsg("%s", combiner.errorMessage)));
+		}
+
+		CloseCombiner(&combiner);
+	}
+	PG_CATCH();
+	{
+		/*
+		 * If PAUSE CLUSTER, issue UNPAUSE on the reachable nodes. For failure
+		 * in cases of UNPAUSE, might need manual intervention at the offending
+		 * coordinator node (maybe do a pg_cancel_backend() on the backend
+		 * that's holding the exclusive lock or something..)
+		 */
+		if (!pause)
+			ereport(WARNING,
+					(errmsg("UNPAUSE CLUSTER command failed on one or more coordinator nodes."
+							" Manual intervention may be required!")));
+		else
+			ereport(WARNING,
+					(errmsg("PAUSE CLUSTER command failed on one or more coordinator nodes."
+							" Trying to UNPAUSE reachable nodes now")));
+
+		for (conn = 0; conn < coord_handles->co_conn_count && pause; conn++)
+		{
+			PGXCNodeHandle *handle = coord_handles->coord_handles[conn];
+
+			(void) pgxc_node_send_query(handle, "UNPAUSE CLUSTER");
+
+			/*
+			 * The incoming data should hopefully be discarded as part of
+			 * cleanup..
+			 */
+		}
+
+		/* cleanup locally.. */
+		ReleaseClusterLock(pause? true:false);
+		AcquireClusterLock(pause? false:true);
+		cluster_ex_lock_held = false;
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+
+	elog(DEBUG2, "Successfully completed %s CLUSTER command on "
+			"all coordinator nodes", action);
+
+	return;
+}
+
+/*
+ * RequestClusterPause
+ *
+ * Entry point for the PAUSE/UNPAUSE CLUSTER utility command.
+ *
+ * pause         - true for PAUSE CLUSTER, false for UNPAUSE CLUSTER
+ * completionTag - if non-NULL, receives the command tag to report
+ *
+ * Requires superuser privileges and must run on a coordinator; the actual
+ * work is delegated to HandleClusterPause.
+ */
+void
+RequestClusterPause(bool pause, char *completionTag)
+{
+	char	*action = pause? "PAUSE":"UNPAUSE";
+	bool	initiator = true;
+
+	elog(DEBUG2, "%s CLUSTER request received", action);
+
+	/* Only a superuser can perform this activity on a cluster */
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("%s CLUSTER command: must be a superuser", action)));
+
+	/* Ensure that we are a coordinator */
+	if (!IS_PGXC_COORDINATOR)
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("%s CLUSTER command must be sent to a coordinator", action)));
+
+	/*
+	 * Did the command come directly to this coordinator or via another
+	 * coordinator?
+	 */
+	if (IsConnFromCoord())
+		initiator = false;
+
+	HandleClusterPause(pause, initiator);
+
+	if (completionTag)
+		snprintf(completionTag, COMPLETION_TAG_BUFSIZE, "%s CLUSTER", action);
+}
+
+/*
+ * If the backend is shutting down, cleanup the PAUSE cluster lock
+ * appropriately. We do this before shutting down shmem, because this needs
+ * LWLock and stuff
+ *
+ * Registered as an on_shmem_exit style callback (code/arg are the standard
+ * callback parameters and are unused here).  Releases a held shared lock,
+ * and if this backend is the PAUSE initiator holding the exclusive lock,
+ * best-effort sends UNPAUSE CLUSTER to the other coordinators before
+ * releasing locally.
+ */
+void
+PGXCCleanClusterLock(int code, Datum arg)
+{
+	PGXCNodeAllHandles *coord_handles;
+	int conn;
+
+	/* Holding only the shared lock: just drop it and we are done */
+	if (cluster_lock_held && !cluster_ex_lock_held)
+	{
+		ReleaseClusterLock (false);
+		cluster_lock_held = false;
+	}
+
+	/* Do nothing if cluster lock not held */
+	if (!cluster_ex_lock_held)
+		return;
+
+	/* Do nothing if we are not the initiator */
+	if (IsConnFromCoord())
+		return;
+
+	coord_handles = get_handles(NIL, GetAllCoordNodes(), true);
+	/* Try best-effort to UNPAUSE other coordinators now */
+	for (conn = 0; conn < coord_handles->co_conn_count; conn++)
+	{
+		PGXCNodeHandle *handle = coord_handles->coord_handles[conn];
+
+		/* No error checking here... */
+		(void)pgxc_node_send_query(handle, "UNPAUSE CLUSTER");
+	}
+
+	/* Release locally too. We do not want a dangling value in cl_holder_pid! */
+	ReleaseClusterLock(true);
+	cluster_ex_lock_held = false;
+}
+
+/*
+ * Report shared memory space needed by ClusterLockShmemInit:
+ * a single ClusterLockInfo struct.
+ */
+Size
+ClusterLockShmemSize(void)
+{
+	Size size = 0;
+
+	size = add_size(size, sizeof(ClusterLockInfo));
+
+	return size;
+}
+
+/*
+ * Allocate and initialize cluster locking related shared memory.
+ * Called once during shared-memory setup; attaches to the existing
+ * segment (found == true) on re-initialization instead of clobbering it.
+ */
+void
+ClusterLockShmemInit(void)
+{
+	bool found;
+
+	ClustLinfo = (ClusterLockInfo *)
+		ShmemInitStruct("Cluster Lock Info", ClusterLockShmemSize(), &found);
+
+	if (!found)
+	{
+		/* First time through, so initialize */
+		MemSet(ClustLinfo, 0, ClusterLockShmemSize());
+		SpinLockInit(&ClustLinfo->cl_mutex);
+	}
+}
+
+/*
+ * AcquireClusterLock
+ *
+ * Based on the argument passed in, try to update the shared memory
+ * appropriately. In case the conditions cannot be satisfied immediately this
+ * function resorts to a simple sleep. We don't envision PAUSE CLUSTER to
+ * occur that frequently so most of the calls will come out immediately here
+ * without any sleeps at all
+ *
+ * exclusive - true to take the PAUSE CLUSTER exclusive lock, false for the
+ *             shared (per-backend) lock.  Shared mode increments
+ *             cl_process_count; exclusive mode records this backend's pid
+ *             in cl_holder_pid once no shared holders remain.
+ *
+ * We could have used a semaphore to allow the processes to sleep while the
+ * cluster lock is held. But again we are really not worried about performance
+ * and immediate wakeups around PAUSE CLUSTER functionality. Using the sleep
+ * in an infinite loop keeps things simple yet correct
+ */
+void
+AcquireClusterLock(bool exclusive)
+{
+	volatile ClusterLockInfo *clinfo = ClustLinfo;
+
+	/* Re-entrant for the exclusive holder: nothing more to do */
+	if (exclusive && cluster_ex_lock_held)
+	{
+		return;
+	}
+
+	/*
+	 * In the normal case, none of the backends will ask for exclusive lock, so
+	 * they will just update the cl_process_count value and exit immediately
+	 * from the below loop
+	 */
+	for (;;)
+	{
+		bool wait = false;
+
+		SpinLockAcquire(&clinfo->cl_mutex);
+
+		if (!exclusive)
+		{
+			/* Shared mode: blocked only while a PAUSE holder is registered */
+			if (clinfo->cl_holder_pid == 0)
+				clinfo->cl_process_count++;
+			else
+				wait = true;
+		}
+		else /* PAUSE CLUSTER handling */
+		{
+			/* Only one PAUSE at a time may be in progress cluster-wide */
+			if (clinfo->cl_holder_pid != 0)
+			{
+				SpinLockRelease(&clinfo->cl_mutex);
+				ereport(ERROR,
+						(errcode(ERRCODE_INTERNAL_ERROR),
+						 errmsg("PAUSE CLUSTER already in progress")));
+			}
+
+			/*
+			 * There should be no other process
+			 * holding the lock including ourself
+			 */
+			if (clinfo->cl_process_count > 0)
+				wait = true;
+			else
+				clinfo->cl_holder_pid = MyProcPid;
+		}
+		SpinLockRelease(&clinfo->cl_mutex);
+
+		/*
+		 * We use a simple sleep mechanism. If PAUSE CLUSTER has been invoked,
+		 * we are not worried about immediate performance characteristics..
+		 */
+		if (wait)
+		{
+			CHECK_FOR_INTERRUPTS();
+			pg_usleep(100000L);
+		}
+		else /* Got the proper semantic read/write lock.. */
+			break;
+	}
+}
+
+/*
+ * ReleaseClusterLock
+ *
+ * Update the shared memory appropriately across the release call. We
+ * really do not need the bool argument, but it's there for some
+ * additional sanity checking
+ *
+ * exclusive - must match the mode the lock was acquired in: true releases
+ *             the PAUSE CLUSTER exclusive lock (clears cl_holder_pid),
+ *             false releases a shared hold (decrements cl_process_count).
+ *
+ * Fix: the shared-mode decrement was guarded by "if (count > 0);" with a
+ * stray trailing semicolon, making the decrement unconditional and letting
+ * cl_process_count go negative on an unbalanced release.
+ */
+void
+ReleaseClusterLock(bool exclusive)
+{
+	volatile ClusterLockInfo *clinfo = ClustLinfo;
+
+	SpinLockAcquire(&clinfo->cl_mutex);
+	if (exclusive)
+	{
+		/* Exclusive release requires a registered holder and no extra sharers */
+		if (clinfo->cl_process_count > 1 ||
+			clinfo->cl_holder_pid == 0)
+		{
+			SpinLockRelease(&clinfo->cl_mutex);
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Inconsistent state while doing UNPAUSE CLUSTER")));
+		}
+
+		/*
+		 * Reset the holder pid. Any waiters in AcquireClusterLock will
+		 * eventually come out of their sleep and notice this new value and
+		 * move ahead
+		 */
+		clinfo->cl_holder_pid = 0;
+	}
+	else
+	{
+		/* Shared release is invalid while a PAUSE holder is registered */
+		if (clinfo->cl_holder_pid != 0)
+		{
+			SpinLockRelease(&clinfo->cl_mutex);
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("Inconsistent state while releasing CLUSTER lock")));
+		}
+		/*
+		 * Decrement our count. If a PAUSE is waiting inside AcquireClusterLock
+		 * elsewhere, it will wake out of sleep and do the needful
+		 */
+		if (clinfo->cl_process_count > 0)
+			clinfo->cl_process_count--;
+	}
+	SpinLockRelease(&clinfo->cl_mutex);
+}
+#endif
diff --git a/src/backend/pgxc/cluster/stormutils.c b/src/backend/pgxc/cluster/stormutils.c
new file mode 100644
index 0000000000..26b00d4ac5
--- /dev/null
+++ b/src/backend/pgxc/cluster/stormutils.c
@@ -0,0 +1,46 @@
+/*-------------------------------------------------------------------------
+ *
+ * stormutils.c
+ *
+ * Miscellaneous util functions
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifdef XCP
+#include "postgres.h"
+#include "miscadmin.h"
+
+#include "utils/builtins.h"
+#include "../interfaces/libpq/libpq-fe.h"
+#include "commands/dbcommands.h"
+
+/*
+ * stormdb_promote_standby:
+ *
+ * Promote a standby into a regular backend by touching the trigger file. We
+ * cannot do it from outside via a normal shell script because this function
+ * needs to be called in context of the operation that is moving the node.
+ * Providing a function call provides some sense of transactional atomicity
+ *
+ * SQL-callable (fmgr V1 convention); takes no arguments and returns void.
+ * Creates an empty "stormdb.failover" file in the data directory, which the
+ * standby's recovery machinery is expected to watch as its promote trigger.
+ * Raises an ERROR if the file cannot be created.
+ */
+Datum
+stormdb_promote_standby(PG_FUNCTION_ARGS)
+{
+	char trigger_file[MAXPGPATH];
+	FILE *fp;
+
+	snprintf(trigger_file, MAXPGPATH, "%s/stormdb.failover", DataDir);
+
+	/* An empty file suffices; only its existence is checked by recovery */
+	if ((fp = fopen(trigger_file, "w")) == NULL)
+		ereport(ERROR,
+				(errmsg("could not create trigger file"),
+				 errdetail("The trigger file path was: %s",
+						   trigger_file)));
+	fclose(fp);
+
+	PG_RETURN_VOID();
+}
+#endif
diff --git a/src/backend/pgxc/copy/remotecopy.c b/src/backend/pgxc/copy/remotecopy.c
index 016ea1425b..2422f25de8 100644
--- a/src/backend/pgxc/copy/remotecopy.c
+++ b/src/backend/pgxc/copy/remotecopy.c
@@ -3,6 +3,11 @@
* remotecopy.c
* Implements an extension of COPY command for remote management
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 2010-2012, Postgres-XC Development Group
*
@@ -16,13 +21,16 @@
#include "postgres.h"
#include "miscadmin.h"
#include "lib/stringinfo.h"
-#include "optimizer/pgxcship.h"
#include "optimizer/planner.h"
#include "pgxc/pgxcnode.h"
+#include "pgxc/postgresql_fdw.h"
#include "pgxc/remotecopy.h"
#include "rewrite/rewriteHandler.h"
#include "utils/builtins.h"
#include "utils/rel.h"
+#ifdef PGXC
+#include "utils/lsyscache.h"
+#endif
static void RemoteCopy_QuoteStr(StringInfo query_buf, char *value);
@@ -37,7 +45,11 @@ RemoteCopy_GetRelationLoc(RemoteCopyData *state,
Relation rel,
List *attnums)
{
+#ifndef XCP
+ ExecNodes *exec_nodes = makeNode(ExecNodes);
+#else
ExecNodes *exec_nodes = NULL;
+#endif
/*
* If target table does not exists on nodes (e.g. system table)
@@ -46,6 +58,23 @@ RemoteCopy_GetRelationLoc(RemoteCopyData *state,
*/
state->rel_loc = GetRelationLocInfo(RelationGetRelid(rel));
+#ifdef XCP
+ if (state->rel_loc &&
+ AttributeNumberIsValid(state->rel_loc->partAttrNum))
+ {
+ TupleDesc tdesc;
+ Form_pg_attribute pattr;
+ /* determine distribution column data type */
+ tdesc = RelationGetDescr(rel);
+
+ pattr = tdesc->attrs[state->rel_loc->partAttrNum - 1];
+ state->dist_type = pattr->atttypid;
+ }
+ else
+ state->dist_type = InvalidOid;
+
+ state->locator = NULL;
+#else
if (state->rel_loc)
{
/*
@@ -55,7 +84,7 @@ RemoteCopy_GetRelationLoc(RemoteCopyData *state,
*/
exec_nodes = makeNode(ExecNodes);
if (!state->is_from &&
- IsRelationReplicated(state->rel_loc))
+ IsLocatorReplicated(state->rel_loc->locatorType))
exec_nodes->nodeList = GetPreferredReplicationNode(state->rel_loc->nodeList);
else
{
@@ -96,6 +125,7 @@ RemoteCopy_GetRelationLoc(RemoteCopyData *state,
/* Then save obtained result */
state->exec_nodes = exec_nodes;
+#endif
}
/*
@@ -119,8 +149,18 @@ RemoteCopy_BuildStatement(RemoteCopyData *state,
*/
initStringInfo(&state->query_buf);
appendStringInfoString(&state->query_buf, "COPY ");
- appendStringInfo(&state->query_buf, "%s",
- quote_identifier(RelationGetRelationName(rel)));
+
+ /*
+ * The table name should be qualified, unless the table is a temporary table
+ */
+ if (rel->rd_backend == MyBackendId)
+ appendStringInfo(&state->query_buf, "%s",
+ quote_identifier(RelationGetRelationName(rel)));
+ else
+ appendStringInfo(&state->query_buf, "%s",
+ quote_qualified_identifier(
+ get_namespace_name(RelationGetNamespace(rel)),
+ RelationGetRelationName(rel)));
if (attnamelist)
{
@@ -308,16 +348,19 @@ FreeRemoteCopyData(RemoteCopyData *state)
/* Leave if nothing */
if (state == NULL)
return;
-
+#ifdef XCP
+ if (state->locator)
+ freeLocator(state->locator);
+#else
if (state->connections)
pfree(state->connections);
+#endif
if (state->query_buf.data)
pfree(state->query_buf.data);
FreeRelationLocInfo(state->rel_loc);
pfree(state);
}
-
#define APPENDSOFAR(query_buf, start, current) \
if (current > start) \
appendBinaryStringInfo(query_buf, start, current - start)
diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c
index 432ae502be..ff531649cb 100644
--- a/src/backend/pgxc/locator/locator.c
+++ b/src/backend/pgxc/locator/locator.c
@@ -6,6 +6,11 @@
*
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
@@ -49,14 +54,63 @@
#include "catalog/pgxc_node.h"
#include "catalog/namespace.h"
#include "access/hash.h"
+#ifdef XCP
+#include "utils/date.h"
+#include "utils/memutils.h"
+
+/*
+ * Locator details are private
+ */
+struct _Locator
+{
+ /*
+ * Determine target nodes for value.
+ * Resulting nodes are stored to the results array.
+ * Function returns number of node references written to the array.
+ */
+ int (*locatefunc) (Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+ Oid dataType; /* values of that type are passed to locateNodes function */
+ LocatorListType listType;
+ bool primary;
+ /* locator-specific data */
+ /* XXX: move them into union ? */
+ int roundRobinNode; /* for LOCATOR_TYPE_RROBIN */
+ LocatorHashFunc hashfunc; /* for LOCATOR_TYPE_HASH */
+ int valuelen; /* 1, 2 or 4 for LOCATOR_TYPE_MODULO */
+
+ int nodeCount; /* How many nodes are in the map */
+ void *nodeMap; /* map index to node reference according to listType */
+ void *results; /* array to output results */
+};
+#endif
-static Expr *pgxc_find_distcol_expr(Index varno, AttrNumber attrNum,
+#ifndef XCP
+static Expr *pgxc_find_distcol_expr(Index varno, PartAttrNumber partAttrNum,
Node *quals);
+#endif
Oid primary_data_node = InvalidOid;
int num_preferred_data_nodes = 0;
Oid preferred_data_node[MAX_PREFERRED_NODES];
+#ifdef XCP
+static int modulo_value_len(Oid dataType);
+static LocatorHashFunc hash_func_ptr(Oid dataType);
+static int locate_static(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_roundrobin(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_hash_insert(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_hash_select(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_modulo_insert(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+static int locate_modulo_select(Locator *self, Datum value, bool isnull,
+ bool *hasprimary);
+#endif
+
static const unsigned int xc_mod_m[] =
{
0x00000000, 0x55555555, 0x33333333, 0xc71c71c7,
@@ -120,6 +174,59 @@ static const unsigned int xc_mod_r[][6] =
{0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}
};
+
+#ifdef XCP
+/*
+ * GetAnyDataNode
+ * Pick any data node from given set, but try a preferred node
+ */
+int
+GetAnyDataNode(Bitmapset *nodes)
+{
+ Bitmapset *preferred = NULL;
+ int i, nodeid;
+ int nmembers = 0;
+ int members[NumDataNodes];
+
+ for (i = 0; i < num_preferred_data_nodes; i++)
+ {
+ char ntype = PGXC_NODE_DATANODE;
+ nodeid = PGXCNodeGetNodeId(preferred_data_node[i], &ntype);
+
+ /* OK, found one */
+ if (bms_is_member(nodeid, nodes))
+ preferred = bms_add_member(preferred, nodeid);
+ }
+
+ /*
+ * If no preferred data nodes or they are not in the desired set, pick up
+ * from the original set.
+ */
+ if (bms_is_empty(preferred))
+ preferred = bms_copy(nodes);
+
+ /*
+ * Load balance.
+ * We can not get item from the set, convert it to array
+ */
+ while ((nodeid = bms_first_member(preferred)) >= 0)
+ members[nmembers++] = nodeid;
+ bms_free(preferred);
+
+ /* If there is a single member nothing to balance */
+ if (nmembers == 1)
+ return members[0];
+
+ /*
+ * In general, the set may contain any number of nodes, and if we save
+ * previous returned index for load balancing the distribution won't be
+ * flat, because small set will probably reset saved value, and lower
+ * indexes will be picked up more often.
+ * So we just get a random value from 0..nmembers-1.
+ */
+ return members[((unsigned int) random()) % nmembers];
+}
+#else
/*
* GetPreferredReplicationNode
* Pick any Datanode from given list, however fetch a preferred node first.
@@ -127,31 +234,39 @@ static const unsigned int xc_mod_r[][6] =
List *
GetPreferredReplicationNode(List *relNodes)
{
- ListCell *item;
- int nodeid = -1;
-
- if (list_length(relNodes) <= 0)
- elog(ERROR, "a list of nodes should have at least one node");
-
- foreach(item, relNodes)
+ /*
+ * Try to find the first node in given list relNodes
+ * that is in the list of preferred nodes
+ */
+ if (num_preferred_data_nodes != 0)
{
- int cnt_nodes;
- for (cnt_nodes = 0;
- cnt_nodes < num_preferred_data_nodes && nodeid < 0;
- cnt_nodes++)
+ ListCell *item;
+ foreach(item, relNodes)
{
- if (PGXCNodeGetNodeId(preferred_data_node[cnt_nodes],
- PGXC_NODE_DATANODE) == lfirst_int(item))
- nodeid = lfirst_int(item);
+ int relation_nodeid = lfirst_int(item);
+ int i;
+ for (i = 0; i < num_preferred_data_nodes; i++)
+ {
+#ifdef XCP
+ char nodetype = PGXC_NODE_DATANODE;
+ int nodeid = PGXCNodeGetNodeId(preferred_data_node[i],
+ &nodetype);
+#else
+ int nodeid = PGXCNodeGetNodeId(preferred_data_node[i], PGXC_NODE_DATANODE);
+#endif
+
+ /* OK, found one */
+ if (nodeid == relation_nodeid)
+ return lappend_int(NULL, nodeid);
+ }
}
- if (nodeid >= 0)
- break;
}
- if (nodeid < 0)
- return list_make1_int(linitial_int(relNodes));
- return list_make1_int(nodeid);
+ /* Nothing found? Return the first one in relation node list */
+ return lappend_int(NULL, linitial_int(relNodes));
}
+#endif
+
/*
* compute_modulo
@@ -206,6 +321,7 @@ compute_modulo(unsigned int numerator, unsigned int denominator)
return numerator % denominator;
}
+#ifndef XCP
/*
* get_node_from_modulo - determine node based on modulo
*
@@ -219,57 +335,157 @@ get_node_from_modulo(int modulo, List *nodeList)
return list_nth_int(nodeList, modulo);
}
+#endif
/*
- * GetRelationDistribColumn
- * Return hash column name for relation or NULL if relation is not distributed.
+ * GetRelationDistColumn - Returns the name of the hash or modulo distribution column
+ * First hash distribution is checked
+ *   Returns NULL if the table is neither hash nor modulo distributed
*/
char *
-GetRelationDistribColumn(RelationLocInfo *locInfo)
+GetRelationDistColumn(RelationLocInfo * rel_loc_info)
{
- /* No relation, so simply leave */
- if (!locInfo)
- return NULL;
+char *pColName;
- /* No distribution column if relation is not distributed with a key */
- if (!IsRelationDistributedByValue(locInfo))
- return NULL;
+ pColName = NULL;
- /* Return column name */
- return get_attname(locInfo->relid, locInfo->partAttrNum);
+ pColName = GetRelationHashColumn(rel_loc_info);
+ if (pColName == NULL)
+ pColName = GetRelationModuloColumn(rel_loc_info);
+
+ return pColName;
}
+/*
+ * Returns whether or not the data type is hash distributable with PG-XC
+ * PGXCTODO - expand support for other data types!
+ */
+bool
+IsTypeHashDistributable(Oid col_type)
+{
+#ifdef XCP
+ return (hash_func_ptr(col_type) != NULL);
+#else
+ if(col_type == INT8OID
+ || col_type == INT2OID
+ || col_type == OIDOID
+ || col_type == INT4OID
+ || col_type == BOOLOID
+ || col_type == CHAROID
+ || col_type == NAMEOID
+ || col_type == INT2VECTOROID
+ || col_type == TEXTOID
+ || col_type == OIDVECTOROID
+ || col_type == FLOAT4OID
+ || col_type == FLOAT8OID
+ || col_type == ABSTIMEOID
+ || col_type == RELTIMEOID
+ || col_type == CASHOID
+ || col_type == BPCHAROID
+ || col_type == BYTEAOID
+ || col_type == VARCHAROID
+ || col_type == DATEOID
+ || col_type == TIMEOID
+ || col_type == TIMESTAMPOID
+ || col_type == TIMESTAMPTZOID
+ || col_type == INTERVALOID
+ || col_type == TIMETZOID
+ || col_type == NUMERICOID
+ )
+ return true;
+
+ return false;
+#endif
+}
/*
- * IsDistribColumn
- * Return whether column for relation is used for distribution or not.
+ * GetRelationHashColumn - return hash column for relation.
+ *
+ * Returns NULL if the relation is not hash partitioned.
+ */
+char *
+GetRelationHashColumn(RelationLocInfo * rel_loc_info)
+{
+ char *column_str = NULL;
+
+ if (rel_loc_info == NULL)
+ column_str = NULL;
+ else if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH)
+ column_str = NULL;
+ else
+ {
+ int len = strlen(rel_loc_info->partAttrName);
+
+ column_str = (char *) palloc(len + 1);
+ strncpy(column_str, rel_loc_info->partAttrName, len + 1);
+ }
+
+ return column_str;
+}
+
+/*
+ * IsHashColumn - return whether or not column for relation is hashed.
+ *
*/
bool
-IsDistribColumn(Oid relid, AttrNumber attNum)
+IsHashColumn(RelationLocInfo *rel_loc_info, char *part_col_name)
{
- RelationLocInfo *locInfo = GetRelationLocInfo(relid);
+ bool ret_value = false;
- /* No locator info, so leave */
- if (!locInfo)
- return false;
+ if (!rel_loc_info || !part_col_name)
+ ret_value = false;
+ else if (rel_loc_info->locatorType != LOCATOR_TYPE_HASH)
+ ret_value = false;
+ else
+ ret_value = !strcmp(part_col_name, rel_loc_info->partAttrName);
- /* No distribution column if relation is not distributed with a key */
- if (!IsRelationDistributedByValue(locInfo))
- return false;
+ return ret_value;
+}
- /* Finally check if attribute is distributed */
- return locInfo->partAttrNum == attNum;
+
+/*
+ * IsHashColumnForRelId - return whether or not column for relation is hashed.
+ *
+ */
+bool
+IsHashColumnForRelId(Oid relid, char *part_col_name)
+{
+ RelationLocInfo *rel_loc_info = GetRelationLocInfo(relid);
+
+ return IsHashColumn(rel_loc_info, part_col_name);
+}
+
+/*
+ * IsDistColumnForRelId - return whether or not column for relation is used for hash or modulo distribution
+ *
+ */
+bool
+IsDistColumnForRelId(Oid relid, char *part_col_name)
+{
+	bool		bRet;
+	RelationLocInfo *rel_loc_info;
+
+	rel_loc_info = GetRelationLocInfo(relid);
+
+	/* Check hash distribution first, then fall back to modulo */
+	bRet = IsHashColumn(rel_loc_info, part_col_name);
+	if (bRet == false)
+		bRet = IsModuloColumn(rel_loc_info, part_col_name);
+	return bRet;
}
/*
- * IsTypeDistributable
- * Returns whether the data type is distributable using a column value.
+ * Returns whether or not the data type is modulo distributable with PG-XC
+ * PGXCTODO - expand support for other data types!
*/
bool
-IsTypeDistributable(Oid col_type)
+IsTypeModuloDistributable(Oid col_type)
{
+#ifdef XCP
+ return (modulo_value_len(col_type) != -1);
+#else
if(col_type == INT8OID
|| col_type == INT2OID
|| col_type == OIDOID
@@ -299,12 +515,68 @@ IsTypeDistributable(Oid col_type)
return true;
return false;
+#endif
+}
+
+/*
+ * GetRelationModuloColumn - return modulo column for relation.
+ *
+ * Returns NULL if the relation is not modulo partitioned.
+ */
+char *
+GetRelationModuloColumn(RelationLocInfo * rel_loc_info)
+{
+ char *column_str = NULL;
+
+ if (rel_loc_info == NULL)
+ column_str = NULL;
+ else if (rel_loc_info->locatorType != LOCATOR_TYPE_MODULO)
+ column_str = NULL;
+ else
+ {
+ int len = strlen(rel_loc_info->partAttrName);
+
+ column_str = (char *) palloc(len + 1);
+ strncpy(column_str, rel_loc_info->partAttrName, len + 1);
+ }
+
+ return column_str;
+}
+
+/*
+ * IsModuloColumn - return whether or not column for relation is used for modulo distribution.
+ *
+ */
+bool
+IsModuloColumn(RelationLocInfo *rel_loc_info, char *part_col_name)
+{
+ bool ret_value = false;
+
+ if (!rel_loc_info || !part_col_name)
+ ret_value = false;
+ else if (rel_loc_info->locatorType != LOCATOR_TYPE_MODULO)
+ ret_value = false;
+ else
+ ret_value = !strcmp(part_col_name, rel_loc_info->partAttrName);
+
+ return ret_value;
}
/*
- * GetRoundRobinNode
- * Update the round robin node for the relation.
+ * IsModuloColumnForRelId - return whether or not column for relation is used for modulo distribution.
+ */
+bool
+IsModuloColumnForRelId(Oid relid, char *part_col_name)
+{
+ RelationLocInfo *rel_loc_info = GetRelationLocInfo(relid);
+
+ return IsModuloColumn(rel_loc_info, part_col_name);
+}
+
+/*
+ * Update the round robin node for the relation
+ *
* PGXCTODO - may not want to bother with locking here, we could track
* these in the session memory context instead...
*/
@@ -314,8 +586,13 @@ GetRoundRobinNode(Oid relid)
int ret_node;
Relation rel = relation_open(relid, AccessShareLock);
- Assert (rel->rd_locator_info->locatorType == LOCATOR_TYPE_REPLICATED ||
+#ifdef XCP
+ Assert (IsLocatorReplicated(rel->rd_locator_info->locatorType) ||
rel->rd_locator_info->locatorType == LOCATOR_TYPE_RROBIN);
+#else
+ Assert (rel->rd_locator_info->locatorType == LOCATOR_TYPE_REPLICATED ||
+ rel->rd_locator_info->locatorType == LOCATOR_TYPE_RROBIN);
+#endif
ret_node = lfirst_int(rel->rd_locator_info->roundRobinNode);
@@ -333,6 +610,7 @@ GetRoundRobinNode(Oid relid)
/*
* IsTableDistOnPrimary
+ *
* Does the table distribution list include the primary node?
*/
bool
@@ -342,13 +620,19 @@ IsTableDistOnPrimary(RelationLocInfo *rel_loc_info)
if (!OidIsValid(primary_data_node) ||
rel_loc_info == NULL ||
- list_length(rel_loc_info->nodeList) == 0)
+		list_length(rel_loc_info->nodeList) == 0)
return false;
foreach(item, rel_loc_info->nodeList)
{
+#ifdef XCP
+ char ntype = PGXC_NODE_DATANODE;
+ if (PGXCNodeGetNodeId(primary_data_node, &ntype) == lfirst_int(item))
+ return true;
+#else
if (PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE) == lfirst_int(item))
return true;
+#endif
}
return false;
}
@@ -359,25 +643,24 @@ IsTableDistOnPrimary(RelationLocInfo *rel_loc_info)
* Check equality of given locator information
*/
bool
-IsLocatorInfoEqual(RelationLocInfo *locInfo1,
- RelationLocInfo *locInfo2)
+IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info2)
{
List *nodeList1, *nodeList2;
- Assert(locInfo1 && locInfo2);
+ Assert(rel_loc_info1 && rel_loc_info2);
- nodeList1 = locInfo1->nodeList;
- nodeList2 = locInfo2->nodeList;
+ nodeList1 = rel_loc_info1->nodeList;
+ nodeList2 = rel_loc_info2->nodeList;
/* Same relation? */
- if (locInfo1->relid != locInfo2->relid)
+ if (rel_loc_info1->relid != rel_loc_info2->relid)
return false;
/* Same locator type? */
- if (locInfo1->locatorType != locInfo2->locatorType)
+ if (rel_loc_info1->locatorType != rel_loc_info2->locatorType)
return false;
/* Same attribute number? */
- if (locInfo1->partAttrNum != locInfo2->partAttrNum)
+ if (rel_loc_info1->partAttrNum != rel_loc_info2->partAttrNum)
return false;
/* Same node list? */
@@ -390,6 +673,7 @@ IsLocatorInfoEqual(RelationLocInfo *locInfo1,
}
+#ifndef XCP
/*
* GetRelationNodes
*
@@ -417,30 +701,22 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol,
long hashValue;
int modulo;
int nodeIndex;
+ int k;
if (rel_loc_info == NULL)
return NULL;
exec_nodes = makeNode(ExecNodes);
exec_nodes->baselocatortype = rel_loc_info->locatorType;
- exec_nodes->accesstype = accessType;
switch (rel_loc_info->locatorType)
{
case LOCATOR_TYPE_REPLICATED:
- /*
- * When intention is to read from replicated table, return all the
- * nodes so that planner can choose one depending upon the rest of
- * the JOIN tree. But while reading with update lock, we need to
- * read from the primary node (if exists) so as to avoid the
- * deadlock.
- * For write access set primary node (if exists).
- */
- exec_nodes->nodeList = list_copy(rel_loc_info->nodeList);
if (accessType == RELATION_ACCESS_UPDATE || accessType == RELATION_ACCESS_INSERT)
{
/* we need to write to all synchronously */
+ exec_nodes->nodeList = list_concat(exec_nodes->nodeList, rel_loc_info->nodeList);
/*
* Write to primary node first, to reduce chance of a deadlock
@@ -450,22 +726,57 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol,
&& exec_nodes->nodeList
&& list_length(exec_nodes->nodeList) > 1) /* make sure more than 1 */
{
- exec_nodes->primarynodelist = list_make1_int(PGXCNodeGetNodeId(primary_data_node,
- PGXC_NODE_DATANODE));
- exec_nodes->nodeList = list_delete_int(exec_nodes->nodeList,
- PGXCNodeGetNodeId(primary_data_node,
- PGXC_NODE_DATANODE));
+ exec_nodes->primarynodelist = lappend_int(NULL,
+ PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE));
+				exec_nodes->nodeList = list_delete_int(exec_nodes->nodeList,
+ PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE));
}
}
- else if (accessType == RELATION_ACCESS_READ_FOR_UPDATE &&
- IsTableDistOnPrimary(rel_loc_info))
+ else
{
/*
- * We should ensure row is locked on the primary node to
- * avoid distributed deadlock if updating the same row
- * concurrently
+ * In case there are nodes defined in location info, initialize node list
+ * with a default node being the first node in list.
+ * This node list may be changed if a better one is found afterwards.
*/
- exec_nodes->nodeList = list_make1_int(PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE));
+ if (rel_loc_info->nodeList)
+ exec_nodes->nodeList = lappend_int(NULL,
+ linitial_int(rel_loc_info->nodeList));
+
+ if (accessType == RELATION_ACCESS_READ_FOR_UPDATE &&
+ IsTableDistOnPrimary(rel_loc_info))
+ {
+ /*
+ * We should ensure row is locked on the primary node to
+ * avoid distributed deadlock if updating the same row
+ * concurrently
+ */
+ exec_nodes->nodeList = lappend_int(NULL,
+ PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE));
+ }
+ else if (num_preferred_data_nodes > 0)
+ {
+ ListCell *item;
+
+ foreach(item, rel_loc_info->nodeList)
+ {
+ for (k = 0; k < num_preferred_data_nodes; k++)
+ {
+ if (PGXCNodeGetNodeId(preferred_data_node[k],
+ PGXC_NODE_DATANODE) == lfirst_int(item))
+ {
+ exec_nodes->nodeList = lappend_int(NULL,
+ lfirst_int(item));
+ break;
+ }
+ }
+ }
+ }
+
+ /* If nothing found just read from one of them. Use round robin mechanism */
+ if (exec_nodes->nodeList == NULL)
+ exec_nodes->nodeList = lappend_int(NULL,
+ GetRoundRobinNode(rel_loc_info->relid));
}
break;
@@ -477,27 +788,37 @@ GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol,
rel_loc_info->locatorType);
modulo = compute_modulo(abs(hashValue), list_length(rel_loc_info->nodeList));
nodeIndex = get_node_from_modulo(modulo, rel_loc_info->nodeList);
- exec_nodes->nodeList = list_make1_int(nodeIndex);
+ exec_nodes->nodeList = lappend_int(NULL, nodeIndex);
}
else
{
if (accessType == RELATION_ACCESS_INSERT)
/* Insert NULL to first node*/
- exec_nodes->nodeList = list_make1_int(linitial_int(rel_loc_info->nodeList));
+ exec_nodes->nodeList = lappend_int(NULL, linitial_int(rel_loc_info->nodeList));
else
- exec_nodes->nodeList = list_copy(rel_loc_info->nodeList);
+ exec_nodes->nodeList = list_concat(exec_nodes->nodeList, rel_loc_info->nodeList);
}
break;
+ case LOCATOR_TYPE_SINGLE:
+ /* just return first (there should only be one) */
+ exec_nodes->nodeList = list_concat(exec_nodes->nodeList,
+ rel_loc_info->nodeList);
+ break;
+
case LOCATOR_TYPE_RROBIN:
- /*
- * round robin, get next one in case of insert. If not insert, all
- * node needed
- */
+ /* round robin, get next one */
if (accessType == RELATION_ACCESS_INSERT)
- exec_nodes->nodeList = list_make1_int(GetRoundRobinNode(rel_loc_info->relid));
+ {
+ /* write to just one of them */
+ exec_nodes->nodeList = lappend_int(NULL, GetRoundRobinNode(rel_loc_info->relid));
+ }
else
- exec_nodes->nodeList = list_copy(rel_loc_info->nodeList);
+ {
+ /* we need to read from all */
+ exec_nodes->nodeList = list_concat(exec_nodes->nodeList,
+ rel_loc_info->nodeList);
+ }
break;
/* PGXCTODO case LOCATOR_TYPE_RANGE: */
@@ -534,7 +855,7 @@ GetRelationNodesByQuals(Oid reloid, Index varno, Node *quals,
* If the table distributed by value, check if we can reduce the Datanodes
* by looking at the qualifiers for this relation
*/
- if (IsRelationDistributedByValue(rel_loc_info))
+ if (IsLocatorDistributedByValue(rel_loc_info->locatorType))
{
Oid disttype = get_atttype(reloid, rel_loc_info->partAttrNum);
int32 disttypmod = get_atttypmod(reloid, rel_loc_info->partAttrNum);
@@ -584,26 +905,62 @@ GetRelationNodesByQuals(Oid reloid, Index varno, Node *quals,
relaccess);
return exec_nodes;
}
+#endif
/*
- * GetLocatorType
- * Returns the locator type of the table.
+ * ConvertToLocatorType
+ * get locator distribution type
+ * We really should just have pgxc_class use disttype instead...
+ */
+char
+ConvertToLocatorType(int disttype)
+{
+ char loctype = LOCATOR_TYPE_NONE;
+
+ switch (disttype)
+ {
+ case DISTTYPE_HASH:
+ loctype = LOCATOR_TYPE_HASH;
+ break;
+ case DISTTYPE_ROUNDROBIN:
+ loctype = LOCATOR_TYPE_RROBIN;
+ break;
+ case DISTTYPE_REPLICATION:
+ loctype = LOCATOR_TYPE_REPLICATED;
+ break;
+ case DISTTYPE_MODULO:
+ loctype = LOCATOR_TYPE_MODULO;
+ break;
+ default:
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("Invalid distribution type")));
+ break;
+ }
+
+ return loctype;
+}
+
+
+/*
+ * GetLocatorType - Returns the locator type of the table
+ *
*/
char
GetLocatorType(Oid relid)
{
- char ret = LOCATOR_TYPE_NONE;
- RelationLocInfo *locInfo = GetRelationLocInfo(relid);
+ char ret = '\0';
- if (locInfo != NULL)
- ret = locInfo->locatorType;
+ RelationLocInfo *ret_loc_info = GetRelationLocInfo(relid);
+
+ if (ret_loc_info != NULL)
+ ret = ret_loc_info->locatorType;
return ret;
}
/*
- * GetAllDataNodes
* Return a list of all Datanodes.
* We assume all tables use all nodes in the prototype, so just return a list
* from first one.
@@ -621,7 +978,6 @@ GetAllDataNodes(void)
}
/*
- * GetAllCoordNodes
* Return a list of all Coordinators
* This is used to send DDL to all nodes and to clean up pooler connections.
* Do not put in the list the local Coordinator where this function is launched.
@@ -648,7 +1004,6 @@ GetAllCoordNodes(void)
/*
- * RelationBuildLocator
* Build locator information associated with the specified relation.
*/
void
@@ -693,12 +1048,24 @@ RelationBuildLocator(Relation rel)
relationLocInfo->locatorType = pgxc_class->pclocatortype;
relationLocInfo->partAttrNum = pgxc_class->pcattnum;
+
+ relationLocInfo->partAttrName = get_attname(relationLocInfo->relid, pgxc_class->pcattnum);
+
relationLocInfo->nodeList = NIL;
+#ifdef XCP
+ for (j = 0; j < pgxc_class->nodeoids.dim1; j++)
+ {
+ char ntype = PGXC_NODE_DATANODE;
+ int nid = PGXCNodeGetNodeId(pgxc_class->nodeoids.values[j], &ntype);
+ relationLocInfo->nodeList = lappend_int(relationLocInfo->nodeList, nid);
+ }
+#else
for (j = 0; j < pgxc_class->nodeoids.dim1; j++)
relationLocInfo->nodeList = lappend_int(relationLocInfo->nodeList,
PGXCNodeGetNodeId(pgxc_class->nodeoids.values[j],
PGXC_NODE_DATANODE));
+#endif
/*
* If the locator type is round robin, we set a node to
@@ -706,7 +1073,11 @@ RelationBuildLocator(Relation rel)
* we choose a node to use for balancing reads.
*/
if (relationLocInfo->locatorType == LOCATOR_TYPE_RROBIN
+#ifdef XCP
+ || IsLocatorReplicated(relationLocInfo->locatorType))
+#else
|| relationLocInfo->locatorType == LOCATOR_TYPE_REPLICATED)
+#endif
{
int offset;
/*
@@ -728,8 +1099,7 @@ RelationBuildLocator(Relation rel)
}
/*
- * GetLocatorRelationInfo
- * Returns the locator information for relation,
+ * GetLocatorRelationInfo - Returns the locator information for relation,
* in a copy of the RelationLocatorInfo struct in relcache
*/
RelationLocInfo *
@@ -750,43 +1120,61 @@ GetRelationLocInfo(Oid relid)
}
/*
- * CopyRelationLocInfo
+ * Get the distribution type of relation.
+ */
+char
+GetRelationLocType(Oid relid)
+{
+ RelationLocInfo *locinfo = GetRelationLocInfo(relid);
+ if (!locinfo)
+ return LOCATOR_TYPE_NONE;
+
+ return locinfo->locatorType;
+}
+
+/*
* Copy the RelationLocInfo struct
*/
RelationLocInfo *
-CopyRelationLocInfo(RelationLocInfo *srcInfo)
+CopyRelationLocInfo(RelationLocInfo * src_info)
{
- RelationLocInfo *destInfo;
+ RelationLocInfo *dest_info;
+
+ Assert(src_info);
- Assert(srcInfo);
- destInfo = (RelationLocInfo *) palloc0(sizeof(RelationLocInfo));
+ dest_info = (RelationLocInfo *) palloc0(sizeof(RelationLocInfo));
- destInfo->relid = srcInfo->relid;
- destInfo->locatorType = srcInfo->locatorType;
- destInfo->partAttrNum = srcInfo->partAttrNum;
- if (srcInfo->nodeList)
- destInfo->nodeList = list_copy(srcInfo->nodeList);
+ dest_info->relid = src_info->relid;
+ dest_info->locatorType = src_info->locatorType;
+ dest_info->partAttrNum = src_info->partAttrNum;
+ if (src_info->partAttrName)
+ dest_info->partAttrName = pstrdup(src_info->partAttrName);
- /* Note: for roundrobin, we use the relcache entry */
- return destInfo;
+ if (src_info->nodeList)
+ dest_info->nodeList = list_copy(src_info->nodeList);
+ /* Note, for round robin, we use the relcache entry */
+
+ return dest_info;
}
/*
- * FreeRelationLocInfo
* Free RelationLocInfo struct
*/
void
FreeRelationLocInfo(RelationLocInfo *relationLocInfo)
{
if (relationLocInfo)
+ {
+ if (relationLocInfo->partAttrName)
+ pfree(relationLocInfo->partAttrName);
pfree(relationLocInfo);
+ }
}
+
/*
- * FreeExecNodes
- * Free the contents of the ExecNodes expression
- */
+ * Free the contents of the ExecNodes expression */
void
FreeExecNodes(ExecNodes **exec_nodes)
{
@@ -801,6 +1189,699 @@ FreeExecNodes(ExecNodes **exec_nodes)
*exec_nodes = NULL;
}
+
+#ifdef XCP
+/*
+ * Determine value length in bytes for specified type for a modulo locator.
+ * Return -1 if modulo locator is not supported for the type.
+ */
+static int
+modulo_value_len(Oid dataType)
+{
+ switch (dataType)
+ {
+ case BOOLOID:
+ case CHAROID:
+ return 1;
+ case INT2OID:
+ return 2;
+ case INT4OID:
+ case ABSTIMEOID:
+ case RELTIMEOID:
+ case DATEOID:
+ return 4;
+ default:
+ return -1;
+ }
+}
+
+
+static LocatorHashFunc
+hash_func_ptr(Oid dataType)
+{
+ switch (dataType)
+ {
+ case INT8OID:
+ case CASHOID:
+ return hashint8;
+ case INT2OID:
+ return hashint2;
+ case OIDOID:
+ return hashoid;
+ case INT4OID:
+ case ABSTIMEOID:
+ case RELTIMEOID:
+ case DATEOID:
+ return hashint4;
+ case BOOLOID:
+ case CHAROID:
+ return hashchar;
+ case NAMEOID:
+ return hashname;
+ case INT2VECTOROID:
+ return hashint2vector;
+ case VARCHAROID:
+ case TEXTOID:
+ return hashtext;
+ case OIDVECTOROID:
+ return hashoidvector;
+ case BPCHAROID:
+ return hashbpchar;
+ case BYTEAOID:
+ return hashvarlena;
+ case TIMEOID:
+ return time_hash;
+ case TIMESTAMPOID:
+ case TIMESTAMPTZOID:
+ return timestamp_hash;
+ case INTERVALOID:
+ return interval_hash;
+ case TIMETZOID:
+ return timetz_hash;
+ case NUMERICOID:
+ return hash_numeric;
+ case UUIDOID:
+ return uuid_hash;
+ default:
+ return NULL;
+ }
+}
+
+
+Locator *
+createLocator(char locatorType, RelationAccessType accessType,
+ Oid dataType, LocatorListType listType, int nodeCount,
+ void *nodeList, void **result, bool primary)
+{
+ Locator *locator;
+ ListCell *lc;
+ void *nodeMap;
+ int i;
+
+ locator = (Locator *) palloc(sizeof(Locator));
+ locator->dataType = dataType;
+ locator->listType = listType;
+ locator->nodeCount = nodeCount;
+ /* Create node map */
+ switch (listType)
+ {
+ case LOCATOR_LIST_NONE:
+ /* No map, return indexes */
+ nodeMap = NULL;
+ break;
+ case LOCATOR_LIST_INT:
+ /* Copy integer array */
+ nodeMap = palloc(nodeCount * sizeof(int));
+ memcpy(nodeMap, nodeList, nodeCount * sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ /* Copy array of Oids */
+ nodeMap = palloc(nodeCount * sizeof(Oid));
+ memcpy(nodeMap, nodeList, nodeCount * sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ /* Copy array of pointers */
+ nodeMap = palloc(nodeCount * sizeof(void *));
+ memcpy(nodeMap, nodeList, nodeCount * sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Create map from list */
+ {
+ List *l = (List *) nodeList;
+ locator->nodeCount = list_length(l);
+ if (IsA(l, IntList))
+ {
+ int *intptr;
+ nodeMap = palloc(locator->nodeCount * sizeof(int));
+ intptr = (int *) nodeMap;
+ foreach(lc, l)
+ *intptr++ = lfirst_int(lc);
+ locator->listType = LOCATOR_LIST_INT;
+ }
+ else if (IsA(l, OidList))
+ {
+ Oid *oidptr;
+ nodeMap = palloc(locator->nodeCount * sizeof(Oid));
+ oidptr = (Oid *) nodeMap;
+ foreach(lc, l)
+ *oidptr++ = lfirst_oid(lc);
+ locator->listType = LOCATOR_LIST_OID;
+ }
+ else if (IsA(l, List))
+ {
+ void **voidptr;
+ nodeMap = palloc(locator->nodeCount * sizeof(void *));
+ voidptr = (void **) nodeMap;
+ foreach(lc, l)
+ *voidptr++ = lfirst(lc);
+ locator->listType = LOCATOR_LIST_POINTER;
+ }
+ else
+ {
+ /* cannot get here */
+ Assert(false);
+ }
+ break;
+ }
+ }
+ /*
+ * Determine locatefunc, allocate results, set up parameters
+ * specific to locator type
+ */
+ switch (locatorType)
+ {
+ case LOCATOR_TYPE_REPLICATED:
+ if (accessType == RELATION_ACCESS_INSERT ||
+ accessType == RELATION_ACCESS_UPDATE)
+ {
+ locator->locatefunc = locate_static;
+ if (nodeMap == NULL)
+ {
+ /* no map, prepare array with indexes */
+ int *intptr;
+ nodeMap = palloc(locator->nodeCount * sizeof(int));
+ intptr = (int *) nodeMap;
+ for (i = 0; i < locator->nodeCount; i++)
+ *intptr++ = i;
+ }
+ locator->nodeMap = nodeMap;
+ locator->results = nodeMap;
+ }
+ else
+ {
+ locator->locatefunc = locate_roundrobin;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ locator->roundRobinNode = -1;
+ }
+ break;
+ case LOCATOR_TYPE_RROBIN:
+ if (accessType == RELATION_ACCESS_INSERT)
+ {
+ locator->locatefunc = locate_roundrobin;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ locator->roundRobinNode = -1;
+ }
+ else
+ {
+ locator->locatefunc = locate_static;
+ if (nodeMap == NULL)
+ {
+ /* no map, prepare array with indexes */
+ int *intptr;
+ nodeMap = palloc(locator->nodeCount * sizeof(int));
+ intptr = (int *) nodeMap;
+ for (i = 0; i < locator->nodeCount; i++)
+ *intptr++ = i;
+ }
+ locator->nodeMap = nodeMap;
+ locator->results = nodeMap;
+ }
+ break;
+ case LOCATOR_TYPE_HASH:
+ if (accessType == RELATION_ACCESS_INSERT)
+ {
+ locator->locatefunc = locate_hash_insert;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ }
+ else
+ {
+ locator->locatefunc = locate_hash_select;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(locator->nodeCount * sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(locator->nodeCount * sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(locator->nodeCount * sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ }
+
+ locator->hashfunc = hash_func_ptr(dataType);
+ if (locator->hashfunc == NULL)
+ ereport(ERROR, (errmsg("Error: unsupported data type for HASH locator: %d\n",
+ dataType)));
+ break;
+ case LOCATOR_TYPE_MODULO:
+ if (accessType == RELATION_ACCESS_INSERT)
+ {
+ locator->locatefunc = locate_modulo_insert;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ }
+ else
+ {
+ locator->locatefunc = locate_modulo_select;
+ locator->nodeMap = nodeMap;
+ switch (locator->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ case LOCATOR_LIST_INT:
+ locator->results = palloc(locator->nodeCount * sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ locator->results = palloc(locator->nodeCount * sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ locator->results = palloc(locator->nodeCount * sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ }
+
+ locator->valuelen = modulo_value_len(dataType);
+ if (locator->valuelen == -1)
+ ereport(ERROR, (errmsg("Error: unsupported data type for MODULO locator: %d\n",
+ dataType)));
+ break;
+ default:
+ ereport(ERROR, (errmsg("Error: no such supported locator type: %c\n",
+ locatorType)));
+ }
+
+ if (result)
+ *result = locator->results;
+
+ return locator;
+}
+
+
+void
+freeLocator(Locator *locator)
+{
+ pfree(locator->nodeMap);
+ /*
+ * locator->nodeMap and locator->results may point to the same memory,
+ * do not free it twice
+ */
+ if (locator->results != locator->nodeMap)
+ pfree(locator->results);
+ pfree(locator);
+}
+
+
+/*
+ * Each time return the same predefined results
+ */
+static int
+locate_static(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ /* TODO */
+ if (hasprimary)
+ *hasprimary = false;
+ return self->nodeCount;
+}
+
+
+/*
+ * Each time return one next node, in round robin manner
+ */
+static int
+locate_roundrobin(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ /* TODO */
+ if (hasprimary)
+ *hasprimary = false;
+ if (++self->roundRobinNode >= self->nodeCount)
+ self->roundRobinNode = 0;
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = self->roundRobinNode;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] =
+ ((int *) self->nodeMap)[self->roundRobinNode];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] =
+ ((Oid *) self->nodeMap)[self->roundRobinNode];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] =
+ ((void **) self->nodeMap)[self->roundRobinNode];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+}
+
+
+/*
+ * Calculate hash from supplied value and use modulo by nodeCount as an index
+ */
+static int
+locate_hash_insert(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ int index;
+ if (hasprimary)
+ *hasprimary = false;
+ if (isnull)
+ index = 0;
+ else
+ {
+ unsigned int hash32;
+
+ hash32 = (unsigned int) DatumGetInt32(DirectFunctionCall1(self->hashfunc, value));
+
+ index = compute_modulo(hash32, self->nodeCount);
+ }
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = index;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+}
+
+
+/*
+ * Calculate hash from supplied value and use modulo by nodeCount as an index
+ * if value is NULL assume no hint and return all the nodes.
+ */
+static int
+locate_hash_select(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ if (hasprimary)
+ *hasprimary = false;
+ if (isnull)
+ {
+ int i;
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ for (i = 0; i < self->nodeCount; i++)
+ ((int *) self->results)[i] = i;
+ break;
+ case LOCATOR_LIST_INT:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return self->nodeCount;
+ }
+ else
+ {
+ unsigned int hash32;
+ int index;
+
+ hash32 = (unsigned int) DatumGetInt32(DirectFunctionCall1(self->hashfunc, value));
+
+ index = compute_modulo(hash32, self->nodeCount);
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = index;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+ }
+}
+
+
+/*
+ * Use modulo of supplied value by nodeCount as an index
+ */
+static int
+locate_modulo_insert(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ int index;
+ if (hasprimary)
+ *hasprimary = false;
+ if (isnull)
+ index = 0;
+ else
+ {
+ unsigned int mod32;
+
+ if (self->valuelen == 4)
+ mod32 = (unsigned int) (GET_4_BYTES(value));
+ else if (self->valuelen == 2)
+ mod32 = (unsigned int) (GET_2_BYTES(value));
+ else if (self->valuelen == 1)
+ mod32 = (unsigned int) (GET_1_BYTE(value));
+ else
+ mod32 = 0;
+
+ index = compute_modulo(mod32, self->nodeCount);
+ }
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = index;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+}
+
+
+/*
+ * Use modulo of supplied value by nodeCount as an index
+ * if value is NULL assume no hint and return all the nodes.
+ */
+static int
+locate_modulo_select(Locator *self, Datum value, bool isnull,
+ bool *hasprimary)
+{
+ if (hasprimary)
+ *hasprimary = false;
+ if (isnull)
+ {
+ int i;
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ for (i = 0; i < self->nodeCount; i++)
+ ((int *) self->results)[i] = i;
+ break;
+ case LOCATOR_LIST_INT:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(int));
+ break;
+ case LOCATOR_LIST_OID:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(Oid));
+ break;
+ case LOCATOR_LIST_POINTER:
+ memcpy(self->results, self->nodeMap,
+ self->nodeCount * sizeof(void *));
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return self->nodeCount;
+ }
+ else
+ {
+ unsigned int mod32;
+ int index;
+
+ if (self->valuelen == 4)
+ mod32 = (unsigned int) (GET_4_BYTES(value));
+ else if (self->valuelen == 2)
+ mod32 = (unsigned int) (GET_2_BYTES(value));
+ else if (self->valuelen == 1)
+ mod32 = (unsigned int) (GET_1_BYTE(value));
+ else
+ mod32 = 0;
+
+ index = compute_modulo(mod32, self->nodeCount);
+
+ switch (self->listType)
+ {
+ case LOCATOR_LIST_NONE:
+ ((int *) self->results)[0] = index;
+ break;
+ case LOCATOR_LIST_INT:
+ ((int *) self->results)[0] = ((int *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_OID:
+ ((Oid *) self->results)[0] = ((Oid *) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_POINTER:
+ ((void **) self->results)[0] = ((void **) self->nodeMap)[index];
+ break;
+ case LOCATOR_LIST_LIST:
+ /* Should never happen */
+ Assert(false);
+ break;
+ }
+ return 1;
+ }
+}
+
+
+int
+GET_NODES(Locator *self, Datum value, bool isnull, bool *hasprimary)
+{
+ return (*self->locatefunc) (self, value, isnull, hasprimary);
+}
+
+
+void *
+getLocatorResults(Locator *self)
+{
+ return self->results;
+}
+
+
+void *
+getLocatorNodeMap(Locator *self)
+{
+ return self->nodeMap;
+}
+
+
+int
+getLocatorNodeCount(Locator *self)
+{
+ return self->nodeCount;
+}
+#endif
+
+
+#ifndef XCP
/*
* pgxc_find_distcol_expr
* Search through the quals provided and find out an expression which will give
@@ -814,23 +1895,12 @@ FreeExecNodes(ExecNodes **exec_nodes)
* this function returns NULL.
*/
static Expr *
-pgxc_find_distcol_expr(Index varno,
- AttrNumber attrNum,
- Node *quals)
+pgxc_find_distcol_expr(Index varno, PartAttrNumber partAttrNum,
+ Node *quals)
{
- List *lquals;
+ /* Convert the qualification into list of arguments of AND */
+ List *lquals = make_ands_implicit((Expr *)quals);
ListCell *qual_cell;
-
- /* If no quals, no distribution column expression */
- if (!quals)
- return NULL;
-
- /* Convert the qualification into List if it's not already so */
- if (!IsA(quals, List))
- lquals = make_ands_implicit((Expr *)quals);
- else
- lquals = (List *)quals;
-
/*
* For every ANDed expression, check if that expression is of the form
* <distribution_col> = <expr>. If so return expr.
@@ -888,7 +1958,7 @@ pgxc_find_distcol_expr(Index varno,
* If Var found is not the distribution column of required relation,
* check next qual
*/
- if (var_expr->varno != varno || var_expr->varattno != attrNum)
+ if (var_expr->varno != varno || var_expr->varattno != partAttrNum)
continue;
/*
* If the operator is not an assignment operator, check next
@@ -907,3 +1977,4 @@ pgxc_find_distcol_expr(Index varno,
/* Exhausted all quals, but no distribution column expression */
return NULL;
}
+#endif
diff --git a/src/backend/pgxc/locator/redistrib.c b/src/backend/pgxc/locator/redistrib.c
index c99bfe822e..98a6f6e355 100644
--- a/src/backend/pgxc/locator/redistrib.c
+++ b/src/backend/pgxc/locator/redistrib.c
@@ -159,8 +159,8 @@ pgxc_redist_build_replicate_to_distrib(RedistribState *distribState,
return;
/* Redistribution is done from replication to distributed (with value) */
- if (!IsRelationReplicated(oldLocInfo) ||
- !IsRelationDistributedByValue(newLocInfo))
+ if (!IsLocatorReplicated(oldLocInfo->locatorType) ||
+ !IsLocatorDistributedByValue(newLocInfo->locatorType))
return;
/* Get the list of nodes that are added to the relation */
@@ -243,8 +243,8 @@ pgxc_redist_build_replicate(RedistribState *distribState,
return;
/* Case of a replicated table whose set of nodes is changed */
- if (!IsRelationReplicated(newLocInfo) ||
- !IsRelationReplicated(oldLocInfo))
+ if (!IsLocatorReplicated(newLocInfo->locatorType) ||
+ !IsLocatorReplicated(oldLocInfo->locatorType))
return;
/* Get the list of nodes that are added to the relation */
@@ -410,6 +410,18 @@ distrib_copy_to(RedistribState *distribState)
get_namespace_name(RelationGetNamespace(rel)),
RelationGetRelationName(rel))));
+#ifdef XCP
+ /* Begin the COPY process */
+ DataNodeCopyBegin(copyState);
+
+ /* Create tuplestore storage */
+ store = tuplestore_begin_message(false, work_mem);
+
+ /* Then get rows and copy them to the tuplestore used for redistribution */
+ DataNodeCopyStore(
+ (PGXCNodeHandle **) getLocatorNodeMap(copyState->locator),
+ getLocatorNodeCount(copyState->locator), store);
+#else
/* Begin the COPY process */
copyState->connections = DataNodeCopyBegin(copyState->query_buf.data,
copyState->exec_nodes->nodeList,
@@ -425,6 +437,7 @@ distrib_copy_to(RedistribState *distribState)
NULL,
store, /* Tuplestore used for redistribution */
REMOTE_COPY_TUPLESTORE);
+#endif
/* Do necessary clean-up */
FreeRemoteCopyOptions(options);
@@ -450,8 +463,17 @@ distrib_copy_from(RedistribState *distribState, ExecNodes *exec_nodes)
Relation rel;
RemoteCopyOptions *options;
RemoteCopyData *copyState;
+#ifndef XCP
bool replicated, contains_tuple = true;
+#endif
TupleDesc tupdesc;
+#ifdef XCP
+ /* May be needed to decode partitioning value */
+ int partIdx;
+ FmgrInfo in_function;
+ Oid typioparam;
+ int typmod;
+#endif
/* Nothing to do if on remote node */
if (IS_PGXC_DATANODE || IsConnFromCoord())
@@ -472,6 +494,14 @@ distrib_copy_from(RedistribState *distribState, ExecNodes *exec_nodes)
RemoteCopy_GetRelationLoc(copyState, rel, NIL);
RemoteCopy_BuildStatement(copyState, rel, options, NIL, NIL);
+#ifdef XCP
+ /* Modify relation location as requested */
+ if (exec_nodes)
+ {
+ if (exec_nodes->nodeList)
+ copyState->rel_loc->nodeList = exec_nodes->nodeList;
+ }
+#else
/*
* When building COPY FROM command in redistribution list,
* use the list of nodes that has been calculated there.
@@ -482,8 +512,37 @@ distrib_copy_from(RedistribState *distribState, ExecNodes *exec_nodes)
copyState->exec_nodes->nodeList = exec_nodes->nodeList;
copyState->rel_loc->nodeList = exec_nodes->nodeList;
}
+#endif
tupdesc = RelationGetDescr(rel);
+#ifdef XCP
+ if (AttributeNumberIsValid(copyState->rel_loc->partAttrNum))
+ {
+ Oid in_func_oid;
+ int dropped = 0;
+ int i;
+
+ partIdx = copyState->rel_loc->partAttrNum - 1;
+
+ /* prepare function to decode partitioning value */
+ getTypeInputInfo(copyState->dist_type,
+ &in_func_oid, &typioparam);
+ fmgr_info(in_func_oid, &in_function);
+ typmod = tupdesc->attrs[partIdx]->atttypmod;
+
+ /*
+ * Make partIdx pointing to correct field of the datarow.
+ * The data row does not contain data of dropped attributes, we should
+ * decrement partIdx appropriately
+ */
+ for (i = 0; i < partIdx; i++)
+ {
+ if (tupdesc->attrs[i]->attisdropped)
+ dropped++;
+ }
+ partIdx -= dropped;
+ }
+#endif
/* Inform client of operation being done */
ereport(DEBUG1,
@@ -491,6 +550,55 @@ distrib_copy_from(RedistribState *distribState, ExecNodes *exec_nodes)
get_namespace_name(RelationGetNamespace(rel)),
RelationGetRelationName(rel))));
+#ifdef XCP
+ DataNodeCopyBegin(copyState);
+
+ /* Send each COPY message stored to remote nodes */
+ while (true)
+ {
+ char *data;
+ int len;
+ Datum value = (Datum) 0;
+ bool is_null = true;
+
+ /* Get message from the tuplestore */
+ data = tuplestore_getmessage(store, &len);
+ if (!data)
+ break;
+
+ /* Find value of distribution column if necessary */
+ if (AttributeNumberIsValid(copyState->rel_loc->partAttrNum))
+ {
+ char **fields;
+
+ /*
+ * Split message on an array of fields.
+ * Last \n is not included in converted message.
+ */
+ fields = CopyOps_RawDataToArrayField(tupdesc, data, len - 1);
+
+ /* Determine partitioning value */
+ if (fields[partIdx])
+ {
+ value = InputFunctionCall(&in_function, fields[partIdx],
+ typioparam, typmod);
+ is_null = false;
+ }
+ }
+
+ if (DataNodeCopyIn(data, len,
+ GET_NODES(copyState->locator, value, is_null, NULL),
+ (PGXCNodeHandle**) getLocatorResults(copyState->locator)))
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_EXCEPTION),
+ errmsg("Copy failed on a data node")));
+
+ /* Clean up */
+ pfree(data);
+ }
+ DataNodeCopyFinish(getLocatorNodeCount(copyState->locator),
+ (PGXCNodeHandle **) getLocatorNodeMap(copyState->locator));
+#else
/* Begin redistribution on remote nodes */
copyState->connections = DataNodeCopyBegin(copyState->query_buf.data,
copyState->exec_nodes->nodeList,
@@ -561,6 +669,7 @@ distrib_copy_from(RedistribState *distribState, ExecNodes *exec_nodes)
DataNodeCopyFinish(copyState->connections,
replicated ? PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE) : -1,
replicated ? COMBINE_TYPE_SAME : COMBINE_TYPE_SUM);
+#endif
/* Lock is maintained until transaction commits */
relation_close(rel, NoLock);
@@ -721,8 +830,10 @@ distrib_delete_hash(RedistribState *distribState, ExecNodes *exec_nodes)
hashfuncname = get_compute_hash_function(hashtype, locinfo->locatorType);
/* Get distribution column name */
- if (IsRelationDistributedByValue(locinfo))
- colname = GetRelationDistribColumn(locinfo);
+ if (locinfo->locatorType == LOCATOR_TYPE_HASH)
+ colname = GetRelationHashColumn(locinfo);
+ else if (locinfo->locatorType == LOCATOR_TYPE_MODULO)
+ colname = GetRelationModuloColumn(locinfo);
else
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
@@ -859,7 +970,9 @@ distrib_execute_query(char *sql, bool is_temp, ExecNodes *exec_nodes)
/* Redistribution operations only concern Datanodes */
step->exec_type = EXEC_ON_DATANODES;
+#ifndef XCP
step->is_temp = is_temp;
+#endif
ExecRemoteUtility(step);
pfree(step->sql_statement);
pfree(step);
diff --git a/src/backend/pgxc/nodemgr/nodemgr.c b/src/backend/pgxc/nodemgr/nodemgr.c
index ee5ef63efb..dc77212dfd 100644
--- a/src/backend/pgxc/nodemgr/nodemgr.c
+++ b/src/backend/pgxc/nodemgr/nodemgr.c
@@ -204,6 +204,15 @@ check_node_options(const char *node_name, List *options, char **node_host,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("PGXC node %s: Node type not specified",
node_name)));
+
+#ifdef XCP
+ if (node_type == PGXC_NODE_DATANODE && NumDataNodes >= MaxDataNodes)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("Too many datanodes, current value of max_data_nodes is %d",
+ MaxDataNodes)));
+
+#endif
}
/*
@@ -347,6 +356,9 @@ PgxcNodeListAndCount(void)
heap_endscan(scan);
heap_close(rel, AccessShareLock);
+ elog(DEBUG1, "Done pgxc_nodes scan: %d coordinators and %d datanodes",
+ *shmemNumCoords, *shmemNumDataNodes);
+
/* Finally sort the lists */
if (*shmemNumCoords > 1)
qsort(coDefs, *shmemNumCoords, sizeof(NodeDefinition), cmp_nodes);
@@ -372,6 +384,9 @@ PgxcNodeGetOids(Oid **coOids, Oid **dnOids,
{
LWLockAcquire(NodeTableLock, LW_SHARED);
+ elog(DEBUG1, "Get OIDs from table: %d coordinators and %d datanodes",
+ *shmemNumCoords, *shmemNumDataNodes);
+
if (num_coords)
*num_coords = *shmemNumCoords;
if (num_dns)
@@ -656,6 +671,13 @@ PgxcNodeAlter(AlterNodeStmt *stmt)
node_name)));
/* Check type dependency */
+#ifndef XCP
+ /*
+ * XCP:
+ * Initially node identify itself as a Coordinator and this should be
+ * changed for datanodes. In general, it should be safe to turn
+ * Coordinator to Datanode and back
+ */
if (node_type_old == PGXC_NODE_COORDINATOR &&
node_type == PGXC_NODE_DATANODE)
ereport(ERROR,
@@ -668,6 +690,7 @@ PgxcNodeAlter(AlterNodeStmt *stmt)
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("PGXC node %s: cannot alter Datanode to Coordinator",
node_name)));
+#endif
/* Update values for catalog entry */
MemSet(new_record, 0, sizeof(new_record));
diff --git a/src/backend/pgxc/plan/Makefile b/src/backend/pgxc/plan/Makefile
new file mode 100644
index 0000000000..c322c03656
--- /dev/null
+++ b/src/backend/pgxc/plan/Makefile
@@ -0,0 +1,19 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for the PGXC planner
+#
+# Portions Copyright(C) 2010-2012 Postgres-XC Development Group
+#
+# IDENTIFICATION
+# $PostgreSQL$
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/pgxc/plan
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = planner.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c
new file mode 100644
index 0000000000..d294063f5c
--- /dev/null
+++ b/src/backend/pgxc/plan/planner.c
@@ -0,0 +1,2282 @@
+/*-------------------------------------------------------------------------
+ *
+ * planner.c
+ *
+ * Functions for generating a PGXC style plan.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "miscadmin.h"
+#include "access/transam.h"
+#include "catalog/pg_aggregate.h"
+#include "catalog/pg_class.h"
+#include "catalog/pg_inherits_fn.h"
+#include "catalog/pg_namespace.h"
+#include "catalog/pg_proc.h"
+#include "catalog/pg_type.h"
+#include "catalog/pgxc_node.h"
+#include "commands/prepare.h"
+#include "executor/executor.h"
+#include "lib/stringinfo.h"
+#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
+#include "nodes/nodes.h"
+#include "nodes/parsenodes.h"
+#include "optimizer/clauses.h"
+#include "optimizer/planmain.h"
+#include "optimizer/planner.h"
+#include "optimizer/tlist.h"
+#include "parser/parse_agg.h"
+#include "parser/parse_func.h"
+#include "parser/parse_relation.h"
+#include "parser/parsetree.h"
+#include "parser/parse_oper.h"
+#include "pgxc/execRemote.h"
+#include "pgxc/pgxc.h"
+#include "pgxc/locator.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/planner.h"
+#include "pgxc/postgresql_fdw.h"
+#include "tcop/pquery.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/fmgroids.h"
+#include "utils/lsyscache.h"
+#include "utils/portal.h"
+#include "utils/syscache.h"
+#include "utils/numeric.h"
+#include "utils/memutils.h"
+#include "access/hash.h"
+#include "commands/tablecmds.h"
+#include "utils/timestamp.h"
+#include "utils/date.h"
+
+#ifndef XCP
+/* Forbid unsafe SQL statements */
+bool StrictStatementChecking = true;
+/* fast query shipping is enabled by default */
+bool enable_fast_query_shipping = true;
+
+static RemoteQuery *makeRemoteQuery(void);
+static void validate_part_col_updatable(const Query *query);
+static bool contains_temp_tables(List *rtable);
+static bool contains_only_pg_catalog(List *rtable);
+static void pgxc_handle_unsupported_stmts(Query *query);
+static PlannedStmt *pgxc_FQS_planner(Query *query, int cursorOptions,
+ ParamListInfo boundParams);
+static bool pgxc_query_needs_coord(Query *query);
+static ExecNodes *pgxc_is_query_shippable(Query *query, int query_level);
+static void pgxc_FQS_find_datanodes(Shippability_context *sc_context);
+static ExecNodes *pgxc_merge_exec_nodes(ExecNodes *exec_nodes1,
+ ExecNodes *exec_nodes2,
+ bool merge_dist_equijoin,
+ bool merge_replicated_only);
+static PlannedStmt *pgxc_handle_exec_direct(Query *query, int cursorOptions,
+ ParamListInfo boundParams);
+static RemoteQuery *pgxc_FQS_create_remote_plan(Query *query,
+ ExecNodes *exec_nodes,
+ bool is_exec_direct);
+static void pgxc_set_remote_parameters(PlannedStmt *plan, ParamListInfo boundParams);
+static ExecNodes *pgxc_FQS_get_relation_nodes(RangeTblEntry *rte, Index varno,
+ Query *query);
+static bool pgxc_qual_hash_dist_equijoin(Relids varnos_1, Relids varnos_2,
+ Oid distcol_type, Node *quals,
+ List *rtable);
+static bool VarAttrIsPartAttr(Var *var, List *rtable);
+static void pgxc_set_shippability_reason(Shippability_context *context, ShippabilityStat reason);
+
+/*
+ * make_ctid_col_ref
+ *
+ * creates a Var for a column referring to ctid
+ */
+
+static Var *
+make_ctid_col_ref(Query *qry)
+{
+ ListCell *lc1, *lc2;
+ RangeTblEntry *rte1, *rte2;
+ int tableRTEs, firstTableRTENumber;
+ RangeTblEntry *rte_in_query = NULL;
+ AttrNumber attnum;
+ Oid vartypeid;
+ int32 type_mod;
+ Oid varcollid;
+
+ /*
+ * If the query has more than one table RTE referring to different tables, we cannot add ctid to the query target list
+ * We should in this case skip adding it to the target list and a WHERE CURRENT OF should then
+ * fail saying the query is not a simply updatable scan of a table
+ */
+
+ tableRTEs = 0;
+ foreach(lc1, qry->rtable)
+ {
+ rte1 = (RangeTblEntry *) lfirst(lc1);
+
+ if (rte1->rtekind == RTE_RELATION)
+ {
+ tableRTEs++;
+ if (tableRTEs > 1)
+ {
+ /*
+ * See if we get two RTEs in case we have two references
+ * to the same table with different aliases
+ */
+ foreach(lc2, qry->rtable)
+ {
+ rte2 = (RangeTblEntry *) lfirst(lc2);
+
+ if (rte2->rtekind == RTE_RELATION)
+ {
+ if (rte2->relid != rte1->relid)
+ {
+ return NULL;
+ }
+ }
+ }
+ continue;
+ }
+ rte_in_query = rte1;
+ }
+ }
+
+ if (tableRTEs > 1)
+ {
+ firstTableRTENumber = 0;
+ foreach(lc1, qry->rtable)
+ {
+ rte1 = (RangeTblEntry *) lfirst(lc1);
+ firstTableRTENumber++;
+ if (rte1->rtekind == RTE_RELATION)
+ {
+ break;
+ }
+ }
+ }
+ else
+ {
+ firstTableRTENumber = 1;
+ }
+
+ attnum = specialAttNum("ctid");
+ Assert(rte_in_query);
+ get_rte_attribute_type(rte_in_query, attnum, &vartypeid, &type_mod, &varcollid);
+ return makeVar(firstTableRTENumber, attnum, vartypeid, type_mod, varcollid, 0);
+}
+
+/*
+ * Returns whether or not the rtable (and its subqueries)
+ * only contain pg_catalog entries.
+ */
+static bool
+contains_only_pg_catalog(List *rtable)
+{
+ ListCell *item;
+
+ /* May be complicated. Before giving up, just check for pg_catalog usage */
+ foreach(item, rtable)
+ {
+ RangeTblEntry *rte = (RangeTblEntry *) lfirst(item);
+
+ if (rte->rtekind == RTE_RELATION)
+ {
+ if (get_rel_namespace(rte->relid) != PG_CATALOG_NAMESPACE)
+ return false;
+ }
+ else if (rte->rtekind == RTE_SUBQUERY &&
+ !contains_only_pg_catalog(rte->subquery->rtable))
+ return false;
+ }
+ return true;
+}
+
+
+/*
+ * Returns true if at least one temporary table is in use
+ * in query (and its subqueries)
+ */
+static bool
+contains_temp_tables(List *rtable)
+{
+ ListCell *item;
+
+ foreach(item, rtable)
+ {
+ RangeTblEntry *rte = (RangeTblEntry *) lfirst(item);
+
+ if (rte->rtekind == RTE_RELATION)
+ {
+ if (IsTempTable(rte->relid))
+ return true;
+ }
+ else if (rte->rtekind == RTE_SUBQUERY &&
+ contains_temp_tables(rte->subquery->rtable))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Create an instance of RemoteQuery and initialize fields
+ */
+static RemoteQuery *
+makeRemoteQuery(void)
+{
+ RemoteQuery *result = makeNode(RemoteQuery);
+ result->combine_type = COMBINE_TYPE_NONE;
+ result->exec_type = EXEC_ON_DATANODES;
+ result->exec_direct_type = EXEC_DIRECT_NONE;
+
+ return result;
+}
+
+/*
+ * get_plan_combine_type - determine combine type
+ *
+ * COMBINE_TYPE_SAME - for replicated updates
+ * COMBINE_TYPE_SUM - for hash and round robin updates
+ * COMBINE_TYPE_NONE - for operations where row_count is not applicable
+ *
+ * return NULL if it is not safe to be done in a single step.
+ */
+static CombineType
+get_plan_combine_type(Query *query, char baselocatortype)
+{
+
+ switch (query->commandType)
+ {
+ case CMD_INSERT:
+ case CMD_UPDATE:
+ case CMD_DELETE:
+ return baselocatortype == LOCATOR_TYPE_REPLICATED ?
+ COMBINE_TYPE_SAME : COMBINE_TYPE_SUM;
+
+ default:
+ return COMBINE_TYPE_NONE;
+ }
+ /* quiet compiler warning */
+ return COMBINE_TYPE_NONE;
+}
+
+/*
+ * get oid of the function whose name is passed as argument
+ */
+
+static Oid
+get_fn_oid(char *fn_name, Oid *p_rettype)
+{
+ Value *fn_nm;
+ List *fn_name_list;
+ FuncDetailCode fdc;
+ bool retset;
+ int nvargs;
+ Oid *true_typeids;
+ Oid func_oid;
+
+ fn_nm = makeString(fn_name);
+ fn_name_list = list_make1(fn_nm);
+
+ fdc = func_get_detail(fn_name_list,
+ NULL, /* argument expressions */
+ NULL, /* argument names */
+ 0, /* argument numbers */
+ NULL, /* argument types */
+ false, /* expand variable number of args */
+ false, /* expand defaults */
+ &func_oid, /* oid of the function - returned detail*/
+ p_rettype, /* function return type - returned detail */
+ &retset, /* - returned detail*/
+ &nvargs, /* - returned detail*/
+ &true_typeids, /* - returned detail */
+ NULL /* argument defaults returned*/
+ );
+
+ pfree(fn_name_list);
+ if (fdc == FUNCDETAIL_NORMAL)
+ {
+ return func_oid;
+ }
+ return InvalidOid;
+}
+
+/*
+ * Append ctid to the field list of step queries to support update
+ * WHERE CURRENT OF. The ctid is not sent down to client but used as a key
+ * to find target tuple.
+ * PGXCTODO: Bug
+ * This function modifies the original query to add ctid
+ * and nodename in the targetlist. It should rather modify the targetlist of the
+ * query to be shipped by the RemoteQuery node.
+ */
+static void
+fetch_ctid_of(Plan *subtree, Query *query)
+{
+ /* recursively process subnodes */
+ if (innerPlan(subtree))
+ fetch_ctid_of(innerPlan(subtree), query);
+ if (outerPlan(subtree))
+ fetch_ctid_of(outerPlan(subtree), query);
+
+ /* we are only interested in RemoteQueries */
+ if (IsA(subtree, RemoteQuery))
+ {
+ RemoteQuery *step = (RemoteQuery *) subtree;
+ TargetEntry *te1;
+ Query *temp_qry;
+ FuncExpr *func_expr;
+ AttrNumber resno;
+ Oid funcid;
+ Oid rettype;
+ Var *ctid_expr;
+ MemoryContext oldcontext;
+ MemoryContext tmpcontext;
+
+ tmpcontext = AllocSetContextCreate(CurrentMemoryContext,
+ "Temp Context",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ oldcontext = MemoryContextSwitchTo(tmpcontext);
+
+ /* Copy the query tree to make changes to the target list */
+ temp_qry = copyObject(query);
+ /* Get the number of entries in the target list */
+ resno = list_length(temp_qry->targetList);
+
+ /* Make a ctid column ref expr to add in target list */
+ ctid_expr = make_ctid_col_ref(temp_qry);
+ if (ctid_expr == NULL)
+ {
+ MemoryContextSwitchTo(oldcontext);
+ MemoryContextDelete(tmpcontext);
+ return;
+ }
+
+ te1 = makeTargetEntry((Expr *)ctid_expr, resno+1, NULL, false);
+
+ /* add the target entry to the query target list */
+ temp_qry->targetList = lappend(temp_qry->targetList, te1);
+
+ /* PGXCTODO We can take this call in initialization rather than getting it always */
+
+ /* Get the Oid of the function */
+ funcid = get_fn_oid("pgxc_node_str", &rettype);
+ if (OidIsValid(funcid))
+ {
+ StringInfoData deparsed_qry;
+ TargetEntry *te2;
+
+ /* create a function expression */
+ func_expr = makeFuncExpr(funcid, rettype, NULL, InvalidOid, InvalidOid, COERCE_DONTCARE);
+ /* make a target entry for function call */
+ te2 = makeTargetEntry((Expr *)func_expr, resno+2, NULL, false);
+ /* add the target entry to the query target list */
+ temp_qry->targetList = lappend(temp_qry->targetList, te2);
+
+ initStringInfo(&deparsed_qry);
+ deparse_query(temp_qry, &deparsed_qry, NIL);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ if (step->sql_statement != NULL)
+ pfree(step->sql_statement);
+
+ step->sql_statement = pstrdup(deparsed_qry.data);
+
+ MemoryContextDelete(tmpcontext);
+ }
+ else
+ {
+ MemoryContextSwitchTo(oldcontext);
+ MemoryContextDelete(tmpcontext);
+ }
+ }
+}
+
+/*
+ * Build up a QueryPlan to execute on.
+ *
+ * This functions tries to find out whether
+ * 1. The statement can be shipped to the Datanode and Coordinator is needed
+ * only as a proxy - in which case, it creates a single node plan.
+ * 2. The statement can be evaluated on the Coordinator completely - thus no
+ * query shipping is involved and standard_planner() is invoked to plan the
+ * statement
+ * 3. The statement needs Coordinator as well as Datanode for evaluation -
+ * again we use standard_planner() to plan the statement.
+ *
+ * The plan generated in either of the above cases is returned.
+ */
+PlannedStmt *
+pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams)
+{
+ PlannedStmt *result;
+
+ /* handle the un-supported statements, obvious errors etc. */
+ pgxc_handle_unsupported_stmts(query);
+
+ result = pgxc_handle_exec_direct(query, cursorOptions, boundParams);
+ if (result)
+ return result;
+
+ /* see if can ship the query completely */
+ result = pgxc_FQS_planner(query, cursorOptions, boundParams);
+ if (result)
+ return result;
+
+ /* we need Coordinator for evaluation, invoke standard planner */
+ result = standard_planner(query, cursorOptions, boundParams);
+ pgxc_set_remote_parameters(result, boundParams);
+ return result;
+}
+
+static PlannedStmt *
+pgxc_handle_exec_direct(Query *query, int cursorOptions,
+ ParamListInfo boundParams)
+{
+ PlannedStmt *result = NULL;
+ PlannerGlobal *glob;
+ PlannerInfo *root;
+ /*
+ * if the query has its utility set, it could be an EXEC_DIRECT statement,
+ * check if it needs to be executed on Coordinator
+ */
+ if (query->utilityStmt &&
+ IsA(query->utilityStmt, RemoteQuery))
+ {
+ RemoteQuery *node = (RemoteQuery *)query->utilityStmt;
+ /* EXECUTE DIRECT statements on remote nodes don't need Coordinator */
+ if (node->exec_direct_type != EXEC_DIRECT_NONE &&
+ node->exec_direct_type != EXEC_DIRECT_LOCAL &&
+ node->exec_direct_type != EXEC_DIRECT_LOCAL_UTILITY)
+ {
+ glob = makeNode(PlannerGlobal);
+ glob->boundParams = boundParams;
+ /* Create a PlannerInfo data structure, usually it is done for a subquery */
+ root = makeNode(PlannerInfo);
+ root->parse = query;
+ root->glob = glob;
+ root->query_level = 1;
+ root->planner_cxt = CurrentMemoryContext;
+ /* build the PlannedStmt result */
+ result = makeNode(PlannedStmt);
+ /* Try and set what we can, rest must have been zeroed out by makeNode() */
+ result->commandType = query->commandType;
+ result->canSetTag = query->canSetTag;
+ /* Set result relations */
+ if (query->commandType != CMD_SELECT)
+ result->resultRelations = list_make1_int(query->resultRelation);
+
+ result->planTree = (Plan *)pgxc_FQS_create_remote_plan(query, NULL, true);
+ result->rtable = query->rtable;
+ /*
+ * We need to save plan dependencies, so that dropping objects will
+ * invalidate the cached plan if it depends on those objects. Table
+ * dependencies are available in glob->relationOids and all other
+ * dependencies are in glob->invalItems. These fields can be retrieved
+ * through set_plan_references().
+ */
+ result->planTree = set_plan_references(root, result->planTree);
+ result->relationOids = glob->relationOids;
+ result->invalItems = glob->invalItems;
+ }
+ }
+
+ /* Set existing remote parameters */
+ pgxc_set_remote_parameters(result, boundParams);
+
+ return result;
+}
+/*
+ * pgxc_handle_unsupported_stmts
+ * Throw error for the statements that can not be handled in XC
+ */
+static void
+pgxc_handle_unsupported_stmts(Query *query)
+{
+ /*
+ * PGXCTODO: This validation will not be removed
+ * until we support moving tuples from one node to another
+ * when the partition column of a table is updated
+ */
+ if (query->commandType == CMD_UPDATE)
+ validate_part_col_updatable(query);
+
+ if (query->returningList)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ (errmsg("RETURNING clause not yet supported"))));
+}
+
+/*
+ * pgxc_FQS_planner
+ * The routine tries to see if the statement can be completely evaluated on the
+ * Datanodes. In such cases Coordinator is not needed to evaluate the statement,
+ * and just acts as a proxy. A statement can be completely shipped to the remote
+ * node if every row of the result can be evaluated on a single Datanode.
+ * For example:
+ *
+ * 1. SELECT * FROM tab1; where tab1 is a distributed table - Every row of the
+ * result set can be evaluated at a single Datanode. Hence this statement is
+ * completely shippable even though many Datanodes are involved in evaluating
+ * complete result set. In such case Coordinator will be able to gather rows
+ * arising from individual Datanodes and proxy the result to the client.
+ *
+ * 2. SELECT count(*) FROM tab1; where tab1 is a distributed table - there is
+ * only one row in the result but it needs input from all the Datanodes. Hence
+ * this is not completely shippable.
+ *
+ * 3. SELECT count(*) FROM tab1; where tab1 is replicated table - since result
+ * can be obtained from a single Datanode, this is a completely shippable
+ * statement.
+ *
+ * fqs in the name of function is acronym for fast query shipping.
+ */
+static PlannedStmt *
+pgxc_FQS_planner(Query *query, int cursorOptions, ParamListInfo boundParams)
+{
+ PlannedStmt *result;
+ PlannerGlobal *glob;
+ PlannerInfo *root;
+ ExecNodes *exec_nodes;
+ Plan *top_plan;
+
+ /* Try by-passing standard planner, if fast query shipping is enabled */
+ if (!enable_fast_query_shipping)
+ return NULL;
+
+ /* Cursor options may come from caller or from DECLARE CURSOR stmt */
+ if (query->utilityStmt &&
+ IsA(query->utilityStmt, DeclareCursorStmt))
+ cursorOptions |= ((DeclareCursorStmt *) query->utilityStmt)->options;
+ /*
+ * If the query can not be or need not be shipped to the Datanodes, don't
+ * create any plan here. standard_planner() will take care of it.
+ */
+ exec_nodes = pgxc_is_query_shippable(query, 0);
+ if (exec_nodes == NULL)
+ return NULL;
+
+ glob = makeNode(PlannerGlobal);
+ glob->boundParams = boundParams;
+ /* Create a PlannerInfo data structure, usually it is done for a subquery */
+ root = makeNode(PlannerInfo);
+ root->parse = query;
+ root->glob = glob;
+ root->query_level = 1;
+ root->planner_cxt = CurrentMemoryContext;
+
+ /*
+ * We decided to ship the query to the Datanode/s, create a RemoteQuery node
+ * for the same.
+ */
+ top_plan = (Plan *)pgxc_FQS_create_remote_plan(query, exec_nodes, false);
+ /*
+ * If creating a plan for a scrollable cursor, make sure it can run
+ * backwards on demand. Add a Material node at the top at need.
+ */
+ if (cursorOptions & CURSOR_OPT_SCROLL)
+ {
+ if (!ExecSupportsBackwardScan(top_plan))
+ top_plan = materialize_finished_plan(top_plan);
+ }
+
+ /*
+ * Just before creating the PlannedStmt, do some final cleanup
+ * We need to save plan dependencies, so that dropping objects will
+ * invalidate the cached plan if it depends on those objects. Table
+ * dependencies are available in glob->relationOids and all other
+ * dependencies are in glob->invalItems. These fields can be retrieved
+ * through set_plan_references().
+ */
+ top_plan = set_plan_references(root, top_plan);
+
+ /* build the PlannedStmt result */
+ result = makeNode(PlannedStmt);
+ /* Try and set what we can, rest must have been zeroed out by makeNode() */
+ result->commandType = query->commandType;
+ result->canSetTag = query->canSetTag;
+ result->utilityStmt = query->utilityStmt;
+
+ /* Set result relations */
+ if (query->commandType != CMD_SELECT)
+ result->resultRelations = list_make1_int(query->resultRelation);
+ result->planTree = top_plan;
+ result->rtable = query->rtable;
+ result->relationOids = glob->relationOids;
+ result->invalItems = glob->invalItems;
+
+ /*
+ * If query is DECLARE CURSOR fetch CTIDs and node names from the remote node
+ * Use CTID as a key to update/delete tuples on remote nodes when handling
+ * WHERE CURRENT OF.
+ */
+ if (query->utilityStmt && IsA(query->utilityStmt, DeclareCursorStmt))
+ fetch_ctid_of(result->planTree, query);
+
+ /* Set existing remote parameters */
+ pgxc_set_remote_parameters(result, boundParams);
+
+ return result;
+}
+
+static RemoteQuery *
+pgxc_FQS_create_remote_plan(Query *query, ExecNodes *exec_nodes, bool is_exec_direct)
+{
+ RemoteQuery *query_step;
+ StringInfoData buf;
+ RangeTblEntry *dummy_rte;
+
+ /* EXECUTE DIRECT statements have their RemoteQuery node already built when analyzing */
+ if (is_exec_direct)
+ {
+ Assert(IsA(query->utilityStmt, RemoteQuery));
+ query_step = (RemoteQuery *)query->utilityStmt;
+ query->utilityStmt = NULL;
+ }
+ else
+ {
+ query_step = makeRemoteQuery();
+ query_step->exec_nodes = exec_nodes;
+ }
+
+ Assert(query_step->exec_nodes);
+
+ /* Datanodes should finalise the results of this query */
+ query->qry_finalise_aggs = true;
+
+ /* Deparse query tree to get step query. */
+ if ( query_step->sql_statement == NULL )
+ {
+ initStringInfo(&buf);
+ deparse_query(query, &buf, NIL);
+ query_step->sql_statement = pstrdup(buf.data);
+ pfree(buf.data);
+ }
+ /*
+ * PGXCTODO: we may route this same Query structure through
+ * standard_planner, where we don't want Datanodes to finalise the results.
+ * Turn it off. At some point, we will avoid routing the same query
+ * structure through the standard_planner by modifying it only when it's not
+ * be routed through standard_planner.
+ */
+ query->qry_finalise_aggs = false;
+ /* Optimize multi-node handling */
+ query_step->read_only = (query->commandType == CMD_SELECT && !query->hasForUpdate);
+ query_step->has_row_marks = query->hasForUpdate;
+
+ /* Check if temporary tables are in use in query */
+ /* PGXC_FQS_TODO: scanning the rtable again for the queries should not be
+ * needed. We should be able to find out if the query has a temporary object
+ * while finding nodes for the objects. But there is no way we can convey
+ * that information here. Till such a connection is available, this is it.
+ */
+ if (contains_temp_tables(query->rtable))
+ query_step->is_temp = true;
+
+ /*
+ * We need to evaluate some expressions like the ExecNodes->en_expr at
+ * Coordinator, prepare those for evaluation. Ideally we should call
+ * preprocess_expression, but it needs PlannerInfo structure for the same
+ */
+ fix_opfuncids((Node *)(query_step->exec_nodes->en_expr));
+ /*
+ * PGXCTODO
+ * When Postgres runs insert into t (a) values (1); against table
+ * defined as create table t (a int, b int); the plan is looking
+ * like insert into t (a,b) values (1,null);
+ * Later executor is verifying plan, to make sure table has not
+ * been altered since plan has been created and comparing table
+ * definition with plan target list and output error if they do
+ * not match.
+ * I could not find better way to generate targetList for pgxc plan
+ * then call standard planner and take targetList from the plan
+ * generated by Postgres.
+ */
+ query_step->combine_type = get_plan_combine_type(
+ query, query_step->exec_nodes->baselocatortype);
+
+ /*
+ * Create a dummy RTE for the remote query being created. Append the dummy
+ * range table entry to the range table. Note that this modifies the master
+ * copy the caller passed us, otherwise e.g EXPLAIN VERBOSE will fail to
+ * find the rte the Vars built below refer to. Also create the tuple
+ * descriptor for the result of this query from the base_tlist (targetlist
+ * we used to generate the remote node query).
+ */
+ dummy_rte = makeNode(RangeTblEntry);
+ dummy_rte->rtekind = RTE_REMOTE_DUMMY;
+ /* Use a dummy relname... */
+ if (is_exec_direct)
+ dummy_rte->relname = "__EXECUTE_DIRECT__";
+ else
+ dummy_rte->relname = "__REMOTE_FQS_QUERY__";
+ dummy_rte->eref = makeAlias("__REMOTE_FQS_QUERY__", NIL);
+ /* Rest will be zeroed out in makeNode() */
+
+ query->rtable = lappend(query->rtable, dummy_rte);
+ query_step->scan.scanrelid = list_length(query->rtable);
+ query_step->scan.plan.targetlist = query->targetList;
+ query_step->base_tlist = query->targetList;
+
+ return query_step;
+}
+
+/*
+ * pgxc_query_needs_coord
+ * Check if the query needs Coordinator for evaluation or it can be completely
+ * evaluated on Coordinator. Return true if so, otherwise return false.
+ */
+static bool
+pgxc_query_needs_coord(Query *query)
+{
+ /*
+ * If the query is an EXEC DIRECT on the same Coordinator where it's fired,
+ * it should not be shipped
+ */
+ if (query->is_local)
+ return true;
+ /*
+ * If the query involves just the catalog tables, and is not an EXEC DIRECT
+ * statement, it can be evaluated completely on the Coordinator. No need to
+ * involve Datanodes.
+ */
+ if (contains_only_pg_catalog(query->rtable))
+ return true;
+
+
+ /* Allow for override */
+ if (query->commandType != CMD_SELECT &&
+ query->commandType != CMD_INSERT &&
+ query->commandType != CMD_UPDATE &&
+ query->commandType != CMD_DELETE)
+ {
+ if (StrictStatementChecking)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ (errmsg("This command is not yet supported."))));
+
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Set the given reason in Shippability_context indicating why the query can not be
+ * shipped directly to the Datanodes.
+ */
+static void
+pgxc_set_shippability_reason(Shippability_context *context, ShippabilityStat reason)
+{
+ context->sc_shippability = bms_add_member(context->sc_shippability, reason);
+}
+
+/*
+ * See if a given reason is why the query can not be shipped directly
+ * to the Datanodes.
+ */
+bool
+pgxc_test_shippability_reason(Shippability_context *context, ShippabilityStat reason)
+{
+ return bms_is_member(reason, context->sc_shippability);
+}
+
+/*
+ * pgxc_is_query_shippable
+ * This function calls the query walker to analyse the query to gather
+ * information like Constraints under which the query can be shippable, nodes
+ * on which the query is going to be executed etc.
+ * Based on the information gathered, it decides whether the query can be
+ * executed on Datanodes directly without involving Coordinator.
+ * If the query is shippable this routine also returns the nodes where the query
+ * should be shipped. If the query is not shippable, it returns NULL.
+ */
+static ExecNodes *
+pgxc_is_query_shippable(Query *query, int query_level)
+{
+ Shippability_context sc_context;
+ ExecNodes *exec_nodes;
+ bool canShip = true;
+ Bitmapset *shippability;
+
+ memset(&sc_context, 0, sizeof(sc_context));
+ /* let's assume that by default query is shippable */
+ sc_context.sc_query = query;
+ sc_context.sc_query_level = query_level;
+ sc_context.sc_for_expr = false;
+
+ /*
+ * We might have already decided not to ship the query to the Datanodes, but
+ * still walk it anyway to find out if there are any subqueries which can be
+ * shipped.
+ */
+ pgxc_shippability_walker((Node *)query, &sc_context);
+ /*
+ * We have merged the nodelists and distributions of all subqueries seen in
+ * the query tree, merge it with the same obtained for the relations
+ * involved in the query.
+ * PGXC_FQS_TODO:
+ * Merge the subquery ExecNodes if both of them are replicated.
+ * The logic to merge node lists with other distribution
+ * strategy is not clear yet.
+ */
+ exec_nodes = sc_context.sc_exec_nodes;
+ if (exec_nodes)
+ exec_nodes = pgxc_merge_exec_nodes(exec_nodes,
+ sc_context.sc_subquery_en, false,
+ true);
+
+ /*
+ * Look at the information gathered by the walker in Shippability_context and that
+ * in the Query structure to decide whether we should ship this query
+ * directly to the Datanode or not
+ */
+
+ /*
+ * If the planner was not able to find the Datanodes to execute the
+ * query, the query is not completely shippable. So, return NULL
+ */
+ if (!exec_nodes)
+ return NULL;
+
+ /* Copy the shippability reasons. We modify the copy for easier handling.
+ * The original can be saved away */
+ shippability = bms_copy(sc_context.sc_shippability);
+
+ /*
+ * If the query has an expression which renders the shippability to single
+ * node, and query needs to be shipped to more than one node, it can not be
+ * shipped
+ */
+ if (bms_is_member(SS_NEED_SINGLENODE, shippability))
+ {
+ /* We handled the reason here, reset it */
+ shippability = bms_del_member(shippability, SS_NEED_SINGLENODE);
+ /* if nodeList has no nodes, the ExecNodes will have other means to know
+ * the nodes where to execute like distribution column expression. We
+ * can't tell how many nodes the query will be executed on, hence treat
+ * that as multiple nodes.
+ */
+ if (list_length(exec_nodes->nodeList) != 1)
+ canShip = false;
+ }
+ /* We have dealt with aggregates as well, delete the Has aggregates status */
+ shippability = bms_del_member(shippability, SS_HAS_AGG_EXPR);
+
+ /* Can not ship the query for some reason */
+ if (!bms_is_empty(shippability))
+ canShip = false;
+
+ /* Always keep this at the end before checking canShip and return */
+ if (!canShip && exec_nodes)
+ FreeExecNodes(&exec_nodes);
+ /* If query is to be shipped, we should know where to execute the query */
+ Assert (!canShip || exec_nodes);
+
+ bms_free(shippability);
+ shippability = NULL;
+
+ return exec_nodes;
+}
+
+/*
+ * pgxc_merge_exec_nodes
+ * The routine combines the two exec_nodes passed such that the resultant
+ * exec_node corresponds to the JOIN of respective relations.
+ * If both exec_nodes can not be merged, it returns NULL.
+ */
+static ExecNodes *
+pgxc_merge_exec_nodes(ExecNodes *en1, ExecNodes *en2, bool merge_dist_equijoin,
+ bool merge_replicated_only)
+{
+ ExecNodes *merged_en = makeNode(ExecNodes);
+ ExecNodes *tmp_en;
+
+ /* If either of exec_nodes are NULL, return the copy of other one */
+ if (!en1)
+ {
+ tmp_en = copyObject(en2);
+ return tmp_en;
+ }
+ if (!en2)
+ {
+ tmp_en = copyObject(en1);
+ return tmp_en;
+ }
+
+ /* Following cases are not handled in this routine */
+ /* PGXC_FQS_TODO how should we handle table usage type? */
+ if (en1->primarynodelist || en2->primarynodelist ||
+ en1->en_expr || en2->en_expr ||
+ OidIsValid(en1->en_relid) || OidIsValid(en2->en_relid) ||
+ en1->accesstype != RELATION_ACCESS_READ || en2->accesstype != RELATION_ACCESS_READ)
+ return NULL;
+
+ if (IsLocatorReplicated(en1->baselocatortype) &&
+ IsLocatorReplicated(en2->baselocatortype))
+ {
+ /*
+ * Replicated/replicated join case
+ * Check that replicated relation is not disjoint
+ * with initial relation which is also replicated.
+ * If there is a common portion of the node list between
+ * the two relations, other rtables have to be checked on
+ * this restricted list.
+ */
+ merged_en->nodeList = list_intersection_int(en1->nodeList,
+ en2->nodeList);
+ merged_en->baselocatortype = LOCATOR_TYPE_REPLICATED;
+ /* No intersection, so has to go through standard planner... */
+ if (!merged_en->nodeList)
+ FreeExecNodes(&merged_en);
+ return merged_en;
+ }
+
+ /*
+ * We are told to merge the nodelists if both the distributions are
+ * replicated. We checked that above, so bail out
+ */
+ if (merge_replicated_only)
+ {
+ FreeExecNodes(&merged_en);
+ return merged_en;
+ }
+
+ if (IsLocatorReplicated(en1->baselocatortype) &&
+ IsLocatorColumnDistributed(en2->baselocatortype))
+ {
+ List *diff_nodelist = NULL;
+ /*
+ * Replicated/distributed join case.
+ * Node list of distributed table has to be included
+ * in node list of replicated table.
+ */
+ diff_nodelist = list_difference_int(en2->nodeList, en1->nodeList);
+ /*
+ * If the difference list is not empty, this means that node list of
+ * distributed table is not completely mapped by node list of replicated
+ * table, so go through standard planner.
+ */
+ if (diff_nodelist)
+ FreeExecNodes(&merged_en);
+ else
+ {
+ merged_en->nodeList = list_copy(en2->nodeList);
+ merged_en->baselocatortype = LOCATOR_TYPE_DISTRIBUTED;
+ }
+ return merged_en;
+ }
+
+ if (IsLocatorColumnDistributed(en1->baselocatortype) &&
+ IsLocatorReplicated(en2->baselocatortype))
+ {
+ List *diff_nodelist = NULL;
+ /*
+ * Distributed/replicated join case.
+ * Node list of distributed table has to be included
+ * in node list of replicated table.
+ */
+ diff_nodelist = list_difference_int(en1->nodeList, en2->nodeList);
+
+ /*
+ * If the difference list is not empty, this means that node list of
+ * distributed table is not completely mapped by node list of replicated
+ * table, so go through standard planner.
+ */
+ if (diff_nodelist)
+ FreeExecNodes(&merged_en);
+ else
+ {
+ merged_en->nodeList = list_copy(en1->nodeList);
+ merged_en->baselocatortype = LOCATOR_TYPE_DISTRIBUTED;
+ }
+ return merged_en;
+ }
+
+ if (IsLocatorColumnDistributed(en1->baselocatortype) &&
+ IsLocatorColumnDistributed(en2->baselocatortype))
+ {
+ /*
+ * Distributed/distributed case
+ * If the caller has suggested that this is an equi-join between two
+ * distributed results, check if both are distributed by the same
+ * distribution strategy, and have the same nodes in the distribution
+ * node list. The caller should have made sure that distribution column
+ * type is same.
+ */
+ if (merge_dist_equijoin &&
+ en1->baselocatortype == en2->baselocatortype &&
+ !list_difference_int(en1->nodeList, en2->nodeList) &&
+ !list_difference_int(en2->nodeList, en1->nodeList))
+ {
+ merged_en->nodeList = list_copy(en1->nodeList);
+ merged_en->baselocatortype = en1->baselocatortype;
+ }
+ else if (list_length(en1->nodeList) == 1 && list_length(en2->nodeList) == 1)
+ {
+ merged_en->nodeList = list_intersection_int(en1->nodeList,
+ en2->nodeList);
+ merged_en->baselocatortype = LOCATOR_TYPE_DISTRIBUTED;
+ }
+ else
+ FreeExecNodes(&merged_en);
+ return merged_en;
+ }
+
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+#ifdef XCP
+ errmsg("Postgres-XL does not support this distribution type yet"),
+#else
+ errmsg("Postgres-XC does not support this distribution type yet"),
+#endif
+ errdetail("The feature is not currently supported")));
+
+ /* Keep compiler happy */
+ return NULL;
+}
+
+/*
+ * pgxc_FQS_find_datanodes
+ * Walk the range table of the query being examined for fast-query-shipping
+ * and compute the set of Datanodes the whole query can be shipped to, by
+ * merging the per-relation node lists one RTE at a time. On success the
+ * resulting ExecNodes is stored in sc_context->sc_exec_nodes; otherwise that
+ * field is set to NULL and the query goes through the standard planner.
+ */
+static void
+pgxc_FQS_find_datanodes(Shippability_context *sc_context)
+{
+	Query *query = sc_context->sc_query;
+	ListCell *rt;
+	ExecNodes *exec_nodes = NULL;
+	bool canShip = true;
+	Index varno = 0;
+	/*
+	 * Distribution column type of the first relation and the set of varnos
+	 * already JOINed on their distribution columns. These must survive
+	 * across loop iterations: they are seeded while scanning the first RTE
+	 * (varno == 1) and consulted/extended for every subsequent one. The
+	 * previous code declared them inside the loop body, where they were
+	 * re-created uninitialized on each iteration, so any iteration after the
+	 * first read indeterminate values. (TODO: distcol_type is mostly not
+	 * needed.)
+	 */
+	Oid distcol_type = InvalidOid;
+	Relids dist_varnos = NULL;
+
+	/* No query, no nodes to execute! */
+	if (!query)
+	{
+		sc_context->sc_exec_nodes = NULL;
+		return;
+	}
+
+	/*
+	 * For every range table entry,
+	 * 1. Find out the Datanodes needed for that range table
+	 * 2. Merge these Datanodes with the already available Datanodes
+	 * 3. If the merge is unsuccessful, we can not ship this query directly to
+	 *    the Datanode/s
+	 */
+	foreach(rt, query->rtable)
+	{
+		RangeTblEntry *rte = (RangeTblEntry *) lfirst(rt);
+
+		varno++;
+		switch (rte->rtekind)
+		{
+			case RTE_RELATION:
+			{
+				ExecNodes *rel_exec_nodes;
+				ExecNodes *tmp_en;
+				bool merge_dist_equijoin = false;
+				/*
+				 * In case of inheritance, child tables can have completely different
+				 * Datanode distribution than parent. To handle inheritance we need
+				 * to merge the Datanodes of the children table as well. The inheritance
+				 * is resolved during planning(?), so we may not have the RTEs of the
+				 * children here. Also, the exact method of merging Datanodes of the
+				 * children is not known yet. So, when inheritance is requested, query
+				 * can not be shipped.
+				 */
+				if (rte->inh)
+				{
+					/*
+					 * See prologue of has_subclass, we might miss on the
+					 * optimization because has_subclass can return true
+					 * even if there aren't any subclasses, but it's ok
+					 */
+					if (has_subclass(rte->relid))
+					{
+						canShip = false;
+						break;
+					}
+				}
+
+				if (rte->relkind != RELKIND_RELATION)
+				{
+					canShip = false;
+					break;
+				}
+				rel_exec_nodes = pgxc_FQS_get_relation_nodes(rte,varno, query);
+				if (!rel_exec_nodes)
+				{
+					/*
+					 * No information about the location of relation in XC,
+					 * a local table OR system catalog. The query can not be
+					 * pushed.
+					 */
+					canShip = false;
+					break;
+				}
+				if (varno == 1)
+				{
+					if (IsLocatorColumnDistributed(rel_exec_nodes->baselocatortype))
+					{
+						RelationLocInfo *rel_loc_info = GetRelationLocInfo(rte->relid);
+						distcol_type = get_atttype(rte->relid,
+												   rel_loc_info->partAttrNum);
+						dist_varnos = bms_make_singleton(varno);
+					}
+					else
+					{
+						distcol_type = InvalidOid;
+						dist_varnos = NULL;
+					}
+				}
+				if (exec_nodes &&
+					IsLocatorDistributedByValue(exec_nodes->baselocatortype) &&
+					OidIsValid(distcol_type) && bms_num_members(dist_varnos) > 0 &&
+					exec_nodes->baselocatortype == rel_exec_nodes->baselocatortype)
+				{
+					/*
+					 * If the already reduced JOINs is distributed the same way
+					 * as the current relation, check if there exists an
+					 * equi-join condition between the relations and the data type
+					 * of distribution column involved is same for both the
+					 * relations
+					 */
+					if (pgxc_qual_hash_dist_equijoin(dist_varnos,
+													 bms_make_singleton(varno),
+													 distcol_type,
+													 query->jointree->quals,
+													 query->rtable))
+						merge_dist_equijoin = true;
+				}
+
+				/* Save the current exec_nodes to be freed later */
+				tmp_en = exec_nodes;
+				exec_nodes = pgxc_merge_exec_nodes(exec_nodes, rel_exec_nodes,
+												   merge_dist_equijoin,
+												   false);
+				/*
+				 * The JOIN is equijoin between distributed tables, and we could
+				 * obtain the nodelist for pushing this JOIN, so add the current
+				 * relation to the list of relations already JOINed in the same
+				 * fashion.
+				 */
+				if (exec_nodes && merge_dist_equijoin)
+					dist_varnos = bms_add_member(dist_varnos, varno);
+				FreeExecNodes(&tmp_en);
+			}
+			break;
+
+			case RTE_JOIN:
+				/* Is information here useful in some or other way? */
+				break;
+			case RTE_CTE:
+			case RTE_SUBQUERY:
+			case RTE_FUNCTION:
+			case RTE_VALUES:
+			default:
+				canShip = false;
+		}
+
+		if (!canShip || !exec_nodes)
+			break;
+	}
+
+	/*
+	 * If we didn't find the Datanodes to ship the query to, we shouldn't ship
+	 * the query :)
+	 */
+	if (!exec_nodes || !(exec_nodes->nodeList || exec_nodes->en_expr))
+		canShip = false;
+
+	if (canShip)
+	{
+		/*
+		 * If relations involved in the query are such that ultimate JOIN is
+		 * replicated JOIN, choose only one of them. If one of them is a
+		 * preferred node choose that one, otherwise choose the first one.
+		 */
+		if (IsLocatorReplicated(exec_nodes->baselocatortype) &&
+			exec_nodes->accesstype == RELATION_ACCESS_READ)
+		{
+			List *tmp_list = exec_nodes->nodeList;
+			ListCell *item;
+			int nodeid = -1;
+			foreach(item, exec_nodes->nodeList)
+			{
+				int cnt_nodes;
+				for (cnt_nodes = 0;
+					 cnt_nodes < num_preferred_data_nodes && nodeid < 0;
+					 cnt_nodes++)
+				{
+					if (PGXCNodeGetNodeId(preferred_data_node[cnt_nodes],
+										  PGXC_NODE_DATANODE) == lfirst_int(item))
+						nodeid = lfirst_int(item);
+				}
+				if (nodeid >= 0)
+					break;
+			}
+			if (nodeid < 0)
+				exec_nodes->nodeList = list_make1_int(linitial_int(exec_nodes->nodeList));
+			else
+				exec_nodes->nodeList = list_make1_int(nodeid);
+			list_free(tmp_list);
+		}
+		sc_context->sc_exec_nodes = exec_nodes;
+	}
+	else if (exec_nodes)
+	{
+		FreeExecNodes(&exec_nodes);
+	}
+	return;
+}
+
+/*
+ * pgxc_qual_hash_dist_equijoin
+ * Scan the (implicit-AND) qual list for a binary operator expression that
+ * equates the distribution columns of a relation in varnos_1 with one in
+ * varnos_2, where both sides have the same data type and the operator is
+ * mergejoinable or hashjoinable. Returns true as soon as one such condition
+ * is found, false otherwise.
+ *
+ * distcol_type is accepted but not consulted in this body; callers pass the
+ * distribution column type (or InvalidOid). NOTE(review): confirm whether it
+ * is intentionally unused here.
+ */
+static bool
+pgxc_qual_hash_dist_equijoin(Relids varnos_1, Relids varnos_2, Oid distcol_type,
+								Node *quals, List *rtable)
+{
+	List		*lquals;
+	ListCell	*qcell;
+
+	/*
+	 * Make a copy of the argument bitmaps, it will be modified by
+	 * bms_first_member().
+	 * NOTE(review): bms_first_member is not actually called in this
+	 * function and bms_is_member does not modify its argument; these copies
+	 * (which are never freed) look like leftovers — confirm before removing.
+	 */
+	varnos_1 = bms_copy(varnos_1);
+	varnos_2 = bms_copy(varnos_2);
+
+	/* Normalize the qual tree into a flat list of AND-ed clauses */
+	lquals = make_ands_implicit((Expr *)quals);
+	foreach(qcell, lquals)
+	{
+		Expr *qual_expr = (Expr *)lfirst(qcell);
+		OpExpr *op;
+		Var *lvar;
+		Var *rvar;
+
+		if (!IsA(qual_expr, OpExpr))
+			continue;
+		op = (OpExpr *)qual_expr;
+		/* If not a binary operator, it can not be '='. */
+		if (list_length(op->args) != 2)
+			continue;
+
+		/*
+		 * Check if both operands are Vars, if not check next expression */
+		if (IsA(linitial(op->args), Var) && IsA(lsecond(op->args), Var))
+		{
+			lvar = (Var *)linitial(op->args);
+			rvar = (Var *)lsecond(op->args);
+		}
+		else
+			continue;
+
+		/*
+		 * If the data types of both the columns are not same, continue. Hash
+		 * and Modulo of a the same bytes will be same if the data types are
+		 * same. So, only when the data types of the columns are same, we can
+		 * ship a distributed JOIN to the Datanodes
+		 */
+		if (exprType((Node *)lvar) != exprType((Node *)rvar))
+			continue;
+
+		/* if the vars do not correspond to the required varnos, continue. */
+		if ((bms_is_member(lvar->varno, varnos_1) && bms_is_member(rvar->varno, varnos_2)) ||
+			(bms_is_member(lvar->varno, varnos_2) && bms_is_member(rvar->varno, varnos_1)))
+		{
+			/* Both Vars must be the distribution column of their relation */
+			if (!VarAttrIsPartAttr(lvar, rtable) ||
+				!VarAttrIsPartAttr(rvar, rtable))
+				continue;
+		}
+		else
+			continue;
+		/*
+		 * If the operator is not an equality operator, check next
+		 * constraint. An operator is treated as an equality operator if it's
+		 * mergejoinable or hashjoinable. Beware that not every equality
+		 * operator is mergejoinable or hashjoinable, so we might leave some
+		 * oportunity. But then we have to rely on the opname which may not
+		 * be something we know to be equality operator as well.
+		 */
+		if (!op_mergejoinable(op->opno, exprType((Node *)lvar)) &&
+			!op_hashjoinable(op->opno, exprType((Node *)lvar)))
+			continue;
+		/* Found equi-join condition on distribution columns */
+		return true;
+	}
+	return false;
+}
+
+/*
+ * VarAttrIsPartAttr
+ * Report whether the given Var refers to the distribution (partition)
+ * column of the plain relation it belongs to.
+ */
+static bool VarAttrIsPartAttr(Var *var, List *rtable)
+{
+	RangeTblEntry *rte = rt_fetch(var->varno, rtable);
+	RelationLocInfo *locinfo;
+
+	/* Only plain relations carry distribution information */
+	if (rte->rtekind != RTE_RELATION || rte->relkind != RELKIND_RELATION)
+		return false;
+
+	locinfo = GetRelationLocInfo(rte->relid);
+
+	/* Unknown distribution (e.g. a local table): not a partition column */
+	return locinfo ? (var->varattno == locinfo->partAttrNum) : false;
+}
+/*
+ * pgxc_FQS_get_relation_nodes
+ * For FQS return ExecNodes structure so as to decide which Datanodes the query
+ * should execute on. If it is possible to set the node list directly, set it.
+ * Otherwise set the appropriate distribution column expression or relid in
+ * ExecNodes structure.
+ *
+ * Returns NULL when the relation's distribution is unknown (a local table or
+ * a system catalog) or when the needed information can not be derived, in
+ * which case the query can not be shipped.
+ */
+static ExecNodes *
+pgxc_FQS_get_relation_nodes(RangeTblEntry *rte, Index varno, Query *query)
+{
+	CmdType command_type = query->commandType;
+	/* A non-empty rowMarks list is treated as a FOR UPDATE-style access */
+	bool for_update = query->rowMarks ? true : false;
+	ExecNodes	*rel_exec_nodes;
+	RelationAccessType rel_access = RELATION_ACCESS_READ;
+	RelationLocInfo *rel_loc_info;
+
+	Assert(rte == rt_fetch(varno, (query->rtable)));
+
+	/* Map the command type onto the kind of relation access being made */
+	switch (command_type)
+	{
+		case CMD_SELECT:
+			if (for_update)
+				rel_access = RELATION_ACCESS_READ_FOR_UPDATE;
+			else
+				rel_access = RELATION_ACCESS_READ;
+			break;
+
+		case CMD_UPDATE:
+		case CMD_DELETE:
+			rel_access = RELATION_ACCESS_UPDATE;
+			break;
+
+		case CMD_INSERT:
+			rel_access = RELATION_ACCESS_INSERT;
+			break;
+
+		default:
+			/* should not happen, but */
+			elog(ERROR, "Unrecognised command type %d", command_type);
+			break;
+	}
+
+
+	rel_loc_info = GetRelationLocInfo(rte->relid);
+	/* If we don't know about the distribution of relation, bail out */
+	if (!rel_loc_info)
+		return NULL;
+
+	/*
+	 * Find out the datanodes to execute this query on.
+	 * PGXC_FQS_TODO: for now, we apply node reduction only when there is only
+	 * one relation involved in the query. If there are multiple distributed
+	 * tables in the query and we apply node reduction here, we may fail to ship
+	 * the entire join. We should apply node reduction transitively.
+	 */
+	if (list_length(query->rtable) == 1)
+		rel_exec_nodes = GetRelationNodesByQuals(rte->relid, varno,
+												 query->jointree->quals, rel_access);
+	else
+		rel_exec_nodes = GetRelationNodes(rel_loc_info, (Datum) 0,
+										  true, InvalidOid, rel_access);
+
+	if (!rel_exec_nodes)
+		return NULL;
+	rel_exec_nodes->accesstype = rel_access;
+	/*
+	 * If we are reading a replicated table, pick all the nodes where it
+	 * resides. If the query has JOIN, it helps picking up a matching set of
+	 * Datanodes for that JOIN. FQS planner will ultimately pick up one node if
+	 * the JOIN is replicated.
+	 */
+	if (rel_access == RELATION_ACCESS_READ &&
+		IsLocatorReplicated(rel_loc_info->locatorType))
+	{
+		list_free(rel_exec_nodes->nodeList);
+		rel_exec_nodes->nodeList = list_copy(rel_loc_info->nodeList);
+	}
+	else if (rel_access == RELATION_ACCESS_INSERT &&
+			 IsLocatorDistributedByValue(rel_loc_info->locatorType))
+	{
+		ListCell *lc;
+		TargetEntry *tle;
+		/*
+		 * If the INSERT is happening on a table distributed by value of a
+		 * column, find out the
+		 * expression for distribution column in the targetlist, and stick in
+		 * in ExecNodes, and clear the nodelist. Execution will find
+		 * out where to insert the row.
+		 */
+		/* It is a partitioned table, get value by looking in targetList */
+		foreach(lc, query->targetList)
+		{
+			tle = (TargetEntry *) lfirst(lc);
+
+			/* Junk entries are not user assignments; skip them */
+			if (tle->resjunk)
+				continue;
+			if (strcmp(tle->resname, rel_loc_info->partAttrName) == 0)
+				break;
+		}
+		/* Not found, bail out */
+		if (!lc)
+			return NULL;
+
+		Assert(tle);
+		/* We found the TargetEntry for the partition column */
+		list_free(rel_exec_nodes->primarynodelist);
+		rel_exec_nodes->primarynodelist = NULL;
+		list_free(rel_exec_nodes->nodeList);
+		rel_exec_nodes->nodeList = NULL;
+		rel_exec_nodes->en_expr = tle->expr;
+		rel_exec_nodes->en_relid = rel_loc_info->relid;
+	}
+	return rel_exec_nodes;
+}
+/*
+ * pgxc_shippability_walker
+ * walks the query/expression tree routed at the node passed in, gathering
+ * information which will help decide whether the query to which this node
+ * belongs is shippable to the Datanodes.
+ *
+ * The function should try to walk the entire tree analysing each subquery for
+ * shippability. If a subquery is shippable but not the whole query, we would be
+ * able to create a RemoteQuery node for that subquery, shipping it to the
+ * Datanode.
+ *
+ * Return value of this function is governed by the same rules as
+ * expression_tree_walker(), see prologue of that function for details.
+ */
+bool
+pgxc_shippability_walker(Node *node, Shippability_context *sc_context)
+{
+	if (node == NULL)
+		return false;
+
+	/* Below is the list of nodes that can appear in a query, examine each
+	 * kind of node and find out under what conditions query with this node can
+	 * be shippable. For each node, update the context (add fields if
+	 * necessary) so that decision whether to FQS the query or not can be made.
+	 */
+	switch(nodeTag(node))
+	{
+		/* Constants are always shippable */
+		case T_Const:
+			break;
+
+		/*
+		 * For placeholder nodes the shippability of the node, depends upon the
+		 * expression which they refer to. It will be checked separately, when
+		 * that expression is encountered.
+		 */
+		case T_CaseTestExpr:
+			break;
+
+		/*
+		 * record_in() function throws error, thus requesting a result in the
+		 * form of anonymous record from datanode gets into error. Hence, if the
+		 * top expression of a target entry is ROW(), it's not shippable.
+		 */
+		case T_TargetEntry:
+		{
+			TargetEntry *tle = (TargetEntry *)node;
+			if (tle->expr)
+			{
+				/* Pseudo-type (e.g. anonymous record) results can not be
+				 * transferred back from the Datanode; mark unshippable. */
+				char typtype = get_typtype(exprType((Node *)tle->expr));
+				if (!typtype || typtype == TYPTYPE_PSEUDO)
+					pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+			}
+		}
+		break;
+
+		case T_SortGroupClause:
+			if (sc_context->sc_for_expr)
+				pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+			break;
+
+		/*
+		 * Nodes, which are shippable if the tree rooted under these nodes is
+		 * shippable
+		 */
+		case T_List:
+		case T_CoerceToDomainValue:
+			/*
+			 * PGXCTODO: mostly, CoerceToDomainValue node appears in DDLs,
+			 * do we handle DDLs here?
+			 */
+		case T_FieldSelect:
+		case T_RangeTblRef:
+		case T_NamedArgExpr:
+		case T_BoolExpr:
+			/*
+			 * PGXCTODO: we might need to take into account the kind of boolean
+			 * operator we have in the quals and see if the corresponding
+			 * function is immutable.
+			 */
+		case T_RelabelType:
+		case T_CoerceViaIO:
+		case T_ArrayCoerceExpr:
+		case T_ConvertRowtypeExpr:
+		case T_CaseExpr:
+		case T_ArrayExpr:
+		case T_RowExpr:
+		case T_CollateExpr:
+		case T_CoalesceExpr:
+		case T_XmlExpr:
+		case T_NullTest:
+		case T_BooleanTest:
+		case T_CoerceToDomain:
+			break;
+
+		case T_ArrayRef:
+			/*
+			 * When multiple values of of an array are updated at once
+			 * FQS planner cannot yet handle SQL representation correctly.
+			 * So disable FQS in this case and let standard planner manage it.
+			 */
+			pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+			break;
+
+		case T_FieldStore:
+			/*
+			 * PostgreSQL deparsing logic does not handle the FieldStore
+			 * for more than one fields (see processIndirection()). So, let's
+			 * handle it through standard planner, where whole row will be
+			 * constructed.
+			 */
+			pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+			break;
+
+		case T_SetToDefault:
+			/*
+			 * PGXCTODO: we should actually check whether the default value to
+			 * be substituted is shippable to the Datanode. Some cases like
+			 * nextval() of a sequence can not be shipped to the Datanode, hence
+			 * for now default values can not be shipped to the Datanodes
+			 */
+			pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+			break;
+
+		case T_Var:
+		{
+			Var	*var = (Var *)node;
+			/*
+			 * if a subquery references an upper level variable, that query is
+			 * not shippable, if shipped alone.
+			 */
+			if (var->varlevelsup > sc_context->sc_max_varlevelsup)
+				sc_context->sc_max_varlevelsup = var->varlevelsup;
+		}
+		break;
+
+		case T_Param:
+		{
+			Param *param = (Param *)node;
+			/* PGXCTODO: Can we handle internally generated parameters? */
+			if (param->paramkind != PARAM_EXTERN)
+				pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+		}
+		break;
+
+		case T_CurrentOfExpr:
+		{
+			/*
+			 * Ideally we should not see CurrentOf expression here, it
+			 * should have been replaced by the CTID = ? expression. But
+			 * still, no harm in shipping it as is.
+			 */
+		}
+		break;
+
+		case T_Aggref:
+		{
+			Aggref *aggref = (Aggref *)node;
+			/*
+			 * An aggregate is completely shippable to the Datanode, if the
+			 * whole group resides on that Datanode. This will be clear when
+			 * we see the GROUP BY clause.
+			 * agglevelsup is minimum of variable's varlevelsup, so we will
+			 * set the sc_max_varlevelsup when we reach the appropriate
+			 * VARs in the tree.
+			 */
+			pgxc_set_shippability_reason(sc_context, SS_HAS_AGG_EXPR);
+			/*
+			 * If a stand-alone expression to be shipped, is an
+			 * 1. aggregate with ORDER BY, DISTINCT directives, it needs all
+			 * the qualifying rows
+			 * 2. aggregate without collection function
+			 * 3. (PGXCTODO:)aggregate with polymorphic transition type, the
+			 *    the transition type needs to be resolved to correctly interpret
+			 *    the transition results from Datanodes.
+			 * Hence, such an expression can not be shipped to the datanodes.
+			 */
+			if (aggref->aggorder ||
+				aggref->aggdistinct ||
+				aggref->agglevelsup ||
+				!aggref->agghas_collectfn ||
+				IsPolymorphicType(aggref->aggtrantype))
+				pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
+		}
+		break;
+
+		case T_FuncExpr:
+		{
+			FuncExpr	*funcexpr = (FuncExpr *)node;
+			/*
+			 * PGXC_FQS_TODO: it's too restrictive not to ship non-immutable
+			 * functions to the Datanode. We need a better way to see what
+			 * can be shipped to the Datanode and what can not be.
+			 */
+			if (!is_immutable_func(funcexpr->funcid))
+				pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+		}
+		break;
+
+		case T_OpExpr:
+		case T_DistinctExpr:		/* struct-equivalent to OpExpr */
+		case T_NullIfExpr:			/* struct-equivalent to OpExpr */
+		{
+			/*
+			 * All of these three are structurally equivalent to OpExpr, so
+			 * cast the node to OpExpr and check if the operator function is
+			 * immutable. See PGXC_FQS_TODO item for FuncExpr.
+			 */
+			OpExpr *op_expr = (OpExpr *)node;
+			Oid		opfuncid = OidIsValid(op_expr->opfuncid) ?
+				op_expr->opfuncid : get_opcode(op_expr->opno);
+			if (!OidIsValid(opfuncid) || !is_immutable_func(opfuncid))
+				pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+		}
+		break;
+
+		case T_ScalarArrayOpExpr:
+		{
+			/*
+			 * Check if the operator function is shippable to the Datanode
+			 * PGXC_FQS_TODO: see immutability note for FuncExpr above
+			 */
+			ScalarArrayOpExpr *sao_expr = (ScalarArrayOpExpr *)node;
+			Oid		opfuncid = OidIsValid(sao_expr->opfuncid) ?
+				sao_expr->opfuncid : get_opcode(sao_expr->opno);
+			if (!OidIsValid(opfuncid) || !is_immutable_func(opfuncid))
+				pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+		}
+		break;
+
+		case T_RowCompareExpr:
+		case T_MinMaxExpr:
+		{
+			/*
+			 * PGXCTODO should we be checking the comparision operator
+			 * functions as well, as we did for OpExpr OR that check is
+			 * unnecessary. Operator functions are always shippable?
+			 * Otherwise this node should be treated similar to other
+			 * "shell" nodes.
+			 */
+		}
+		break;
+
+		case T_Query:
+		{
+			Query *query = (Query *)node;
+
+			/* A stand-alone expression containing Query is not shippable */
+			if (sc_context->sc_for_expr)
+			{
+				pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+				break;
+			}
+			/* We are checking shippability of whole query, go ahead */
+
+			/* CREATE TABLE AS is not supported in FQS */
+			if (query->commandType == CMD_UTILITY &&
+				IsA(query->utilityStmt, CreateTableAsStmt))
+				pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+
+			if (query->hasRecursive)
+				pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+			/*
+			 * If the query needs Coordinator for evaluation or the query can be
+			 * completed on Coordinator itself, we don't ship it to the Datanode
+			 */
+			if (pgxc_query_needs_coord(query))
+				pgxc_set_shippability_reason(sc_context, SS_NEEDS_COORD);
+
+			/* PGXC_FQS_TODO: It should be possible to look at the Query and find out
+			 * whether it can be completely evaluated on the Datanode just like SELECT
+			 * queries. But we need to be careful while finding out the Datanodes to
+			 * execute the query on, esp. for the result relations. If one happens to
+			 * remove/change this restriction, make sure you change
+			 * pgxc_FQS_get_relation_nodes appropriately.
+			 * For now DMLs with single rtable entry are candidates for FQS
+			 */
+			if (query->commandType != CMD_SELECT && list_length(query->rtable) > 1)
+				pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+
+			/*
+			 * In following conditions query is shippable when there is only one
+			 * Datanode involved
+			 * 1. the query has aggregagtes
+			 * 2. the query has window functions
+			 * 3. the query has ORDER BY clause
+			 * 4. the query has Distinct clause
+			 * 5. the query has limit and offset clause
+			 *
+			 * PGXC_FQS_TODO: Condition 1 above is really dependent upon the GROUP BY clause. If
+			 * all rows in each group reside on the same Datanode, aggregates can be
+			 * evaluated on that Datanode, thus condition 1 is has aggregates & the rows
+			 * in any group reside on multiple Datanodes.
+			 * PGXC_FQS_TODO: Condition 2 above is really dependent upon whether the distinct
+			 * clause has distribution column in it. If the distinct clause has
+			 * distribution column in it, we can ship DISTINCT clause to the Datanodes.
+			 */
+			if (query->hasAggs || query->hasWindowFuncs || query->sortClause ||
+				query->distinctClause || query->groupClause || query->havingQual ||
+				query->limitOffset || query->limitCount)
+				pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
+
+			/* walk the entire query tree to analyse the query */
+			if (query_tree_walker(query, pgxc_shippability_walker, sc_context, 0))
+				return true;
+
+			/*
+			 * PGXC_FQS_TODO:
+			 * There is a subquery in this query, which references Vars in the upper
+			 * query. For now stop shipping such queries. We should get rid of this
+			 * condition.
+			 */
+			if (sc_context->sc_max_varlevelsup != 0)
+				pgxc_set_shippability_reason(sc_context, SS_VARLEVEL);
+
+			/*
+			 * Walk the RangeTableEntries of the query and find the
+			 * Datanodes needed for evaluating this query
+			 */
+			pgxc_FQS_find_datanodes(sc_context);
+		}
+		break;
+
+		case T_FromExpr:
+		{
+			/* We don't expect FromExpr in a stand-alone expression */
+			if (sc_context->sc_for_expr)
+				pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+
+			/*
+			 * We will be examining the range table entries separately and
+			 * Join expressions are not candidate for FQS.
+			 * If this is an INSERT query with quals, resulting from say
+			 * conditional rule, we can not handle those in FQS, since there is
+			 * not SQL representation for such quals.
+			 */
+			if (sc_context->sc_query->commandType == CMD_INSERT &&
+				((FromExpr *)node)->quals)
+				pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+
+		}
+		break;
+
+		case T_WindowFunc:
+		{
+			WindowFunc *winf = (WindowFunc *)node;
+			/*
+			 * A window function can be evaluated on a Datanode if there is
+			 * only one Datanode involved.
+			 */
+			pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
+
+			/*
+			 * A window function is not shippable as part of a stand-alone
+			 * expression. If the window function is non-immutable, it can not
+			 * be shipped to the datanodes.
+			 */
+			if (sc_context->sc_for_expr ||
+				!is_immutable_func(winf->winfnoid))
+				pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+		}
+		break;
+
+		case T_WindowClause:
+		{
+			/*
+			 * A window function can be evaluated on a Datanode if there is
+			 * only one Datanode involved.
+			 */
+			pgxc_set_shippability_reason(sc_context, SS_NEED_SINGLENODE);
+
+			/*
+			 * A window function is not shippable as part of a stand-alone
+			 * expression
+			 */
+			if (sc_context->sc_for_expr)
+				pgxc_set_shippability_reason(sc_context, SS_UNSHIPPABLE_EXPR);
+		}
+		break;
+
+		case T_JoinExpr:
+			/* We don't expect JoinExpr in a stand-alone expression */
+			if (sc_context->sc_for_expr)
+				pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+
+			/*
+			 * For JoinExpr in a Query
+			 * The compatibility of joining ranges will be deduced while
+			 * examining the range table of the query. Nothing to do here
+			 */
+			break;
+
+		case T_SubLink:
+		{
+			SubLink		*sublink = (SubLink *)node;
+			ExecNodes	*sublink_en;
+			/*
+			 * Walk the query and find the nodes where the query should be
+			 * executed and node distribution. Merge this with the existing
+			 * node list obtained for other subqueries. If merging fails, we
+			 * can not ship the whole query.
+			 */
+			if (IsA(sublink->subselect, Query))
+				sublink_en = pgxc_is_query_shippable((Query *)(sublink->subselect),
+													 sc_context->sc_query_level);
+			else
+				sublink_en = NULL;
+
+			/* PGXCTODO free the old sc_subquery_en. */
+			/* If we already know that this query does not have a set of nodes
+			 * to evaluate on, don't bother to merge again.
+			 */
+			if (!pgxc_test_shippability_reason(sc_context, SS_NO_NODES))
+			{
+				sc_context->sc_subquery_en = pgxc_merge_exec_nodes(sublink_en,
+																   sc_context->sc_subquery_en,
+																   false,
+																   true);
+				if (!sc_context->sc_subquery_en)
+					pgxc_set_shippability_reason(sc_context, SS_NO_NODES);
+			}
+		}
+		break;
+
+		case T_SubPlan:
+		case T_AlternativeSubPlan:
+		case T_CommonTableExpr:
+		case T_SetOperationStmt:
+		case T_PlaceHolderVar:
+		case T_AppendRelInfo:
+		case T_PlaceHolderInfo:
+		{
+			/* PGXCTODO: till we exhaust this list */
+			pgxc_set_shippability_reason(sc_context, SS_UNSUPPORTED_EXPR);
+		}
+		break;
+
+		default:
+			elog(ERROR, "unrecognized node type: %d",
+				 (int) nodeTag(node));
+			break;
+	}
+	/* Recurse into the children of this node using the standard walker */
+	return expression_tree_walker(node, pgxc_shippability_walker, (void *)sc_context);
+}
+
+/*
+ * See if we can reduce the passed in RemoteQuery nodes to a single step.
+ *
+ * We need to check when we can further collapse already collapsed nodes.
+ * We cannot always collapse- we do not want to allow a replicated table
+ * to be used twice. That is if we have
+ *
+ *      partitioned_1 -- replicated -- partitioned_2
+ *
+ * partitioned_1 and partitioned_2 cannot (usually) be safely joined only
+ * locally.
+ * We can do this by checking (may need tracking) what type it is,
+ * and looking at context->conditions->replicated_joins
+ *
+ * The following cases are possible, and whether or not it is ok
+ * to reduce.
+ *
+ * If the join between the two RemoteQuery nodes is replicated
+ *
+ *      Node 1            Node 2
+ * rep-part folded   rep-part  folded    ok to reduce?
+ *    0       0         0         1           1
+ *    0       0         1         1           1
+ *    0       1         0         1           1
+ *    0       1         1         1           1
+ *    1       1         1         1           0
+ *
+ *
+ * If the join between the two RemoteQuery nodes is replicated - partitioned
+ *
+ *      Node 1            Node 2
+ * rep-part folded   rep-part  folded    ok to reduce?
+ *    0       0         0         1           1
+ *    0       0         1         1           0
+ *    0       1         0         1           1
+ *    0       1         1         1           0
+ *    1       1         1         1           0
+ *
+ *
+ * If the join between the two RemoteQuery nodes is partitioned - partitioned
+ * it is always reducibile safely,
+ *
+ * RemoteQuery *innernode  - the inner node
+ * RemoteQuery *outernode  - the outer node
+ * Relids in_relids        - varnos of the relations on the inner side
+ * Relids out_relids       - varnos of the relations on the outer side
+ * Join *join              - provides the join type and the join quals
+ * JoinPath *join_path     - not referenced in this body
+ *                           (NOTE(review): confirm it is intentionally unused)
+ * List *rtables           - range tables used to resolve Vars in the quals
+ *
+ * Returns the merged ExecNodes when the join is reducible, or NULL when the
+ * two sides can not be merged (the join then goes through standard planning).
+ */
+ExecNodes *
+IsJoinReducible(RemoteQuery *innernode, RemoteQuery *outernode, Relids in_relids, Relids out_relids,
+					Join *join, JoinPath *join_path, List *rtables)
+{
+	ExecNodes	*join_exec_nodes;
+	bool		merge_dist_equijoin = false;
+	bool		merge_replicated_only;
+	ListCell	*cell;
+	ExecNodes	*inner_en = innernode->exec_nodes;
+	ExecNodes	*outer_en = outernode->exec_nodes;
+	List		*quals = join->joinqual;
+
+	/*
+	 * When join type is other than INNER, we will get the unmatched rows on
+	 * either side. The result will be correct only in case both the sides of
+	 * join are replicated. In case one of the sides is replicated, and the
+	 * unmatched results are not coming from that side, it might be possible to
+	 * ship such join, but this needs to be validated from correctness
+	 * perspective.
+	 */
+	merge_replicated_only = (join->jointype != JOIN_INNER);
+
+	/*
+	 * If both the relations are distributed with similar distribution strategy
+	 * walk through the restriction info for this JOIN to find if there is an
+	 * equality condition on the distributed columns of both the relations. In
+	 * such case, we can reduce the JOIN if the distribution nodelist is also
+	 * same.
+	 */
+	if (IsLocatorDistributedByValue(inner_en->baselocatortype) &&
+		inner_en->baselocatortype == outer_en->baselocatortype &&
+		!merge_replicated_only)
+	{
+		foreach(cell, quals)
+		{
+			Node *qual = (Node *)lfirst(cell);
+			if (pgxc_qual_hash_dist_equijoin(in_relids, out_relids, InvalidOid,
+											 qual, rtables))
+			{
+				merge_dist_equijoin = true;
+				break;
+			}
+		}
+	}
+	/*
+	 * If the ExecNodes of inner and outer nodes can be merged, the JOIN is
+	 * shippable
+	 * PGXCTODO: Can we take into consideration the JOIN conditions to optimize
+	 * further?
+	 */
+	join_exec_nodes = pgxc_merge_exec_nodes(inner_en, outer_en,
+											merge_dist_equijoin,
+											merge_replicated_only);
+	return join_exec_nodes;
+}
+
+/*
+ * validate_part_col_updatable
+ * Raise an error if the target list of the given statement assigns to the
+ * distribution (partition) column of a HASH or MODULO distributed table;
+ * updating that column is not supported in this version. Returns silently
+ * in every other case.
+ */
+static void
+validate_part_col_updatable(const Query *query)
+{
+	RangeTblEntry *rte;
+	RelationLocInfo *rel_loc_info;
+	ListCell *lc;
+
+	/* Make sure there is one table at least */
+	if (query->rtable == NULL)
+		return;
+
+	rte = (RangeTblEntry *) list_nth(query->rtable, query->resultRelation - 1);
+
+	/*
+	 * Bail out when there is no result-relation RTE or it is not a plain
+	 * relation. The original condition (rte != NULL && ...) fell through
+	 * when rte was NULL and then dereferenced it below; test for NULL with
+	 * OR so both bad cases return here.
+	 */
+	if (rte == NULL || rte->relkind != RELKIND_RELATION)
+		/* Bad relation type */
+		return;
+
+	/* See if we have the partitioned case. */
+	rel_loc_info = GetRelationLocInfo(rte->relid);
+
+	/* Any column updation on local relations is fine */
+	if (!rel_loc_info)
+		return;
+
+	/* Only LOCATOR_TYPE_HASH & LOCATOR_TYPE_MODULO should be checked */
+	if ( (rel_loc_info->partAttrName != NULL) &&
+		( (rel_loc_info->locatorType == LOCATOR_TYPE_HASH) || (rel_loc_info->locatorType == LOCATOR_TYPE_MODULO) ) )
+	{
+		/* It is a partitioned table, check partition column in targetList */
+		foreach(lc, query->targetList)
+		{
+			TargetEntry *tle = (TargetEntry *) lfirst(lc);
+
+			/* Junk entries are not user assignments; skip them */
+			if (tle->resjunk)
+				continue;
+
+			/*
+			 * See if we have a constant expression comparing against the
+			 * designated partitioned column
+			 */
+			if (strcmp(tle->resname, rel_loc_info->partAttrName) == 0)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_COLUMN_REFERENCE),
+						(errmsg("Partition column can't be updated in current version"))));
+		}
+	}
+}
+
+/*
+ * AddRemoteQueryNode
+ *
+ * Add a Remote Query node to launch on Datanodes.
+ * This can only be done for a query at Top Level to avoid
+ * duplicated queries on Datanodes.
+ *
+ * stmts          - list of statements to append to
+ * queryString    - SQL text to run remotely (kept by reference, not copied)
+ * remoteExecType - where to run it; EXEC_ON_NONE is a no-op
+ * is_temp        - whether the query touches temporary objects
+ *
+ * Returns the (possibly extended) statement list.
+ */
+List *
+AddRemoteQueryNode(List *stmts, const char *queryString, RemoteQueryExecType remoteExecType, bool is_temp)
+{
+	List *result = stmts;
+
+	/* If node is applied on EXEC_ON_NONE, simply return the list unchanged */
+	if (remoteExecType == EXEC_ON_NONE)
+		return result;
+
+	/* Only a remote Coordinator is allowed to send a query to backend nodes */
+	if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
+	{
+		RemoteQuery *step = makeNode(RemoteQuery);
+		step->combine_type = COMBINE_TYPE_SAME;
+		step->sql_statement = (char *) queryString;
+		step->exec_type = remoteExecType;
+		step->is_temp = is_temp;
+		result = lappend(result, step);
+	}
+
+	return result;
+}
+
+/*
+ * pgxc_query_contains_temp_tables
+ *
+ * Scan a list of Query nodes and report whether any SELECT/INSERT/UPDATE/
+ * DELETE among them references a temporary table in its range table.
+ */
+bool
+pgxc_query_contains_temp_tables(List *queries)
+{
+	ListCell *cell;
+
+	foreach(cell, queries)
+	{
+		Query *query = (Query *) lfirst(cell);
+
+		if (query == NULL)
+			continue;
+
+		/* Only DML/SELECT statements carry a range table worth checking */
+		if ((query->commandType == CMD_SELECT ||
+			 query->commandType == CMD_UPDATE ||
+			 query->commandType == CMD_INSERT ||
+			 query->commandType == CMD_DELETE) &&
+			contains_temp_tables(query->rtable))
+			return true;
+	}
+
+	return false;
+}
+#endif
+
+
+#ifdef XCP
+/*
+ * AddRemoteQueryNode
+ *
+ * Add a Remote Query node to launch on Datanodes.
+ * This can only be done for a query at Top Level to avoid
+ * duplicated queries on Datanodes.
+ */
+List *
+AddRemoteQueryNode(List *stmts, const char *queryString, RemoteQueryExecType remoteExecType)
+{
+	List *result = stmts;
+
+	/* If node is applied on EXEC_ON_NONE, simply return the list unchanged */
+	if (remoteExecType == EXEC_ON_NONE)
+		return result;
+
+	/*
+	 * Only a remote Coordinator is allowed to send a query to backend nodes.
+	 * EXEC_ON_CURRENT is the exception: it targets the local node only, so
+	 * it is accepted regardless of where the connection came from.
+	 */
+	if (remoteExecType == EXEC_ON_CURRENT ||
+			(IS_PGXC_COORDINATOR && !IsConnFromCoord()))
+	{
+		RemoteQuery *step = makeNode(RemoteQuery);
+		step->combine_type = COMBINE_TYPE_SAME;
+		/* queryString is referenced, not copied; caller keeps it alive */
+		step->sql_statement = (char *) queryString;
+		step->exec_type = remoteExecType;
+		result = lappend(result, step);
+	}
+
+	return result;
+}
+#endif
+
+
+/*
+ * pgxc_direct_planner
+ * The routine tries to see if the statement can be completely evaluated on the
+ * datanodes. In such cases coordinator is not needed to evaluate the statement,
+ * and just acts as a proxy. A statement can be completely shipped to the remote
+ * node if every row of the result can be evaluated on a single datanode.
+ * For example:
+ *
+ * Only EXECUTE DIRECT statements are sent directly as of now
+ */
+PlannedStmt *
+pgxc_direct_planner(Query *query, int cursorOptions, ParamListInfo boundParams)
+{
+	PlannedStmt *result;
+	RemoteQuery *query_step;
+
+	/* build the PlannedStmt result */
+	result = makeNode(PlannedStmt);
+
+	/* Try and set what we can */
+	result->commandType = query->commandType;
+	result->canSetTag = query->canSetTag;
+	result->utilityStmt = query->utilityStmt;
+	result->rtable = query->rtable;
+
+	/* EXECUTE DIRECT statements have their RemoteQuery node already built when analyzing */
+	if (query->utilityStmt
+		&& IsA(query->utilityStmt, RemoteQuery))
+	{
+		RemoteQuery *stmt = (RemoteQuery *) query->utilityStmt;
+		if (stmt->exec_direct_type != EXEC_DIRECT_NONE)
+		{
+			/* Promote the pre-built node to be the plan; detach from utility */
+			query_step = stmt;
+			query->utilityStmt = NULL;
+			result->utilityStmt = NULL;
+		}
+	}
+
+	/*
+	 * NOTE(review): query_step is assigned only inside the EXECUTE DIRECT
+	 * branch above.  If this planner were ever reached for any other kind
+	 * of statement, the dereference below would read an uninitialized
+	 * pointer.  Presumably callers only invoke this for EXECUTE DIRECT
+	 * (per the header comment) -- confirm before extending its use.
+	 */
+	/* Optimize multi-node handling */
+	query_step->read_only = query->commandType == CMD_SELECT;
+
+	result->planTree = (Plan *) query_step;
+
+#ifndef XCP
+	query->qry_finalise_aggs = false;
+#endif
+	/* Plan output columns come straight from the query's target list */
+	query_step->scan.plan.targetlist = query->targetList;
+
+	return result;
+}
+
+#ifndef XCP
+/*
+ * pgxc_query_contains_utility
+ *
+ * Check if there is any utility statement in given list of queries.
+ */
+bool
+pgxc_query_contains_utility(List *queries)
+{
+	ListCell *elt;
+
+	foreach(elt, queries)
+	{
+		Query *query = (Query *) lfirst(elt);
+
+		/* Skip empty list entries */
+		if (!query)
+			continue;
+
+		/* A single utility command in the list is enough to answer true */
+		if (query->commandType == CMD_UTILITY)
+			return true;
+	}
+
+	return false;
+}
+
+
+/*
+ * pgxc_set_remote_parameters
+ *
+ * Set the list of remote parameters for remote plan
+ */
+static void
+pgxc_set_remote_parameters(PlannedStmt *plan, ParamListInfo boundParams)
+{
+	Oid *param_types;
+	int cntParam, i;
+
+	/* Leave if no plan */
+	if (!plan)
+		return;
+
+	/* Leave if no parameters */
+	if (!boundParams)
+		return;
+
+	/*
+	 * Count the number of remote parameters available.
+	 * We need to take into account all the parameters
+	 * that are prior to the latest available. This ensures
+	 * that the remote node will not complain about an incorrect
+	 * number of parameters. In case parameters with no types
+	 * are taken into account, they are considered as NULL entries.
+	 */
+	cntParam = 0;
+	for (i = 0; i < boundParams->numParams; i++)
+	{
+		/* cntParam ends up as index-after-last parameter with a valid type */
+		if (OidIsValid(boundParams->params[i].ptype))
+			cntParam = i + 1;
+	}
+
+	/* If there are no parameters available, simply leave */
+	if (cntParam == 0)
+		return;
+
+	param_types = (Oid *) palloc(sizeof(Oid) * cntParam);
+
+	/* Then fill the array of types (typeless slots keep InvalidOid) */
+	for (i = 0; i < cntParam; i++)
+		param_types[i] = boundParams->params[i].ptype;
+
+	/* Finally save the parameters in plan */
+	SetRemoteStatementName(plan->planTree, NULL,
+						   cntParam, param_types, 0);
+
+	return;
+}
+#endif
diff --git a/src/backend/pgxc/pool/Makefile b/src/backend/pgxc/pool/Makefile
index 019c756735..f3830be690 100644
--- a/src/backend/pgxc/pool/Makefile
+++ b/src/backend/pgxc/pool/Makefile
@@ -14,6 +14,6 @@ subdir = src/backend/pgxc/pool
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = pgxcnode.o execRemote.o poolmgr.o poolcomm.o poolutils.o
+OBJS = pgxcnode.o execRemote.o poolmgr.o poolcomm.o postgresql_fdw.o poolutils.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c
index f19eb0498f..14c75747c0 100644
--- a/src/backend/pgxc/pool/execRemote.c
+++ b/src/backend/pgxc/pool/execRemote.c
@@ -5,6 +5,11 @@
* Functions to execute commands on remote Datanodes
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
@@ -31,6 +36,11 @@
#include "libpq/libpq.h"
#include "miscadmin.h"
#include "pgxc/execRemote.h"
+#ifdef XCP
+#include "executor/nodeSubplan.h"
+#include "nodes/nodeFuncs.h"
+#include "pgstat.h"
+#endif
#include "nodes/nodes.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/var.h"
@@ -52,8 +62,17 @@
/* Enforce the use of two-phase commit when temporary objects are used */
bool EnforceTwoPhaseCommit = true;
-
+/*
+ * We do not want it too long, when query is terminating abnormally we just
+ * want to read in already available data, if datanode connection will reach a
+ * consistent state after that, we will go normal clean up procedure: send down
+ * ABORT etc., if data node is not responding we will signal pooler to drop
+ * the connection.
+ * It is better to drop and recreate datanode connection than to wait for
+ * several seconds while it is being cleaned up when, for example, cancelling query.
+ */
#define END_QUERY_TIMEOUT 20
+#ifndef XCP
#define ROLLBACK_RESP_LEN 9
typedef enum RemoteXactNodeStatus
@@ -108,6 +127,7 @@ typedef struct RemoteXactState
} RemoteXactState;
static RemoteXactState remoteXactState;
+#endif
#ifdef PGXC
typedef struct
@@ -124,6 +144,7 @@ typedef struct
#define COPY_BUFFER_SIZE 8192
#define PRIMARY_NODE_WRITEAHEAD 1024 * 1024
+#ifndef XCP
/*
* List of PGXCNodeHandle to track readers and writers involved in the
* current transaction
@@ -131,6 +152,7 @@ typedef struct
static List *XactWriteNodes;
static List *XactReadNodes;
static char *preparedNodes;
+#endif
/*
* Flag to track if a temporary object is accessed by the current transaction
@@ -148,39 +170,169 @@ static PGXCNodeAllHandles * get_exec_connections(RemoteQueryState *planstate,
ExecNodes *exec_nodes,
RemoteQueryExecType exec_type);
+#ifndef XCP
static void close_node_cursors(PGXCNodeHandle **connections, int conn_count, char *cursor);
static int pgxc_get_transaction_nodes(PGXCNodeHandle *connections[], int size, bool writeOnly);
static int pgxc_get_connections(PGXCNodeHandle *connections[], int size, List *connlist);
+#endif
static bool pgxc_start_command_on_connection(PGXCNodeHandle *connection,
RemoteQueryState *remotestate, Snapshot snapshot);
+#ifndef XCP
static TupleTableSlot * RemoteQueryNext(ScanState *node);
static bool RemoteQueryRecheck(RemoteQueryState *node, TupleTableSlot *slot);
-
static char *generate_begin_command(void);
-static bool pgxc_node_remote_prepare(char *prepareGID);
+#endif
+
+#ifdef XCP
+static char *pgxc_node_remote_prepare(char *prepareGID, bool localNode);
+static bool pgxc_node_remote_finish(char *prepareGID, bool commit,
+ char *nodestring, GlobalTransactionId gxid,
+ GlobalTransactionId prepare_gxid);
+#else
+static bool pgxc_node_remote_prepare(char *prepareGID, bool localNode);
+static char *pgxc_node_get_nodelist(bool localNode);
+#endif
static void pgxc_node_remote_commit(void);
static void pgxc_node_remote_abort(void);
-static char *pgxc_node_get_nodelist(bool localNode);
+#ifdef XCP
+static void pgxc_connections_cleanup(ResponseCombiner *combiner);
+static void pgxc_node_report_error(ResponseCombiner *combiner);
+#else
static void ExecClearTempObjectIncluded(void);
static void init_RemoteXactState(bool preparedLocalNode);
static void clear_RemoteXactState(void);
static void pgxc_node_report_error(RemoteQueryState *combiner);
-static TupleTableSlot *getrow_for_tapesort(RemoteQueryState *combiner,
- TupleTableSlot *scanslot);
-static bool IsReturningDMLOnReplicatedTable(RemoteQuery *rq);
-static void SetDataRowForIntParams(TupleTableSlot *slot, RemoteQueryState *rq_state);
+#endif
+
+#ifdef XCP
+#define REMOVE_CURR_CONN(combiner) \
+ if ((combiner)->current_conn < --((combiner)->conn_count)) \
+ { \
+ (combiner)->connections[(combiner)->current_conn] = \
+ (combiner)->connections[(combiner)->conn_count]; \
+ } \
+ else \
+ (combiner)->current_conn = 0
+#endif
+
+#define MAX_STATEMENTS_PER_TRAN 10
+
+/* Variables to collect statistics */
+static int total_transactions = 0;
+static int total_statements = 0;
+static int total_autocommit = 0;
+static int nonautocommit_2pc = 0;
+static int autocommit_2pc = 0;
+static int current_tran_statements = 0;
+static int *statements_per_transaction = NULL;
+static int *nodes_per_transaction = NULL;
+
+/*
+ * statistics collection: count a statement
+ */
+static void
+stat_statement()
+{
+	/* Count one statement, both overall and within the current transaction */
+	total_statements++;
+	current_tran_statements++;
+}
+
+/*
+ * To collect statistics: count a transaction
+ */
+static void
+stat_transaction(int node_count)
+{
+	total_transactions++;
+
+	/*
+	 * Lazily allocate the per-statement-count histogram; slot
+	 * MAX_STATEMENTS_PER_TRAN acts as the overflow ("N+") bucket.
+	 * NOTE(review): malloc result is not checked for NULL here.
+	 */
+	if (!statements_per_transaction)
+	{
+		statements_per_transaction = (int *) malloc((MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int));
+		memset(statements_per_transaction, 0, (MAX_STATEMENTS_PER_TRAN + 1) * sizeof(int));
+	}
+	if (current_tran_statements > MAX_STATEMENTS_PER_TRAN)
+		statements_per_transaction[MAX_STATEMENTS_PER_TRAN]++;
+	else
+		statements_per_transaction[current_tran_statements]++;
+	/* Reset the per-transaction statement counter for the next transaction */
+	current_tran_statements = 0;
+	if (node_count > 0 && node_count <= NumDataNodes)
+	{
+		/* Lazily allocate the nodes-per-transaction histogram, 1-based buckets */
+		if (!nodes_per_transaction)
+		{
+			nodes_per_transaction = (int *) malloc(NumDataNodes * sizeof(int));
+			memset(nodes_per_transaction, 0, NumDataNodes * sizeof(int));
+		}
+		nodes_per_transaction[node_count - 1]++;
+	}
+}
+
+
+#ifdef NOT_USED
+/*
+ * To collect statistics: count a two-phase commit on nodes
+ */
+static void
+stat_2pc()
+{
+	/* Attribute this 2PC to the autocommit or non-autocommit bucket */
+	if (autocommit)
+		autocommit_2pc++;
+	else
+		nonautocommit_2pc++;
+}
+#endif
+
+
+/*
+ * Output collected statistics to the log
+ */
+static void
+stat_log()
+{
+	elog(DEBUG1, "Total Transactions: %d Total Statements: %d", total_transactions, total_statements);
+	elog(DEBUG1, "Autocommit: %d 2PC for Autocommit: %d 2PC for non-Autocommit: %d",
+		 total_autocommit, autocommit_2pc, nonautocommit_2pc);
+	/* total_transactions also guards against division by zero below */
+	if (total_transactions)
+	{
+		if (statements_per_transaction)
+		{
+			int i;
+
+			for (i = 0; i < MAX_STATEMENTS_PER_TRAN; i++)
+				elog(DEBUG1, "%d Statements per Transaction: %d (%d%%)",
+					 i, statements_per_transaction[i], statements_per_transaction[i] * 100 / total_transactions);
+		}
+		/*
+		 * Overflow bucket report sits outside the NULL guard above; it is
+		 * safe in practice because total_transactions > 0 implies
+		 * stat_transaction() ran and allocated the array -- but note the
+		 * asymmetry if these functions are ever refactored.
+		 */
+		elog(DEBUG1, "%d+ Statements per Transaction: %d (%d%%)",
+			 MAX_STATEMENTS_PER_TRAN, statements_per_transaction[MAX_STATEMENTS_PER_TRAN], statements_per_transaction[MAX_STATEMENTS_PER_TRAN] * 100 / total_transactions);
+		if (nodes_per_transaction)
+		{
+			int i;
+
+			for (i = 0; i < NumDataNodes; i++)
+				elog(DEBUG1, "%d Nodes per Transaction: %d (%d%%)",
+					 i + 1, nodes_per_transaction[i], nodes_per_transaction[i] * 100 / total_transactions);
+		}
+	}
+}
+
/*
* Create a structure to store parameters needed to combine responses from
* multiple connections as well as state information
*/
+#ifdef XCP
+void
+InitResponseCombiner(ResponseCombiner *combiner, int node_count,
+ CombineType combine_type)
+#else
static RemoteQueryState *
CreateResponseCombiner(int node_count, CombineType combine_type)
+#endif
{
+#ifndef XCP
RemoteQueryState *combiner;
/* ResponseComber is a typedef for pointer to ResponseCombinerData */
@@ -190,32 +342,52 @@ CreateResponseCombiner(int node_count, CombineType combine_type)
/* Out of memory */
return combiner;
}
-
+#endif
combiner->node_count = node_count;
combiner->connections = NULL;
combiner->conn_count = 0;
combiner->combine_type = combine_type;
combiner->command_complete_count = 0;
combiner->request_type = REQUEST_TYPE_NOT_DEFINED;
- combiner->tuple_desc = NULL;
combiner->description_count = 0;
combiner->copy_in_count = 0;
combiner->copy_out_count = 0;
+ combiner->copy_file = NULL;
combiner->errorMessage = NULL;
combiner->errorDetail = NULL;
- combiner->query_Done = false;
+ combiner->tuple_desc = NULL;
+#ifdef XCP
+ combiner->probing_primary = false;
+ combiner->returning_node = InvalidOid;
+ combiner->currentRow = NULL;
+#else
combiner->currentRow.msg = NULL;
combiner->currentRow.msglen = 0;
combiner->currentRow.msgnode = 0;
+#endif
combiner->rowBuffer = NIL;
combiner->tapenodes = NULL;
+#ifdef XCP
+ combiner->merge_sort = false;
+ combiner->extended_query = false;
+ combiner->tapemarks = NULL;
+ combiner->tuplesortstate = NULL;
+ combiner->cursor = NULL;
+ combiner->update_cursor = NULL;
+ combiner->cursor_count = 0;
+ combiner->cursor_connections = NULL;
combiner->remoteCopyType = REMOTE_COPY_NONE;
+#else
+ combiner->initAggregates = true;
+ combiner->query_Done = false;
combiner->copy_file = NULL;
combiner->rqs_cmd_id = FirstCommandId;
return combiner;
+#endif
}
+
/*
* Parse out row count from the command status response and convert it to integer
*/
@@ -311,8 +483,16 @@ create_tuple_desc(char *msg_body, size_t len)
* Handle CopyOutCommandComplete ('c') message from a Datanode connection
*/
static void
+#ifdef XCP
+HandleCopyOutComplete(ResponseCombiner *combiner)
+#else
HandleCopyOutComplete(RemoteQueryState *combiner)
+#endif
{
+#ifdef XCP
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return;
+#endif
if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
combiner->request_type = REQUEST_TYPE_COPY_OUT;
if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
@@ -328,7 +508,11 @@ HandleCopyOutComplete(RemoteQueryState *combiner)
* Handle CommandComplete ('C') message from a Datanode connection
*/
static void
+#ifdef XCP
+HandleCommandComplete(ResponseCombiner *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn)
+#else
HandleCommandComplete(RemoteQueryState *combiner, char *msg_body, size_t len, PGXCNodeHandle *conn)
+#endif
{
int digits = 0;
EState *estate = combiner->ss.ps.state;
@@ -350,11 +534,22 @@ HandleCommandComplete(RemoteQueryState *combiner, char *msg_body, size_t len, PG
{
if (combiner->command_complete_count)
{
+#ifdef XCP
/*
- * For comments on why non_fqs_dml is required
- * see comments in ExecProcNodeDMLInXC
+			 * Replicated command may succeed on one node and fail on
+ * another. The example is if distributed table referenced
+ * by a foreign key constraint defined on a partitioned
+ * table. If command deletes rows from the replicated table
+ * they may be referenced on one Datanode but not on other.
+ * So, replicated command on each Datanode either affects
+ * proper number of rows, or returns error. Here if
+ * combiner got an error already, we allow to report it,
+ * not the scaring data corruption message.
*/
- if (rowcount != estate->es_processed && !combiner->non_fqs_dml)
+ if (combiner->errorMessage == NULL && rowcount != estate->es_processed)
+#else
+ if (rowcount != estate->es_processed)
+#endif
/* There is a consistency issue in the database with the replicated table */
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
@@ -362,19 +557,53 @@ HandleCommandComplete(RemoteQueryState *combiner, char *msg_body, size_t len, PG
}
else
/* first result */
- if (!combiner->non_fqs_dml)
- estate->es_processed = rowcount;
+ estate->es_processed = rowcount;
}
else
- if (!combiner->non_fqs_dml)
- estate->es_processed += rowcount;
+ estate->es_processed += rowcount;
}
else
combiner->combine_type = COMBINE_TYPE_NONE;
}
/* If response checking is enable only then do further processing */
-
+#ifdef XCP
+ if (conn->ck_resp_rollback)
+ {
+ if (strcmp(msg_body, "ROLLBACK") == 0)
+ {
+ /*
+ * Subsequent clean up routine will be checking this flag
+ * to determine nodes where to send ROLLBACK PREPARED.
+ * On current node PREPARE has failed and the two-phase record
+ * does not exist, so clean this flag as if PREPARE was not sent
+ * to that node and avoid erroneous command.
+ */
+ conn->ck_resp_rollback = false;
+ /*
+ * Set the error, if none, to force throwing.
+ * If there is error already, it will be thrown anyway, do not add
+ * this potentially confusing message
+ */
+ if (combiner->errorMessage == NULL)
+ {
+ combiner->errorMessage =
+ pstrdup("unexpected ROLLBACK from remote node");
+ /*
+ * ERRMSG_PRODUCER_ERROR
+ * Messages with this code are replaced by others, if they are
+ * received, so if node will send relevant error message that
+ * one will be replaced.
+ */
+ combiner->errorCode[0] = 'X';
+ combiner->errorCode[1] = 'X';
+ combiner->errorCode[2] = '0';
+ combiner->errorCode[3] = '1';
+ combiner->errorCode[4] = '0';
+ }
+ }
+ }
+#else
if (conn->ck_resp_rollback == RESP_ROLLBACK_CHECK)
{
conn->ck_resp_rollback = RESP_ROLLBACK_NOT_RECEIVED;
@@ -384,6 +613,7 @@ HandleCommandComplete(RemoteQueryState *combiner, char *msg_body, size_t len, PG
conn->ck_resp_rollback = RESP_ROLLBACK_RECEIVED;
}
}
+#endif
combiner->command_complete_count++;
}
@@ -392,8 +622,16 @@ HandleCommandComplete(RemoteQueryState *combiner, char *msg_body, size_t len, PG
* Handle RowDescription ('T') message from a Datanode connection
*/
static bool
+#ifdef XCP
+HandleRowDescription(ResponseCombiner *combiner, char *msg_body, size_t len)
+#else
HandleRowDescription(RemoteQueryState *combiner, char *msg_body, size_t len)
+#endif
{
+#ifdef XCP
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return false;
+#endif
if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
combiner->request_type = REQUEST_TYPE_QUERY;
if (combiner->request_type != REQUEST_TYPE_QUERY)
@@ -441,8 +679,16 @@ HandleParameterStatus(RemoteQueryState *combiner, char *msg_body, size_t len)
* Handle CopyInResponse ('G') message from a Datanode connection
*/
static void
+#ifdef XCP
+HandleCopyIn(ResponseCombiner *combiner)
+#else
HandleCopyIn(RemoteQueryState *combiner)
+#endif
{
+#ifdef XCP
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return;
+#endif
if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
combiner->request_type = REQUEST_TYPE_COPY_IN;
if (combiner->request_type != REQUEST_TYPE_COPY_IN)
@@ -463,8 +709,16 @@ HandleCopyIn(RemoteQueryState *combiner)
* Handle CopyOutResponse ('H') message from a Datanode connection
*/
static void
+#ifdef XCP
+HandleCopyOut(ResponseCombiner *combiner)
+#else
HandleCopyOut(RemoteQueryState *combiner)
+#endif
{
+#ifdef XCP
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return;
+#endif
if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
combiner->request_type = REQUEST_TYPE_COPY_OUT;
if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
@@ -485,8 +739,16 @@ HandleCopyOut(RemoteQueryState *combiner)
* Handle CopyOutDataRow ('d') message from a Datanode connection
*/
static void
+#ifdef XCP
+HandleCopyDataRow(ResponseCombiner *combiner, char *msg_body, size_t len)
+#else
HandleCopyDataRow(RemoteQueryState *combiner, char *msg_body, size_t len)
+#endif
{
+#ifdef XCP
+ if (combiner->request_type == REQUEST_TYPE_ERROR)
+ return;
+#endif
if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
combiner->request_type = REQUEST_TYPE_COPY_OUT;
@@ -511,6 +773,9 @@ HandleCopyDataRow(RemoteQueryState *combiner, char *msg_body, size_t len)
pq_putmessage('d', msg_body, len);
break;
case REMOTE_COPY_TUPLESTORE:
+#ifdef XCP
+ tuplestore_putmessage(combiner->tuplestorestate, len, msg_body);
+#else
{
Datum *values;
bool *nulls;
@@ -586,6 +851,7 @@ HandleCopyDataRow(RemoteQueryState *combiner, char *msg_body, size_t len)
pfree(in_functions);
pfree(typioparams);
}
+#endif
break;
case REMOTE_COPY_NONE:
default:
@@ -595,9 +861,67 @@ HandleCopyDataRow(RemoteQueryState *combiner, char *msg_body, size_t len)
/*
* Handle DataRow ('D') message from a Datanode connection
- * The function returns true if buffer can accept more data rows.
- * Caller must stop reading if function returns false
+ * The function returns true if data row is accepted and successfully stored
+ * within the combiner.
*/
+#ifdef XCP
+static bool
+HandleDataRow(ResponseCombiner *combiner, char *msg_body, size_t len, Oid node)
+{
+	/* We expect previous message is consumed */
+	Assert(combiner->currentRow == NULL);
+
+	/* Combiner already in error state: drop the row silently */
+	if (combiner->request_type == REQUEST_TYPE_ERROR)
+		return false;
+
+	if (combiner->request_type != REQUEST_TYPE_QUERY)
+	{
+		/* Inconsistent responses */
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("Unexpected response from the data nodes for 'D' message, current request type %d", combiner->request_type)));
+	}
+
+	/*
+	 * If we got an error already ignore incoming data rows from other nodes
+	 * Still we want to continue reading until get CommandComplete
+	 */
+	if (combiner->errorMessage)
+		return false;
+
+	/*
+	 * Replicated INSERT/UPDATE/DELETE with RETURNING: receive only tuples
+	 * from one node, skip others as duplicates
+	 */
+	if (combiner->combine_type == COMBINE_TYPE_SAME)
+	{
+		/* Do not return rows when probing primary, instead return when doing
+		 * first normal node. Just save some CPU and traffic in case if
+		 * probing fails.
+		 */
+		if (combiner->probing_primary)
+			return false;
+		if (OidIsValid(combiner->returning_node))
+		{
+			/* Rows are only accepted from the first node that returned one */
+			if (combiner->returning_node != node)
+				return false;
+		}
+		else
+			combiner->returning_node = node;
+	}
+
+	/*
+	 * We are copying message because it points into connection buffer, and
+	 * will be overwritten on next socket read
+	 */
+	combiner->currentRow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + len);
+	memcpy(combiner->currentRow->msg, msg_body, len);
+	combiner->currentRow->msglen = len;
+	combiner->currentRow->msgnode = node;
+
+	return true;
+}
+#else
static void
HandleDataRow(RemoteQueryState *combiner, char *msg_body, size_t len, int nid)
{
@@ -634,12 +958,17 @@ HandleDataRow(RemoteQueryState *combiner, char *msg_body, size_t len, int nid)
combiner->currentRow.msglen = len;
combiner->currentRow.msgnode = nid;
}
+#endif
/*
* Handle ErrorResponse ('E') message from a Datanode connection
*/
static void
+#ifdef XCP
+HandleError(ResponseCombiner *combiner, char *msg_body, size_t len)
+#else
HandleError(RemoteQueryState *combiner, char *msg_body, size_t len)
+#endif
{
/* parse error message */
char *code = NULL;
@@ -692,6 +1021,24 @@ HandleError(RemoteQueryState *combiner, char *msg_body, size_t len)
* ReadyForQuery is received, so we just store the error message.
* If multiple connections return errors only first one is reported.
*/
+#ifdef XCP
+ /*
+ * The producer error may be hiding primary error, so if previously received
+ * error is a producer error allow it to be overwritten.
+ */
+ if (combiner->errorMessage == NULL ||
+ MAKE_SQLSTATE(combiner->errorCode[0], combiner->errorCode[1],
+ combiner->errorCode[2], combiner->errorCode[3],
+ combiner->errorCode[4]) == ERRCODE_PRODUCER_ERROR)
+ {
+ combiner->errorMessage = pstrdup(message);
+ /* Error Code is exactly 5 significant bytes */
+ if (code)
+ memcpy(combiner->errorCode, code, 5);
+ if (detail)
+ combiner->errorDetail = pstrdup(detail);
+ }
+#else
if (!combiner->errorMessage)
{
combiner->errorMessage = pstrdup(message);
@@ -704,6 +1051,7 @@ HandleError(RemoteQueryState *combiner, char *msg_body, size_t len)
{
combiner->errorDetail = pstrdup(detail);
}
+#endif
/*
* If Datanode have sent ErrorResponse it will never send CommandComplete.
@@ -791,8 +1139,13 @@ HandleCmdComplete(CmdType commandType, CombineTag *combine,
/*
* HandleDatanodeCommandId ('M') message from a Datanode connection
*/
+#ifdef XCP
+static void
+HandleDatanodeCommandId(ResponseCombiner *combiner, char *msg_body, size_t len)
+#else
static void
HandleDatanodeCommandId(RemoteQueryState *combiner, char *msg_body, size_t len)
+#endif
{
uint32 n32;
CommandId cid;
@@ -814,7 +1167,11 @@ HandleDatanodeCommandId(RemoteQueryState *combiner, char *msg_body, size_t len)
* successfully
*/
static bool
+#ifdef XCP
+validate_combiner(ResponseCombiner *combiner)
+#else
validate_combiner(RemoteQueryState *combiner)
+#endif
{
/* There was error message while combining */
if (combiner->errorMessage)
@@ -853,6 +1210,24 @@ validate_combiner(RemoteQueryState *combiner)
/*
* Close combiner and free allocated memory, if it is not needed
*/
+#ifdef XCP
+void
+CloseCombiner(ResponseCombiner *combiner)
+{
+	/*
+	 * Free the combiner's owned storage.  The ResponseCombiner struct itself
+	 * is embedded in a larger state node and is not freed here.
+	 * NOTE(review): errorDetail, currentRow and rowBuffer are not released
+	 * here -- presumably freed elsewhere or by memory-context reset; confirm.
+	 */
+	if (combiner->connections)
+		pfree(combiner->connections);
+	if (combiner->tuple_desc)
+		FreeTupleDesc(combiner->tuple_desc);
+	if (combiner->errorMessage)
+		pfree(combiner->errorMessage);
+	if (combiner->cursor_connections)
+		pfree(combiner->cursor_connections);
+	if (combiner->tapenodes)
+		pfree(combiner->tapenodes);
+	if (combiner->tapemarks)
+		pfree(combiner->tapemarks);
+}
+#else
static void
CloseCombiner(RemoteQueryState *combiner)
{
@@ -881,12 +1256,17 @@ CloseCombiner(RemoteQueryState *combiner)
pfree(combiner);
}
}
+#endif
/*
* Validate combiner and release storage freeing allocated memory
*/
static bool
+#ifdef XCP
+ValidateAndCloseCombiner(ResponseCombiner *combiner)
+#else
ValidateAndCloseCombiner(RemoteQueryState *combiner)
+#endif
{
bool valid = validate_combiner(combiner);
@@ -911,6 +1291,171 @@ ValidateAndCloseCombiner(RemoteQueryState *combiner)
* points to the original RemoteQueryState. If combiner differs from "this" the
* connection should be buffered.
*/
+#ifdef XCP
+void
+BufferConnection(PGXCNodeHandle *conn)
+{
+	ResponseCombiner *combiner = conn->combiner;
+	MemoryContext oldcontext;
+
+	/* Nothing to do unless the connection is mid-query for some combiner */
+	if (combiner == NULL || conn->state != DN_CONNECTION_STATE_QUERY)
+		return;
+
+	elog(LOG, "Buffer connection %u to step %s", conn->nodeoid, combiner->cursor);
+
+	/*
+	 * When BufferConnection is invoked CurrentContext is related to other
+	 * portal, which is trying to control the connection.
+	 * TODO See if we can find better context to switch to
+	 */
+	oldcontext = MemoryContextSwitchTo(combiner->ss.ps.ps_ResultTupleSlot->tts_mcxt);
+
+	/* Verify the connection is in use by the combiner */
+	combiner->current_conn = 0;
+	while (combiner->current_conn < combiner->conn_count)
+	{
+		if (combiner->connections[combiner->current_conn] == conn)
+			break;
+		combiner->current_conn++;
+	}
+	Assert(combiner->current_conn < combiner->conn_count);
+
+	/* Lazily allocate per-connection bookmarks into the row buffer */
+	if (combiner->tapemarks == NULL)
+		combiner->tapemarks = (ListCell**) palloc0(combiner->conn_count * sizeof(ListCell*));
+
+	/*
+	 * If current bookmark for the current tape is not set it means either
+	 * first row in the buffer is from the current tape or no rows from
+	 * the tape in the buffer, so if first row is not from current
+	 * connection bookmark the last cell in the list.
+	 */
+	if (combiner->tapemarks[combiner->current_conn] == NULL &&
+			list_length(combiner->rowBuffer) > 0)
+	{
+		RemoteDataRow dataRow = (RemoteDataRow) linitial(combiner->rowBuffer);
+		if (dataRow->msgnode != conn->nodeoid)
+			combiner->tapemarks[combiner->current_conn] = list_tail(combiner->rowBuffer);
+	}
+
+	/*
+	 * Buffer data rows until data node return number of rows specified by the
+	 * fetch_size parameter of last Execute message (PortalSuspended message)
+	 * or end of result set is reached (CommandComplete message)
+	 */
+	while (true)
+	{
+		int res;
+
+		/* Move to buffer currentRow (received from the data node) */
+		if (combiner->currentRow)
+		{
+			combiner->rowBuffer = lappend(combiner->rowBuffer,
+										  combiner->currentRow);
+			combiner->currentRow = NULL;
+		}
+
+		res = handle_response(conn, combiner);
+		/*
+		 * If response message is a DataRow it will be handled on the next
+		 * iteration.
+		 * PortalSuspended will cause connection state change and break the loop
+		 * The same is for CommandComplete, but we need additional handling -
+		 * remove connection from the list of active connections.
+		 * We may need to add handling error response
+		 */
+
+		/* Most often result check first */
+		if (res == RESPONSE_DATAROW)
+		{
+			/*
+			 * The row is in the combiner->currentRow, on next iteration it will
+			 * be moved to the buffer
+			 */
+			continue;
+		}
+
+		/* incomplete message, read more */
+		if (res == RESPONSE_EOF)
+		{
+			/* Block (1 second timeout) waiting for more data from the node */
+			if (pgxc_node_receive(1, &conn, NULL))
+			{
+				conn->state = DN_CONNECTION_STATE_ERROR_FATAL;
+				add_error_message(conn, "Failed to fetch from data node");
+			}
+		}
+
+		/*
+		 * End of result set is reached, so either set the pointer to the
+		 * connection to NULL (combiner with sort) or remove it from the list
+		 * (combiner without sort)
+		 */
+		else if (res == RESPONSE_COMPLETE)
+		{
+			/*
+			 * If combiner is doing merge sort we should set reference to the
+			 * current connection to NULL in the array, indicating the end
+			 * of the tape is reached. FetchTuple will try to access the buffer
+			 * first anyway.
+			 * Since we remove that reference we can not determine what node
+			 * number was this connection, but we need this info to find proper
+			 * tuple in the buffer if we are doing merge sort. So store node
+			 * number in special array.
+			 * NB: We can not test if combiner->tuplesortstate is set here:
+			 * connection may require buffering inside tuplesort_begin_merge
+			 * - while pre-read rows from the tapes, one of the tapes may be
+			 * the local connection with RemoteSubplan in the tree. The
+			 * combiner->tuplesortstate is set only after tuplesort_begin_merge
+			 * returns.
+			 */
+			if (combiner->merge_sort)
+			{
+				combiner->connections[combiner->current_conn] = NULL;
+				if (combiner->tapenodes == NULL)
+					combiner->tapenodes = (Oid *)
+							palloc0(combiner->conn_count * sizeof(Oid));
+				combiner->tapenodes[combiner->current_conn] = conn->nodeoid;
+			}
+			else
+			{
+				/* Remove current connection, move last in-place, adjust current_conn */
+				if (combiner->current_conn < --combiner->conn_count)
+					combiner->connections[combiner->current_conn] = combiner->connections[combiner->conn_count];
+				else
+					combiner->current_conn = 0;
+			}
+			/*
+			 * If combiner runs Simple Query Protocol we need to read in
+			 * ReadyForQuery. In case of Extended Query Protocol it is not
+			 * sent and we should quit.
+			 */
+			if (combiner->extended_query)
+				break;
+		}
+		else if (res == RESPONSE_ERROR)
+		{
+			if (combiner->extended_query)
+			{
+				/*
+				 * Need to sync connection to enable receiving commands
+				 * by the datanode
+				 */
+				if (pgxc_node_send_sync(conn) != 0)
+					ereport(ERROR,
+							(errcode(ERRCODE_INTERNAL_ERROR),
+							 errmsg("Failed to fetch from data node")));
+			}
+		}
+		else if (res == RESPONSE_SUSPENDED || res == RESPONSE_READY)
+		{
+			/* Now it is OK to quit */
+			break;
+		}
+	}
+	/* Connection must have left the QUERY state before we detach from it */
+	Assert(conn->state != DN_CONNECTION_STATE_QUERY);
+	MemoryContextSwitchTo(oldcontext);
+	conn->combiner = NULL;
+}
+#else
void
BufferConnection(PGXCNodeHandle *conn)
{
@@ -982,7 +1527,7 @@ BufferConnection(PGXCNodeHandle *conn)
* connection to NULL (step with sort) or remove it from the list
* (step without sort)
*/
- if (combiner->rqs_for_sort)
+ if (combiner->tuplesortstate)
{
combiner->connections[combiner->current_conn] = NULL;
if (combiner->tapenodes == NULL)
@@ -1008,11 +1553,29 @@ BufferConnection(PGXCNodeHandle *conn)
MemoryContextSwitchTo(oldcontext);
conn->combiner = NULL;
}
+#endif
/*
* copy the datarow from combiner to the given slot, in the slot's memory
* context
*/
+#ifdef XCP
+static void
+CopyDataRowTupleToSlot(ResponseCombiner *combiner, TupleTableSlot *slot)
+{
+	RemoteDataRow datarow;
+	MemoryContext oldcontext;
+	/* Allocate the copy in the slot's own memory context so the slot owns it */
+	oldcontext = MemoryContextSwitchTo(slot->tts_mcxt);
+	datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + combiner->currentRow->msglen);
+	datarow->msgnode = combiner->currentRow->msgnode;
+	datarow->msglen = combiner->currentRow->msglen;
+	memcpy(datarow->msg, combiner->currentRow->msg, datarow->msglen);
+	/* Slot takes ownership (shouldFree = true); release the combiner's copy */
+	ExecStoreDataRowTuple(datarow, slot, true);
+	pfree(combiner->currentRow);
+	combiner->currentRow = NULL;
+	MemoryContextSwitchTo(oldcontext);
+}
+#else
static void
CopyDataRowTupleToSlot(RemoteQueryState *combiner, TupleTableSlot *slot)
{
@@ -1028,7 +1591,349 @@ CopyDataRowTupleToSlot(RemoteQueryState *combiner, TupleTableSlot *slot)
combiner->currentRow.msgnode = 0;
MemoryContextSwitchTo(oldcontext);
}
+#endif
+
+
+#ifdef XCP
+/*
+ * FetchTuple
+ *
+ * Get next tuple from one of the datanode connections.
+ * The connections should be in combiner->connections, if "local" dummy
+ * connection presents it should be the last active connection in the array.
+ * If combiner is set up to perform merge sort function returns tuple from
+ * connection defined by combiner->current_conn, or NULL slot if no more tuple
+ * are available from the connection. Otherwise it returns tuple from any
+ * connection or NULL slot if no more available connections.
+ * Function looks into combiner->rowBuffer before accessing connection
+ * and return a tuple from there if found.
+ * Function may wait while more data arrive from the data nodes. If there
+ * is a locally executed subplan function advance it and buffer resulting rows
+ * instead of waiting.
+ */
+TupleTableSlot *
+FetchTuple(ResponseCombiner *combiner)
+{
+ PGXCNodeHandle *conn;
+ TupleTableSlot *slot;
+ Oid nodeOid = -1;
+
+ /*
+ * Case if we run local subplan.
+ * We do not have remote connections, so just get local tuple and return it
+ */
+ if (outerPlanState(combiner))
+ {
+ RemoteSubplanState *planstate = (RemoteSubplanState *) combiner;
+ RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan;
+ /* Advance subplan in a loop until we have something to return */
+ for (;;)
+ {
+ /* NOTE(review): Datum is an integer type; (Datum) 0 would be a
+ * cleaner initializer than NULL -- confirm */
+ Datum value = NULL;
+ bool isnull;
+ int numnodes;
+ int i;
+
+ slot = ExecProcNode(outerPlanState(combiner));
+ /* If locator is not defined deliver all the results */
+ if (planstate->locator == NULL)
+ return slot;
+
+ /*
+ * If NULL tuple is returned we done with the subplan, finish it up and
+ * return NULL
+ */
+ if (TupIsNull(slot))
+ return NULL;
+
+ /* Get partitioning value if defined */
+ if (plan->distributionKey != InvalidAttrNumber)
+ value = slot_getattr(slot, plan->distributionKey, &isnull);
+
+ /* Determine target nodes */
+ numnodes = GET_NODES(planstate->locator, value, isnull, NULL);
+ for (i = 0; i < numnodes; i++)
+ {
+ /* Deliver the node: return the tuple only if this node is a target */
+ if (planstate->dest_nodes[i] == PGXCNodeId-1)
+ return slot;
+ }
+ }
+ }
+
+ /*
+ * Get current connection
+ */
+ if (combiner->conn_count > combiner->current_conn)
+ conn = combiner->connections[combiner->current_conn];
+ else
+ conn = NULL;
+
+ /*
+ * If doing merge sort determine the node number.
+ * It may be needed to get buffered row.
+ */
+ if (combiner->merge_sort)
+ {
+ Assert(conn || combiner->tapenodes);
+ nodeOid = conn ? conn->nodeoid :
+ combiner->tapenodes[combiner->current_conn];
+ Assert(OidIsValid(nodeOid));
+ }
+ /*
+ * First look into the row buffer.
+ * When we are performing merge sort we need to get from the buffer record
+ * from the connection marked as "current". Otherwise get first.
+ */
+ if (list_length(combiner->rowBuffer) > 0)
+ {
+ RemoteDataRow dataRow;
+
+ Assert(combiner->currentRow == NULL);
+
+ if (combiner->merge_sort)
+ {
+ ListCell *lc;
+ ListCell *prev;
+
+ elog(LOG, "Getting buffered tuple from node %x", nodeOid);
+
+ /* Resume scanning from this tape's bookmark, if one was saved */
+ prev = combiner->tapemarks[combiner->current_conn];
+ if (prev)
+ {
+ /*
+ * Start looking through the list from the bookmark.
+ * Probably the first cell we check contains row from the needed
+ * node. Otherwise continue scanning until we encounter one,
+ * advancing prev pointer as well.
+ */
+ while((lc = lnext(prev)) != NULL)
+ {
+ dataRow = (RemoteDataRow) lfirst(lc);
+ if (dataRow->msgnode == nodeOid)
+ {
+ combiner->currentRow = dataRow;
+ break;
+ }
+ prev = lc;
+ }
+ }
+ else
+ {
+ /*
+ * Either needed row is the first in the buffer or no such row
+ */
+ lc = list_head(combiner->rowBuffer);
+ dataRow = (RemoteDataRow) lfirst(lc);
+ if (dataRow->msgnode == nodeOid)
+ combiner->currentRow = dataRow;
+ else
+ lc = NULL;
+ }
+ if (lc)
+ {
+ /*
+ * Delete cell from the buffer. Before we delete we must check
+ * the bookmarks, if the cell is a bookmark for any tape.
+ * If it is the case we are deleting last row of the current
+ * block from the current tape. That tape should have bookmark
+ * like current, and current bookmark will be advanced when we
+ * read the tape once again.
+ */
+ int i;
+ for (i = 0; i < combiner->conn_count; i++)
+ {
+ if (combiner->tapemarks[i] == lc)
+ combiner->tapemarks[i] = prev;
+ }
+ elog(LOG, "Found buffered tuple from node %x", nodeOid);
+ combiner->rowBuffer = list_delete_cell(combiner->rowBuffer,
+ lc, prev);
+ }
+ elog(LOG, "Update tapemark");
+ combiner->tapemarks[combiner->current_conn] = prev;
+ }
+ else
+ {
+ /* Not sorting: just consume the first buffered row */
+ dataRow = (RemoteDataRow) linitial(combiner->rowBuffer);
+ combiner->currentRow = dataRow;
+ combiner->rowBuffer = list_delete_first(combiner->rowBuffer);
+ }
+ }
+
+ /* If we have node message in the currentRow slot, and it is from a proper
+ * node, consume it. */
+ if (combiner->currentRow)
+ {
+ Assert(!combiner->merge_sort ||
+ combiner->currentRow->msgnode == nodeOid);
+ slot = combiner->ss.ps.ps_ResultTupleSlot;
+ CopyDataRowTupleToSlot(combiner, slot);
+ return slot;
+ }
+
+ /* Nothing buffered: read from the network until a row arrives or all
+ * connections are exhausted */
+ while (conn)
+ {
+ int res;
+
+ /* Going to use a connection, buffer it if needed */
+ CHECK_OWNERSHIP(conn, combiner);
+
+ /*
+ * If current connection is idle it means portal on the data node is
+ * suspended. Request more and try to get it
+ */
+ if (combiner->extended_query &&
+ conn->state == DN_CONNECTION_STATE_IDLE)
+ {
+ /*
+ * We do not allow to suspend if querying primary node, so that
+ * only may mean the current node is secondary and subplan was not
+ * executed there yet. Return and go on with second phase.
+ */
+ if (combiner->probing_primary)
+ return NULL;
+ if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to fetch from data node")));
+ if (pgxc_node_send_flush(conn) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to fetch from data node")));
+ if (pgxc_node_receive(1, &conn, NULL))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to fetch from data node")));
+ }
+
+ /* read messages */
+ res = handle_response(conn, combiner);
+ if (res == RESPONSE_DATAROW)
+ {
+ slot = combiner->ss.ps.ps_ResultTupleSlot;
+ CopyDataRowTupleToSlot(combiner, slot);
+ return slot;
+ }
+ else if (res == RESPONSE_EOF)
+ {
+ /* incomplete message, read more */
+ if (pgxc_node_receive(1, &conn, NULL))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to fetch from data node")));
+ continue;
+ }
+ else if (res == RESPONSE_SUSPENDED)
+ {
+ /*
+ * If we are doing merge sort or probing primary node we should
+ * remain on the same node, so query next portion immediately.
+ * Otherwise leave node suspended and fetch lazily.
+ */
+ if (combiner->merge_sort || combiner->probing_primary)
+ {
+ if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to fetch from data node")));
+ if (pgxc_node_send_flush(conn) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to fetch from data node")));
+ if (pgxc_node_receive(1, &conn, NULL))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to fetch from data node")));
+ continue;
+ }
+ /* Round-robin to the next connection, leaving this one suspended */
+ if (++combiner->current_conn >= combiner->conn_count)
+ combiner->current_conn = 0;
+ conn = combiner->connections[combiner->current_conn];
+ }
+ else if (res == RESPONSE_COMPLETE)
+ {
+ /*
+ * In case of Simple Query Protocol we should receive ReadyForQuery
+ * before removing connection from the list. In case of Extended
+ * Query Protocol we may remove connection right away.
+ */
+ if (combiner->extended_query)
+ {
+ /* If we are doing merge sort clean current connection and return
+ * NULL, otherwise remove current connection, move last in-place,
+ * adjust current_conn and continue if it is not last connection */
+ if (combiner->merge_sort)
+ {
+ combiner->connections[combiner->current_conn] = NULL;
+ return NULL;
+ }
+ REMOVE_CURR_CONN(combiner);
+ if (combiner->conn_count > 0)
+ conn = combiner->connections[combiner->current_conn];
+ else
+ return NULL;
+ }
+ }
+ else if (res == RESPONSE_ERROR)
+ {
+ /*
+ * If doing Extended Query Protocol we need to sync connection,
+ * otherwise subsequent commands will be ignored.
+ */
+ if (combiner->extended_query)
+ {
+ if (pgxc_node_send_sync(conn) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to fetch from data node")));
+ }
+ /*
+ * Do not wait for response from primary, it needs to wait
+ * for other nodes to respond. Instead go ahead and send query to
+ * other nodes. It will fail there, but we can continue with
+ * normal cleanup.
+ */
+ if (combiner->probing_primary)
+ {
+ REMOVE_CURR_CONN(combiner);
+ return NULL;
+ }
+ }
+ else if (res == RESPONSE_READY)
+ {
+ /* If we are doing merge sort clean current connection and return
+ * NULL, otherwise remove current connection, move last in-place,
+ * adjust current_conn and continue if it is not last connection */
+ if (combiner->merge_sort)
+ {
+ combiner->connections[combiner->current_conn] = NULL;
+ return NULL;
+ }
+ REMOVE_CURR_CONN(combiner);
+ if (combiner->conn_count > 0)
+ conn = combiner->connections[combiner->current_conn];
+ else
+ return NULL;
+ }
+ else if (res == RESPONSE_TUPDESC)
+ {
+ ExecSetSlotDescriptor(combiner->ss.ps.ps_ResultTupleSlot,
+ combiner->tuple_desc);
+ /* Now slot is responsible for freeing the descriptor */
+ combiner->tuple_desc = NULL;
+ }
+ else
+ {
+ /* Can not get here? */
+ Assert(false);
+ }
+ }
+
+ return NULL;
+}
+#else
/*
* Get next data row from the combiner's buffer into provided slot
* Just clear slot and return false if buffer is empty, that means end of result
@@ -1039,12 +1944,6 @@ FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot)
{
bool have_tuple = false;
- /*
- * We don't expect the RemoteQuery feeding a sort to come this way. As of
- * now, such a RemoteQuery gets the rows as dictated by the Sort plan above,
- * hence fetches the rows on its own.
- */
- Assert(!combiner->rqs_for_sort);
/* If we have message in the buffer, consume it */
if (combiner->currentRow.msg)
{
@@ -1053,6 +1952,15 @@ FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot)
}
/*
+ * If this is ordered fetch we can not know what is the node
+ * to handle next, so sorter will choose next itself and set it as
+ * currentRow to have it consumed on the next call to FetchTuple.
+ * Otherwise allow to prefetch next tuple
+ */
+ if (((RemoteQuery *)combiner->ss.ps.plan)->sort)
+ return have_tuple;
+
+ /*
* Note: If we are fetching not sorted results we can not have both
* currentRow and buffered rows. When connection is buffered currentRow
* is moved to buffer, and then it is cleaned after buffering is
@@ -1155,6 +2063,15 @@ FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot)
CopyDataRowTupleToSlot(combiner, slot);
have_tuple = true;
}
+
+ /*
+ * If this is ordered fetch we can not know what is the node
+ * to handle next, so sorter will choose next itself and set it as
+ * currentRow to have it consumed on the next call to FetchTuple.
+ * Otherwise allow to prefetch next tuple
+ */
+ if (((RemoteQuery *)combiner->ss.ps.plan)->sort)
+ return have_tuple;
}
/* report end of data to the caller */
@@ -1163,14 +2080,20 @@ FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot)
return have_tuple;
}
+#endif
/*
* Handle responses from the Datanode connections
*/
static int
+#ifdef XCP
+pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections,
+ struct timeval * timeout, ResponseCombiner *combiner)
+#else
pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections,
struct timeval * timeout, RemoteQueryState *combiner)
+#endif
{
int count = conn_count;
PGXCNodeHandle *to_receive[conn_count];
@@ -1199,6 +2122,14 @@ pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections,
i++;
break;
case RESPONSE_COMPLETE:
+#ifdef XCP
+ if (to_receive[i]->state != DN_CONNECTION_STATE_ERROR_FATAL)
+ /* Continue read until ReadyForQuery */
+ break;
+ /* fallthru */
+ case RESPONSE_READY:
+ /* fallthru */
+#endif
case RESPONSE_COPY:
/* Handling is done, do not track this connection */
count--;
@@ -1206,6 +2137,11 @@ pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections,
if (i < count)
to_receive[i] = to_receive[count];
break;
+#ifdef XCP
+ case RESPONSE_ERROR:
+ /* no handling needed, just wait for ReadyForQuery */
+ break;
+#endif
default:
/* Inconsistent responses */
add_error_message(to_receive[i], "Unexpected response from the Datanodes");
@@ -1217,11 +2153,169 @@ pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections,
}
}
}
+#ifndef XCP
pgxc_node_report_error(combiner);
+#endif
return 0;
}
+#ifdef XCP
+/*
+ * Read next message from the connection and update the combiner
+ * and connection state accordingly
+ * If we are in an error state we just consume the messages, and do not proxy
+ * Long term, we should look into cancelling executing statements
+ * and closing the connections.
+ * It returns if states need to be handled
+ * Return values:
+ * RESPONSE_EOF - need to receive more data for the connection
+ * RESPONSE_READY - got ReadyForQuery
+ * RESPONSE_COMPLETE - done with the connection, but not yet ready for query.
+ * Also this result is output in case of error
+ * RESPONSE_SUSPENDED - got PortalSuspended
+ * RESPONSE_TUPDESC - got tuple description
+ * RESPONSE_DATAROW - got data row
+ * RESPONSE_COPY - got copy response
+ * RESPONSE_BARRIER_OK - barrier command completed successfully
+ */
+int
+handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner)
+{
+ char *msg;
+ int msg_len;
+ char msg_type;
+
+ for (;;)
+ {
+ /*
+ * If we are in the process of shutting down, we
+ * may be rolling back, and the buffer may contain other messages.
+ * We want to avoid a procarray exception
+ * as well as an error stack overflow.
+ */
+ if (proc_exit_inprogress)
+ conn->state = DN_CONNECTION_STATE_ERROR_FATAL;
+
+ /*
+ * Don't read from from the connection if there is a fatal error.
+ * We still return RESPONSE_COMPLETE, not RESPONSE_ERROR, since
+ * Handling of RESPONSE_ERROR assumes sending SYNC message, but
+ * State DN_CONNECTION_STATE_ERROR_FATAL indicates connection is
+ * not usable.
+ */
+ if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
+ return RESPONSE_COMPLETE;
+
+ /* No data available, exit */
+ if (!HAS_MESSAGE_BUFFERED(conn))
+ return RESPONSE_EOF;
+
+ Assert(conn->combiner == combiner || conn->combiner == NULL);
+
+ /* TODO handle other possible responses */
+ /* Message types below follow the PostgreSQL frontend/backend protocol */
+ msg_type = get_message(conn, &msg_len, &msg);
+ switch (msg_type)
+ {
+ case '\0': /* Not enough data in the buffer */
+ return RESPONSE_EOF;
+ case 'c': /* CopyToCommandComplete */
+ HandleCopyOutComplete(combiner);
+ break;
+ case 'C': /* CommandComplete */
+ HandleCommandComplete(combiner, msg, msg_len, conn);
+ conn->combiner = NULL;
+ if (conn->state == DN_CONNECTION_STATE_QUERY)
+ conn->state = DN_CONNECTION_STATE_IDLE;
+ return RESPONSE_COMPLETE;
+ case 'T': /* RowDescription */
+#ifdef DN_CONNECTION_DEBUG
+ Assert(!conn->have_row_desc);
+ conn->have_row_desc = true;
+#endif
+ if (HandleRowDescription(combiner, msg, msg_len))
+ return RESPONSE_TUPDESC;
+ break;
+ case 'D': /* DataRow */
+#ifdef DN_CONNECTION_DEBUG
+ Assert(conn->have_row_desc);
+#endif
+ /* Do not return if data row has not been actually handled */
+ if (HandleDataRow(combiner, msg, msg_len, conn->nodeoid))
+ return RESPONSE_DATAROW;
+ break;
+ case 's': /* PortalSuspended */
+ /* No activity is expected on the connection until next query */
+ conn->state = DN_CONNECTION_STATE_IDLE;
+ conn->combiner = NULL;
+ return RESPONSE_SUSPENDED;
+ case '1': /* ParseComplete */
+ case '2': /* BindComplete */
+ case '3': /* CloseComplete */
+ case 'n': /* NoData */
+ /* simple notifications, continue reading */
+ break;
+ case 'G': /* CopyInResponse */
+ conn->state = DN_CONNECTION_STATE_COPY_IN;
+ HandleCopyIn(combiner);
+ /* Done, return to caller to let it know the data can be passed in */
+ return RESPONSE_COPY;
+ case 'H': /* CopyOutResponse */
+ conn->state = DN_CONNECTION_STATE_COPY_OUT;
+ HandleCopyOut(combiner);
+ return RESPONSE_COPY;
+ case 'd': /* CopyOutDataRow */
+ conn->state = DN_CONNECTION_STATE_COPY_OUT;
+ HandleCopyDataRow(combiner, msg, msg_len);
+ break;
+ case 'E': /* ErrorResponse */
+ HandleError(combiner, msg, msg_len);
+ add_error_message(conn, combiner->errorMessage);
+ return RESPONSE_ERROR;
+ case 'A': /* NotificationResponse */
+ case 'N': /* NoticeResponse */
+ case 'S': /* SetCommandComplete */
+ /*
+ * Ignore these to prevent multiple messages, one from each
+ * node. Coordinator will send one for DDL anyway
+ */
+ break;
+ case 'Z': /* ReadyForQuery */
+ {
+ /*
+ * Return result depends on previous connection state.
+ * If it was PORTAL_SUSPENDED Coordinator want to send down
+ * another EXECUTE to fetch more rows, otherwise it is done
+ * with the connection
+ */
+ conn->transaction_status = msg[0];
+ conn->state = DN_CONNECTION_STATE_IDLE;
+ conn->combiner = NULL;
+#ifdef DN_CONNECTION_DEBUG
+ conn->have_row_desc = false;
+#endif
+ return RESPONSE_READY;
+ }
+ case 'M': /* Command Id */
+ HandleDatanodeCommandId(combiner, msg, msg_len);
+ break;
+ case 'b':
+ conn->state = DN_CONNECTION_STATE_IDLE;
+ return RESPONSE_BARRIER_OK;
+ case 'I': /* EmptyQuery */
+ return RESPONSE_COMPLETE;
+ default:
+ /* sync lost? */
+ elog(WARNING, "Received unsupported message type: %c", msg_type);
+ conn->state = DN_CONNECTION_STATE_ERROR_FATAL;
+ /* stop reading */
+ return RESPONSE_COMPLETE;
+ }
+ }
+ /* never happen, but keep compiler quiet */
+ return RESPONSE_EOF;
+}
+#else
/*
* Read next message from the connection and update the combiner accordingly
* If we are in an error state we just consume the messages, and do not proxy
@@ -1241,7 +2335,6 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner)
char *msg;
int msg_len;
char msg_type;
- bool suspended = false;
for (;;)
{
@@ -1276,7 +2369,7 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner)
HandleCopyOutComplete(combiner);
break;
case 'C': /* CommandComplete */
- HandleCommandComplete(combiner, msg, msg_len, conn);
+ HandleCommandComplete(combiner, msg, msg_len);
break;
case 'T': /* RowDescription */
#ifdef DN_CONNECTION_DEBUG
@@ -1335,7 +2428,7 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner)
{
/*
* Return result depends on previous connection state.
- * If it was PORTAL_SUSPENDED Coordinator want to send down
+ * If it was PORTAL_SUSPENDED coordinator want to send down
* another EXECUTE to fetch more rows, otherwise it is done
* with the connection
*/
@@ -1352,8 +2445,10 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner)
HandleDatanodeCommandId(combiner, msg, msg_len);
break;
case 'b':
- conn->state = DN_CONNECTION_STATE_IDLE;
- return RESPONSE_BARRIER_OK;
+ {
+ conn->state = DN_CONNECTION_STATE_IDLE;
+ return RESPONSE_BARRIER_OK;
+ }
case 'I': /* EmptyQuery */
default:
/* sync lost? */
@@ -1366,10 +2461,11 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner)
/* never happen, but keep compiler quiet */
return RESPONSE_EOF;
}
+#endif
/*
- * Has the Datanode sent Ready For Query
+ * Has the data node sent Ready For Query
*/
bool
@@ -1378,6 +2474,7 @@ is_data_node_ready(PGXCNodeHandle * conn)
char *msg;
int msg_len;
char msg_type;
+ bool suspended = false;
for (;;)
{
@@ -1402,6 +2499,7 @@ is_data_node_ready(PGXCNodeHandle * conn)
switch (msg_type)
{
case 's': /* PortalSuspended */
+ suspended = true;
break;
case 'Z': /* ReadyForQuery */
@@ -1421,6 +2519,8 @@ is_data_node_ready(PGXCNodeHandle * conn)
return false;
}
+
+#ifndef XCP
/*
* Construct a BEGIN TRANSACTION command after taking into account the
* current options. The returned string is not palloced and is valid only until
@@ -1452,6 +2552,7 @@ generate_begin_command(void)
return begin_cmd;
}
+#endif
/*
* Send BEGIN command to the Datanodes or Coordinators and receive responses.
@@ -1464,12 +2565,20 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections,
{
int i;
struct timeval *timeout = NULL;
+#ifdef XCP
+ ResponseCombiner combiner;
+#else
RemoteQueryState *combiner;
+#endif
TimestampTz timestamp = GetCurrentGTMStartTimestamp();
PGXCNodeHandle *new_connections[conn_count];
int new_count = 0;
+#ifdef XCP
+ char *init_str;
+#else
int con[conn_count];
int j = 0;
+#endif
/*
* If no remote connections, we don't have anything to do
@@ -1479,6 +2588,12 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections,
for (i = 0; i < conn_count; i++)
{
+#ifdef XCP
+ if (!readOnly && !IsConnFromDatanode())
+ {
+ connections[i]->read_only = false;
+ }
+#else
/*
* If the node is already a participant in the transaction, skip it
*/
@@ -1493,7 +2608,7 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections,
RegisterTransactionNodes(1, (void **)&connections[i], true);
continue;
}
-
+#endif
/*
* PGXC TODO - A connection should not be in DN_CONNECTION_STATE_QUERY
* state when we are about to send a BEGIN TRANSACTION command to the
@@ -1511,13 +2626,26 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections,
if (GlobalTimestampIsValid(timestamp) && pgxc_node_send_timestamp(connections[i], timestamp))
return EOF;
+#ifdef XCP
+ if (IS_PGXC_DATANODE && GlobalTransactionIdIsValid(gxid))
+ need_tran_block = true;
+ /* Send BEGIN if not already in transaction */
+ if (need_tran_block && connections[i]->transaction_status == 'I')
+#else
/* Send BEGIN */
if (need_tran_block)
+#endif
{
/* Send the BEGIN TRANSACTION command and check for errors */
+#ifdef XCP
+ if (pgxc_node_send_query(connections[i], "BEGIN"))
+ return EOF;
+#else
if (pgxc_node_send_query(connections[i], generate_begin_command()))
return EOF;
+#endif
+#ifndef XCP
con[j++] = PGXCNodeGetNodeId(connections[i]->nodeoid, node_type);
/*
* Register the node as a participant in the transaction. The
@@ -1531,6 +2659,7 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections,
* read-write statement.
*/
RegisterTransactionNodes(1, (void **)&connections[i], !readOnly);
+#endif
new_connections[new_count++] = connections[i];
}
}
@@ -1542,6 +2671,31 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections,
if (new_count == 0)
return 0;
+#ifdef XCP
+ InitResponseCombiner(&combiner, new_count, COMBINE_TYPE_NONE);
+ /*
+ * Zero out only the embedded ScanState header (the leading part of the
+ * combiner), which InitResponseCombiner() above does not initialize.
+ * The combiner-specific fields just set lie past the ScanState prefix,
+ * so they are preserved. NOTE(review): this assumes ScanState is the
+ * first member of ResponseCombiner -- confirm.
+ */
+ memset(&combiner, 0, sizeof(ScanState));
+
+ /* Receive responses */
+ if (pgxc_node_receive_responses(new_count, new_connections, timeout, &combiner))
+ return EOF;
+
+ /* Verify status */
+ if (!ValidateAndCloseCombiner(&combiner))
+ return EOF;
+
+ /* after transactions are started send down local set commands */
+ init_str = PGXCNodeGetTransactionParamStr();
+ if (init_str)
+ {
+ for (i = 0; i < new_count; i++)
+ {
+ pgxc_node_set_query(new_connections[i], init_str);
+ }
+ }
+#else
combiner = CreateResponseCombiner(new_count, COMBINE_TYPE_NONE);
/* Receive responses */
@@ -1568,11 +2722,701 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle **connections,
if (res != 0)
return EOF;
}
+#endif
/* No problem, let's get going */
return 0;
}
+
+#ifdef XCP
+/*
+ * Execute session-reset commands (RESET ALL; RESET SESSION AUTHORIZATION;
+ * RESET transaction_isolation) on all allocated nodes to remove all session
+ * specific stuff before releasing them to pool for reuse by other sessions.
+ */
+static void
+pgxc_node_remote_cleanup_all(void)
+{
+ PGXCNodeAllHandles *handles = get_current_handles();
+ PGXCNodeHandle *new_connections[handles->co_conn_count + handles->dn_conn_count];
+ int new_conn_count = 0;
+ int i;
+ char *resetcmd = "RESET ALL;RESET SESSION AUTHORIZATION;"
+ "RESET transaction_isolation;";
+
+ /*
+ * We must handle reader and writer connections both since even a read-only
+ * needs to be cleaned up.
+ */
+ if (handles->co_conn_count + handles->dn_conn_count == 0)
+ return;
+
+ /*
+ * Send down the reset command to coordinators, then to datanodes.
+ * NOTE(review): the two loops below are identical except for the handle
+ * array they walk -- a candidate for a shared helper.
+ */
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *handle = handles->coord_handles[i];
+
+ /* At this point connection should be in IDLE state */
+ if (handle->state != DN_CONNECTION_STATE_IDLE)
+ {
+ /* Not idle: mark unusable so the pool discards it */
+ handle->state = DN_CONNECTION_STATE_ERROR_FATAL;
+ continue;
+ }
+
+ /*
+ * We must go ahead and release connections anyway, so do not throw
+ * an error if we have a problem here.
+ */
+ if (pgxc_node_send_query(handle, resetcmd))
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to clean up data nodes")));
+ handle->state = DN_CONNECTION_STATE_ERROR_FATAL;
+ continue;
+ }
+ new_connections[new_conn_count++] = handle;
+ }
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *handle = handles->datanode_handles[i];
+
+ /* At this point connection should be in IDLE state */
+ if (handle->state != DN_CONNECTION_STATE_IDLE)
+ {
+ /* Not idle: mark unusable so the pool discards it */
+ handle->state = DN_CONNECTION_STATE_ERROR_FATAL;
+ continue;
+ }
+
+ /*
+ * We must go ahead and release connections anyway, so do not throw
+ * an error if we have a problem here.
+ */
+ if (pgxc_node_send_query(handle, resetcmd))
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to clean up data nodes")));
+ handle->state = DN_CONNECTION_STATE_ERROR_FATAL;
+ continue;
+ }
+ new_connections[new_conn_count++] = handle;
+ }
+
+ /* Collect responses from every node the command was sent to */
+ if (new_conn_count)
+ {
+ ResponseCombiner combiner;
+ InitResponseCombiner(&combiner, new_conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ pgxc_node_receive_responses(new_conn_count, new_connections, NULL, &combiner);
+ CloseCombiner(&combiner);
+ }
+ /* NOTE(review): 'handles' from get_current_handles() is not freed here;
+ * presumably the caller or release_handles() owns it -- confirm. */
+}
+
+
+/*
+ * Prepare nodes which ran write operations during the transaction.
+ * Read only remote transactions are committed and connections are released
+ * back to the pool.
+ * Function returns the list of nodes where transaction is prepared, including
+ * local node, if requested, in format expected by the GTM server.
+ * If something went wrong the function tries to abort prepared transactions on
+ * the nodes where it succeeded and throws error. A warning is emitted if abort
+ * prepared fails.
+ * After completion remote connection handles are released.
+ */
+static char *
+pgxc_node_remote_prepare(char *prepareGID, bool localNode)
+{
+ bool isOK = true;
+ StringInfoData nodestr;
+ char prepare_cmd[256];
+ char abort_cmd[256];
+ GlobalTransactionId auxXid;
+ char *commit_cmd = "COMMIT TRANSACTION";
+ int i;
+ ResponseCombiner combiner;
+ PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
+ int conn_count = 0;
+ PGXCNodeAllHandles *handles = get_current_handles();
+
+ initStringInfo(&nodestr);
+ if (localNode)
+ appendStringInfoString(&nodestr, PGXCNodeName);
+
+ sprintf(prepare_cmd, "PREPARE TRANSACTION '%s'", prepareGID);
+
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /*
+ * If something went wrong already we have nothing to do here. The error
+ * will be reported at the end of the function, and we will rollback
+ * remotes as part of the error handling.
+ * Just skip to clean up section and check if we have already prepared
+ * somewhere, we should abort that prepared transaction.
+ */
+ if (!isOK)
+ goto prepare_err;
+
+ /*
+ * Skip empty slots
+ */
+ if (conn->sock == NO_SOCKET)
+ continue;
+ else if (conn->transaction_status == 'T')
+ {
+ /* Read in any pending input */
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ BufferConnection(conn);
+
+ if (conn->read_only)
+ {
+ /* Send down prepare command */
+ if (pgxc_node_send_query(conn, commit_cmd))
+ {
+ /*
+ * not a big deal, it was read only, the connection will be
+ * abandoned later.
+ */
+ ereport(LOG,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ else
+ {
+ /* Send down prepare command */
+ if (pgxc_node_send_query(conn, prepare_cmd))
+ {
+ /*
+ * That is the trouble, we really want to prepare it.
+ * Just emit warning so far and go to clean up.
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send PREPARE TRANSACTION command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ char *nodename = get_pgxc_nodename(conn->nodeoid);
+ if (nodestr.len > 0)
+ appendStringInfoChar(&nodestr, ',');
+ appendStringInfoString(&nodestr, nodename);
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ /*
+ * If it fails on remote node it would just return ROLLBACK.
+ * Set the flag for the message handler so the response is
+ * verified.
+ */
+ conn->ck_resp_rollback = true;
+ }
+ }
+ }
+ else if (conn->transaction_status == 'E')
+ {
+ /*
+ * Probably can not happen, if there was a error the engine would
+ * abort anyway, even in case of explicit PREPARE.
+ * Anyway, just in case...
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("remote node %u is in error state", conn->nodeoid)));
+ }
+ }
+
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ /*
+ * If something went wrong already we have nothing to do here. The error
+ * will be reported at the end of the function, and we will rollback
+ * remotes as part of the error handling.
+ * Just skip to clean up section and check if we have already prepared
+ * somewhere, we should abort that prepared transaction.
+ */
+ if (!isOK)
+ goto prepare_err;
+
+ /*
+ * Skip empty slots
+ */
+ if (conn->sock == NO_SOCKET)
+ continue;
+ else if (conn->transaction_status == 'T')
+ {
+ if (conn->read_only)
+ {
+ /* Send down prepare command */
+ if (pgxc_node_send_query(conn, commit_cmd))
+ {
+ /*
+ * not a big deal, it was read only, the connection will be
+ * abandoned later.
+ */
+ ereport(LOG,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ else
+ {
+ /* Send down prepare command */
+ if (pgxc_node_send_query(conn, prepare_cmd))
+ {
+ /*
+ * That is the trouble, we really want to prepare it.
+ * Just emit warning so far and go to clean up.
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send PREPARE TRANSACTION command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ char *nodename = get_pgxc_nodename(conn->nodeoid);
+ if (nodestr.len > 0)
+ appendStringInfoChar(&nodestr, ',');
+ appendStringInfoString(&nodestr, nodename);
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ /*
+ * If it fails on remote node it would just return ROLLBACK.
+ * Set the flag for the message handler so the response is
+ * verified.
+ */
+ conn->ck_resp_rollback = true;
+ }
+ }
+ }
+ else if (conn->transaction_status == 'E')
+ {
+ /*
+ * Probably can not happen, if there was a error the engine would
+ * abort anyway, even in case of explicit PREPARE.
+ * Anyway, just in case...
+ */
+ isOK = false;
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("remote node %u is in error state", conn->nodeoid)));
+ }
+ }
+
+ SetSendCommandId(false);
+
+ if (!isOK)
+ goto prepare_err;
+ /* exit if nothing has been prepared */
+ if (conn_count > 0)
+ {
+ int result;
+ /*
+ * Receive and check for any errors. In case of errors, we don't bail out
+ * just yet. We first go through the list of connections and look for
+ * errors on each connection. This is important to ensure that we run
+ * an appropriate ROLLBACK command later on (prepared transactions must be
+ * rolled back with ROLLBACK PREPARED commands).
+ *
+ * PGXCTODO - There doesn't seem to be a solid mechanism to track errors on
+ * individual connections. The transaction_status field doesn't get set
+ * every time there is an error on the connection. The combiner mechanism is
+ * good for parallel processing, but I think we should have a leak-proof
+ * mechanism to track connection status
+ */
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner);
+ if (result || !validate_combiner(&combiner))
+ goto prepare_err;
+ else
+ CloseCombiner(&combiner);
+
+ /* Before exit clean the flag, to avoid unnecessary checks */
+ for (i = 0; i < conn_count; i++)
+ connections[i]->ck_resp_rollback = false;
+
+ pfree_pgxc_all_handles(handles);
+ if (!temp_object_included && !PersistentConnections)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+ }
+
+ return nodestr.data;
+prepare_err:
+ sprintf(abort_cmd, "ROLLBACK PREPARED '%s'", prepareGID);
+
+ auxXid = GetAuxilliaryTransactionId();
+ conn_count = 0;
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /*
+ * PREPARE succeeded on that node, roll it back there
+ */
+ if (conn->ck_resp_rollback)
+ {
+ conn->ck_resp_rollback = false;
+ /* sanity checks */
+ Assert(conn->sock != NO_SOCKET);
+ Assert(conn->transaction_status == 'I');
+ Assert(conn->state == DN_CONNECTION_STATE_IDLE);
+ /* Send down abort prepared command */
+ if (pgxc_node_send_gxid(conn, auxXid))
+ {
+ /*
+ * Prepared transaction is left on the node, but we can not
+ * do anything with that except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send xid to "
+ "the node %u", conn->nodeoid)));
+ }
+ if (pgxc_node_send_query(conn, abort_cmd))
+ {
+ /*
+ * Prepared transaction is left on the node, but we can not
+ * do anything with that except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send ABORT PREPARED command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ if (conn->ck_resp_rollback)
+ {
+ conn->ck_resp_rollback = false;
+ /* sanity checks; must mirror the datanode loop above (== not =) */
+ Assert(conn->sock != NO_SOCKET);
+ Assert(conn->transaction_status == 'I');
+ Assert(conn->state == DN_CONNECTION_STATE_IDLE);
+ /* Send down abort prepared command */
+ if (pgxc_node_send_gxid(conn, auxXid))
+ {
+ /*
+ * Prepared transaction is left on the node, but we can not
+ * do anything with that except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send xid to "
+ "the node %u", conn->nodeoid)));
+ }
+ if (pgxc_node_send_query(conn, abort_cmd))
+ {
+ /*
+ * Prepared transaction is left on the node, but we can not
+ * do anything with that except warn the user.
+ */
+ ereport(WARNING,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send ABORT PREPARED command to "
+ "the node %u", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+ if (conn_count > 0)
+ {
+ /* Just read out responses, throw error from the first combiner */
+ ResponseCombiner combiner2;
+ InitResponseCombiner(&combiner2, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ pgxc_node_receive_responses(conn_count, connections, NULL, &combiner2);
+ CloseCombiner(&combiner2);
+ }
+ /*
+ * If the flag is set we are here because combiner carries error message
+ */
+ if (isOK)
+ pgxc_node_report_error(&combiner);
+ else
+ elog(ERROR, "failed to PREPARE transaction on one or more nodes");
+ return NULL;
+}
+
+
+/*
+ * Commit transactions on remote nodes.
+ * If barrier lock is set wait while it is released.
+ * Release remote connection after completion.
+ */
+static void
+pgxc_node_remote_commit(void)
+{
+ int result = 0;
+ char *commitCmd = "COMMIT TRANSACTION";
+ int i;
+ ResponseCombiner combiner;
+ /* Worst case: every datanode and every coordinator participated */
+ PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
+ int conn_count = 0;
+ PGXCNodeAllHandles *handles = get_current_handles();
+
+ /* No further command ids need to be sent down for this transaction */
+ SetSendCommandId(false);
+
+ /*
+ * Barrier:
+ *
+ * We should acquire the BarrierLock in SHARE mode here to ensure that
+ * there are no in-progress barrier at this point. This mechanism would
+ * work as long as LWLock mechanism does not starve a EXCLUSIVE lock
+ * requester
+ */
+ LWLockAcquire(BarrierLock, LW_SHARED);
+
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ /*
+ * We do not need to commit remote node if it is not in transaction.
+ * If transaction is in error state the commit command will cause
+ * rollback, that is OK
+ */
+ if (conn->transaction_status != 'I')
+ {
+ /* Read in any pending input */
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ BufferConnection(conn);
+
+ if (pgxc_node_send_query(conn, commitCmd))
+ {
+ /*
+ * Do not bother with clean up, just bomb out. The error handler
+ * will invoke RollbackTransaction which will do the work.
+ * NOTE: ereport(ERROR) here longjmps out while BarrierLock is
+ * still held; presumably error cleanup releases LWLocks — confirm.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to the node %u",
+ conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ /* Same treatment for the coordinators, minus the BufferConnection step */
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ /*
+ * We do not need to commit remote node if it is not in transaction.
+ * If transaction is in error state the commit command will cause
+ * rollback, that is OK
+ */
+ if (conn->transaction_status != 'I')
+ {
+ if (pgxc_node_send_query(conn, commitCmd))
+ {
+ /*
+ * Do not bother with clean up, just bomb out. The error handler
+ * will invoke RollbackTransaction which will do the work.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send COMMIT command to the node %u",
+ conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ /*
+ * Release the BarrierLock.
+ */
+ LWLockRelease(BarrierLock);
+
+ if (conn_count)
+ {
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner);
+ if (result || !validate_combiner(&combiner))
+ result = EOF;
+ else
+ CloseCombiner(&combiner);
+ }
+
+ /* Record the number of remote participants in the transaction stats */
+ stat_transaction(conn_count);
+
+ /*
+ * result can only be non-zero if conn_count was > 0, so the combiner
+ * referenced below has been initialized by the block above.
+ */
+ if (result)
+ {
+ if (combiner.errorMessage)
+ pgxc_node_report_error(&combiner);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to COMMIT the transaction on one or more nodes")));
+ }
+
+ /* Drop remote session state unless temp objects or persistence require it */
+ if (!temp_object_included && !PersistentConnections)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+}
+
+
+/*
+ * Rollback transactions on remote nodes.
+ * Release remote connection after completion.
+ */
+static void
+pgxc_node_remote_abort(void)
+{
+ int result = 0;
+ char *rollbackCmd = "ROLLBACK TRANSACTION";
+ int i;
+ ResponseCombiner combiner;
+ /* Worst case: every datanode and every coordinator participated */
+ PGXCNodeHandle *connections[MaxDataNodes + MaxCoords];
+ int conn_count = 0;
+ PGXCNodeAllHandles *handles = get_current_handles();
+
+ /* No further command ids need to be sent down for this transaction */
+ SetSendCommandId(false);
+
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ if (conn->transaction_status != 'I')
+ {
+ /* Read in any pending input */
+ if (conn->state != DN_CONNECTION_STATE_IDLE)
+ BufferConnection(conn);
+
+ /*
+ * No matter whether the remote transaction committed or failed,
+ * just send down ROLLBACK to finish it. Unlike the commit path,
+ * a send failure is only recorded on the handle — we are already
+ * aborting, so there is nothing better to do than keep going.
+ */
+ if (pgxc_node_send_query(conn, rollbackCmd))
+ {
+ add_error_message(conn,
+ "failed to send ROLLBACK TRANSACTION command");
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ /* Same treatment for the coordinators, minus the BufferConnection step */
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+
+ /* Skip empty slots */
+ if (conn->sock == NO_SOCKET)
+ continue;
+
+ if (conn->transaction_status != 'I')
+ {
+ /*
+ * No matter whether the remote transaction committed or failed,
+ * just send down ROLLBACK to finish it.
+ */
+ if (pgxc_node_send_query(conn, rollbackCmd))
+ {
+ add_error_message(conn,
+ "failed to send ROLLBACK TRANSACTION command");
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+ }
+
+ if (conn_count)
+ {
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ result = pgxc_node_receive_responses(conn_count, connections, NULL, &combiner);
+ if (result || !validate_combiner(&combiner))
+ result = EOF;
+ else
+ CloseCombiner(&combiner);
+ }
+
+ /* Record the number of remote participants in the transaction stats */
+ stat_transaction(conn_count);
+
+ /*
+ * result can only be non-zero if conn_count was > 0, so the combiner
+ * referenced below has been initialized. Failures here are reported at
+ * LOG level only: we are already rolling back and must not raise.
+ */
+ if (result)
+ {
+ if (combiner.errorMessage)
+ pgxc_node_report_error(&combiner);
+ else
+ ereport(LOG,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to ROLLBACK the transaction on one or more nodes")));
+ }
+}
+
+#else
+
/*
* Prepare all remote nodes involved in this transaction. The local node is
* handled separately and prepared first in xact.c. If there is any error
@@ -1922,6 +3766,8 @@ pgxc_node_remote_commit(void)
}
}
+ stat_transaction(write_conn_count + read_conn_count);
+
if (result)
{
if (combiner)
@@ -2079,11 +3925,133 @@ pgxc_node_remote_abort(void)
return;
}
+#endif
+
/*
* Begin COPY command
* The copy_connections array must have room for NumDataNodes items
*/
+#ifdef XCP
+/*
+ * Acquire datanode connections for a COPY and send the COPY query down.
+ * On failure this function returns with rcstate->locator reset to NULL —
+ * presumably the caller checks that field to detect the error; confirm.
+ */
+void
+DataNodeCopyBegin(RemoteCopyData *rcstate)
+{
+ int i;
+ List *nodelist = rcstate->rel_loc->nodeList;
+ PGXCNodeHandle **connections;
+ bool need_tran_block;
+ GlobalTransactionId gxid;
+ ResponseCombiner combiner;
+ Snapshot snapshot = GetActiveSnapshot();
+ int conn_count = list_length(nodelist);
+
+ /* Get needed datanode connections */
+ if (!rcstate->is_from && IsLocatorReplicated(rcstate->rel_loc->locatorType))
+ {
+ /*
+ * COPY TO from a replicated table: any single node has the full
+ * data, so read from just one handle.
+ */
+ connections = (PGXCNodeHandle **) palloc(sizeof(PGXCNodeHandle *));
+ connections[0] = get_any_handle(nodelist);
+ conn_count = 1;
+ }
+ else
+ {
+ PGXCNodeAllHandles *pgxc_handles;
+ pgxc_handles = get_handles(nodelist, NULL, false);
+ connections = pgxc_handles->datanode_handles;
+ Assert(pgxc_handles->dn_conn_count == conn_count);
+ /* Only the wrapper is freed; the handles array stays in use */
+ pfree(pgxc_handles);
+ }
+
+ /*
+ * If more than one nodes are involved or if we are already in a
+ * transaction block, we must run the remote statements in a transaction
+ * block
+ */
+ need_tran_block = (conn_count > 1) || (TransactionBlockStatusCode() == 'T');
+
+ elog(DEBUG1, "conn_count = %d, need_tran_block = %s", conn_count,
+ need_tran_block ? "true" : "false");
+
+ /* Gather statistics */
+ stat_statement();
+ stat_transaction(conn_count);
+
+ gxid = GetCurrentTransactionId();
+
+ /* Start transaction on connections where it is not started */
+ if (pgxc_node_begin(conn_count, connections, gxid, need_tran_block, false, PGXC_NODE_DATANODE))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Could not begin transaction on data nodes.")));
+ }
+
+ /*
+ * COPY TO do not use locator, it just takes connections from it, and
+ * we do not look up distribution data type in this case.
+ * So always use LOCATOR_TYPE_RROBIN to avoid errors because of not
+ * defined partType if real locator type is HASH or MODULO.
+ * Create locator before sending down query, because createLocator may
+ * fail and we leave with dirty connections.
+ * If we get an error now datanode connection will be clean and error
+ * handler will issue transaction abort.
+ */
+ rcstate->locator = createLocator(
+ rcstate->is_from ? rcstate->rel_loc->locatorType
+ : LOCATOR_TYPE_RROBIN,
+ rcstate->is_from ? RELATION_ACCESS_INSERT : RELATION_ACCESS_READ,
+ rcstate->dist_type,
+ LOCATOR_LIST_POINTER,
+ conn_count,
+ (void *) connections,
+ NULL,
+ false);
+
+ /* Send query to nodes */
+ for (i = 0; i < conn_count; i++)
+ {
+ CHECK_OWNERSHIP(connections[i], NULL);
+
+ if (snapshot && pgxc_node_send_snapshot(connections[i], snapshot))
+ {
+ /* Error path: record it, free resources, signal via NULL locator */
+ add_error_message(connections[i], "Can not send request");
+ pfree(connections);
+ freeLocator(rcstate->locator);
+ rcstate->locator = NULL;
+ return;
+ }
+ if (pgxc_node_send_query(connections[i], rcstate->query_buf.data) != 0)
+ {
+ add_error_message(connections[i], "Can not send request");
+ pfree(connections);
+ freeLocator(rcstate->locator);
+ rcstate->locator = NULL;
+ return;
+ }
+ }
+
+ /*
+ * We are expecting CopyIn response, but do not want to send it to client,
+ * caller should take care about this, because here we do not know if
+ * client runs console or file copy
+ */
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /*
+ * Make sure there are zeroes in unused fields.
+ * NOTE(review): this memset runs AFTER Init and zeroes only the first
+ * sizeof(ScanState) bytes — assumes ScanState is the leading member of
+ * ResponseCombiner and that Init set nothing inside that prefix; confirm.
+ */
+ memset(&combiner, 0, sizeof(ScanState));
+
+ /* Receive responses */
+ if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner)
+ || !ValidateAndCloseCombiner(&combiner))
+ {
+ DataNodeCopyFinish(conn_count, connections);
+ freeLocator(rcstate->locator);
+ rcstate->locator = NULL;
+ return;
+ }
+ pfree(connections);
+}
+#else
PGXCNodeHandle**
DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot)
{
@@ -2127,6 +4095,10 @@ DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot)
foreach(nodeitem, nodelist)
copy_connections[lfirst_int(nodeitem)] = connections[i++];
+ /* Gather statistics */
+ stat_statement();
+ stat_transaction(conn_count);
+
gxid = GetCurrentTransactionId();
if (!GlobalTransactionIdIsValid(gxid))
@@ -2185,10 +4157,91 @@ DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot)
pfree(connections);
return copy_connections;
}
+#endif
+
/*
* Send a data row to the specified nodes
*/
+#ifdef XCP
+/*
+ * Append one COPY data row to the outgoing buffer of every listed connection,
+ * flushing a buffer when it is close to full. Returns 0 on success, EOF on
+ * failure (the error text is recorded on the offending handle).
+ */
+int
+DataNodeCopyIn(char *data_row, int len, int conn_count, PGXCNodeHandle** copy_connections)
+{
+ /* CopyData payload: int32 length (self-inclusive) + data row + '\n' */
+ int msgLen = 4 + len + 1;
+ int nLen = htonl(msgLen);
+ int i;
+
+ for(i = 0; i < conn_count; i++)
+ {
+ PGXCNodeHandle *handle = copy_connections[i];
+ if (handle->state == DN_CONNECTION_STATE_COPY_IN)
+ {
+ /* precalculate to speed up access */
+ int bytes_needed = handle->outEnd + 1 + msgLen;
+
+ /* flush buffer if it is almost full */
+ if (bytes_needed > COPY_BUFFER_SIZE)
+ {
+ int to_send = handle->outEnd;
+
+ /* First look if data node has sent an error message */
+ int read_status = pgxc_node_read_data(handle, true);
+ if (read_status == EOF || read_status < 0)
+ {
+ add_error_message(handle, "failed to read data from data node");
+ return EOF;
+ }
+
+ /* If something arrived, let a throwaway combiner validate it */
+ if (handle->inStart < handle->inEnd)
+ {
+ ResponseCombiner combiner;
+ InitResponseCombiner(&combiner, 1, COMBINE_TYPE_NONE);
+ /*
+ * Make sure there are zeroes in unused fields.
+ * NOTE(review): runs after Init, zeroes only the leading
+ * sizeof(ScanState) bytes — confirm this is intentional.
+ */
+ memset(&combiner, 0, sizeof(ScanState));
+ handle_response(handle, &combiner);
+ if (!ValidateAndCloseCombiner(&combiner))
+ return EOF;
+ }
+
+ if (DN_CONNECTION_STATE_ERROR(handle))
+ return EOF;
+
+ /*
+ * Try to send down buffered data if we have
+ */
+ if (to_send && send_some(handle, to_send) < 0)
+ {
+ add_error_message(handle, "failed to send data to data node");
+ return EOF;
+ }
+ }
+
+ /* Grow the out buffer if needed; 1 extra byte is the 'd' tag */
+ if (ensure_out_buffer_capacity(bytes_needed, handle) != 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ /* Frame the row as a protocol CopyData ('d') message */
+ handle->outBuffer[handle->outEnd++] = 'd';
+ memcpy(handle->outBuffer + handle->outEnd, &nLen, 4);
+ handle->outEnd += 4;
+ memcpy(handle->outBuffer + handle->outEnd, data_row, len);
+ handle->outEnd += len;
+ handle->outBuffer[handle->outEnd++] = '\n';
+ }
+ else
+ {
+ add_error_message(handle, "Invalid data node connection");
+ return EOF;
+ }
+ }
+ return 0;
+}
+#else
int
DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections)
{
@@ -2340,7 +4393,81 @@ DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle**
}
return 0;
}
+#endif
+
+
+#ifdef XCP
+/*
+ * Drain COPY TO output from the given connections into copy_file (or stdout
+ * when copy_file is NULL) and return the combined number of rows processed.
+ * Raises ERROR on any receive/validation failure.
+ */
+uint64
+DataNodeCopyOut(PGXCNodeHandle** copy_connections,
+ int conn_count, FILE* copy_file)
+{
+ ResponseCombiner combiner;
+ uint64 processed;
+ bool error;
+
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_SUM);
+ /*
+ * Make sure there are zeroes in unused fields.
+ * NOTE(review): runs after Init, zeroes only the leading
+ * sizeof(ScanState) bytes — confirm this is intentional.
+ */
+ memset(&combiner, 0, sizeof(ScanState));
+ combiner.processed = 0;
+ /* If there is an existing file where to copy data, pass it to combiner */
+ if (copy_file)
+ {
+ combiner.copy_file = copy_file;
+ combiner.remoteCopyType = REMOTE_COPY_FILE;
+ }
+ else
+ {
+ combiner.copy_file = NULL;
+ combiner.remoteCopyType = REMOTE_COPY_STDOUT;
+ }
+ error = (pgxc_node_receive_responses(conn_count, copy_connections, NULL, &combiner) != 0);
+
+ /* Capture the row count before the combiner is validated/closed */
+ processed = combiner.processed;
+
+ if (!ValidateAndCloseCombiner(&combiner) || error)
+ {
+ /*
+ * NOTE(review): combiner.request_type is read here after the combiner
+ * has been closed — verify the field is still valid at this point.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the data nodes when combining, request type %d", combiner.request_type)));
+ }
+
+ return processed;
+}
+
+
+/*
+ * Like DataNodeCopyOut, but directs the COPY TO output into the supplied
+ * tuplestore instead of a file/stdout. Returns the combined row count;
+ * raises ERROR on any receive/validation failure.
+ */
+uint64
+DataNodeCopyStore(PGXCNodeHandle** copy_connections,
+ int conn_count, Tuplestorestate* store)
+{
+ ResponseCombiner combiner;
+ uint64 processed;
+ bool error;
+
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_SUM);
+ /*
+ * Make sure there are zeroes in unused fields.
+ * NOTE(review): runs after Init, zeroes only the leading
+ * sizeof(ScanState) bytes — confirm this is intentional.
+ */
+ memset(&combiner, 0, sizeof(ScanState));
+ combiner.processed = 0;
+ combiner.remoteCopyType = REMOTE_COPY_TUPLESTORE;
+ combiner.tuplestorestate = store;
+
+ error = (pgxc_node_receive_responses(conn_count, copy_connections, NULL, &combiner) != 0);
+ /* Capture the row count before the combiner is validated/closed */
+ processed = combiner.processed;
+
+ if (!ValidateAndCloseCombiner(&combiner) || error)
+ {
+ /*
+ * NOTE(review): combiner.request_type is read here after the combiner
+ * has been closed — verify the field is still valid at this point.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("Unexpected response from the data nodes when combining, request type %d", combiner.request_type)));
+ }
+
+ return processed;
+}
+#else
uint64
DataNodeCopyOut(ExecNodes *exec_nodes,
PGXCNodeHandle** copy_connections,
@@ -2416,16 +4543,28 @@ DataNodeCopyOut(ExecNodes *exec_nodes,
return processed;
}
+#endif
+
/*
* Finish copy process on all connections
*/
+#ifdef XCP
+void
+DataNodeCopyFinish(int conn_count, PGXCNodeHandle** connections)
+#else
void
DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_dn_index, CombineType combine_type)
+#endif
{
int i;
+#ifdef XCP
+ ResponseCombiner combiner;
+#else
RemoteQueryState *combiner = NULL;
+#endif
bool error = false;
+#ifndef XCP
struct timeval *timeout = NULL; /* wait forever */
PGXCNodeHandle *connections[NumDataNodes];
PGXCNodeHandle *primary_handle = NULL;
@@ -2453,6 +4592,7 @@ DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_dn_index, Comb
combiner = CreateResponseCombiner(conn_count + 1, combine_type);
error = (pgxc_node_receive_responses(1, &primary_handle, timeout, combiner) != 0) || error;
}
+#endif
for (i = 0; i < conn_count; i++)
{
@@ -2463,11 +4603,22 @@ DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_dn_index, Comb
error = DataNodeCopyEnd(handle, false);
}
+#ifdef XCP
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /*
+ * Make sure there are zeroes in unused fields
+ */
+ memset(&combiner, 0, sizeof(ScanState));
+ error = (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) != 0) || error;
+
+ if (!ValidateAndCloseCombiner(&combiner) || error)
+#else
if (!combiner)
combiner = CreateResponseCombiner(conn_count, combine_type);
error = (pgxc_node_receive_responses(conn_count, connections, timeout, combiner) != 0) || error;
if (!ValidateAndCloseCombiner(combiner) || error)
+#endif
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg("Error while running COPY")));
@@ -2503,6 +4654,8 @@ DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error)
return false;
}
+
+#ifndef XCP
RemoteQueryState *
ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags)
{
@@ -2555,7 +4708,12 @@ ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags)
* If there are parameters supplied, get them into a form to be sent to the
* Datanodes with bind message. We should not have had done this before.
*/
- SetDataRowForExtParams(estate->es_param_list_info, remotestate);
+ if (estate->es_param_list_info)
+ {
+ Assert(!remotestate->paramval_data);
+ remotestate->paramval_len = ParamListToDataRow(estate->es_param_list_info,
+ &remotestate->paramval_data);
+ }
/*
* Initialize result tuple type and projection info.
@@ -2571,6 +4729,8 @@ ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags)
return remotestate;
}
+#endif
+
/*
* Get Node connections depending on the connection type:
@@ -2596,8 +4756,14 @@ get_exec_connections(RemoteQueryState *planstate,
if (exec_type == EXEC_ON_COORDS)
is_query_coord_only = true;
+#ifdef XCP
+ if (exec_type == EXEC_ON_CURRENT)
+ return get_current_handles();
+#endif
+
if (exec_nodes)
{
+#ifndef XCP
if (exec_nodes->en_expr)
{
/* execution time determining of target Datanodes */
@@ -2644,10 +4810,7 @@ get_exec_connections(RemoteQueryState *planstate,
else if (nodes)
{
if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES)
- {
nodelist = exec_nodes->nodeList;
- primarynode = exec_nodes->primarynodelist;
- }
}
if (nodes)
@@ -2655,6 +4818,7 @@ get_exec_connections(RemoteQueryState *planstate,
FreeRelationLocInfo(rel_loc_info);
}
else
+#endif
{
if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES)
nodelist = exec_nodes->nodeList;
@@ -2734,22 +4898,31 @@ get_exec_connections(RemoteQueryState *planstate,
return pgxc_handles;
}
+
static bool
pgxc_start_command_on_connection(PGXCNodeHandle *connection,
RemoteQueryState *remotestate,
Snapshot snapshot)
{
CommandId cid;
+#ifdef XCP
+ ResponseCombiner *combiner = (ResponseCombiner *) remotestate;
+ RemoteQuery *step = (RemoteQuery *) combiner->ss.ps.plan;
+ CHECK_OWNERSHIP(connection, combiner);
+#else
RemoteQuery *step = (RemoteQuery *) remotestate->ss.ps.plan;
if (connection->state == DN_CONNECTION_STATE_QUERY)
BufferConnection(connection);
+#endif
/*
* Scan descriptor would be valid and would contain a valid snapshot
* in cases when we need to send out of order command id to data node
* e.g. in case of a fetch
*/
-
+#ifdef XCP
+ cid = GetCurrentCommandId(false);
+#else
if (remotestate->cursor != NULL &&
remotestate->cursor[0] != '\0' &&
remotestate->ss.ss_currentScanDesc != NULL &&
@@ -2770,29 +4943,26 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection,
else
cid = GetCurrentCommandId(false);
}
+#endif
if (pgxc_node_send_cmd_id(connection, cid) < 0 )
return false;
if (snapshot && pgxc_node_send_snapshot(connection, snapshot))
return false;
- if (step->statement || step->cursor || remotestate->rqs_num_params)
+ if (step->statement || step->cursor || step->remote_param_types)
{
/* need to use Extended Query Protocol */
int fetch = 0;
bool prepared = false;
- bool send_desc = false;
-
- if (step->base_tlist != NULL ||
- step->exec_nodes->accesstype == RELATION_ACCESS_READ ||
- step->has_row_marks)
- send_desc = true;
+#ifndef XCP
/* if prepared statement is referenced see if it is already exist */
if (step->statement)
prepared = ActivateDatanodeStatementOnNode(step->statement,
PGXCNodeGetNodeId(connection->nodeoid,
PGXC_NODE_DATANODE));
+#endif
/*
* execute and fetch rows only if they will be consumed
* immediately by the sorter
@@ -2800,15 +4970,19 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection,
if (step->cursor)
fetch = 1;
+#ifdef XCP
+ combiner->extended_query = true;
+#endif
+
if (pgxc_node_send_query_extended(connection,
prepared ? NULL : step->sql_statement,
step->statement,
step->cursor,
- remotestate->rqs_num_params,
- remotestate->rqs_param_types,
+ step->remote_num_params,
+ step->remote_param_types,
remotestate->paramval_len,
remotestate->paramval_data,
- send_desc,
+ step->has_row_marks ? true : step->read_only,
fetch) != 0)
return false;
}
@@ -2821,31 +4995,8 @@ pgxc_start_command_on_connection(PGXCNodeHandle *connection,
}
-/*
- * IsReturningDMLOnReplicatedTable
- *
- * This function returns true if the passed RemoteQuery
- * 1. Operates on a table that is replicated
- * 2. Represents a DML
- * 3. Has a RETURNING clause in it
- *
- * If the passed RemoteQuery has a non null base_tlist
- * means that DML has a RETURNING clause.
- */
-
-static bool
-IsReturningDMLOnReplicatedTable(RemoteQuery *rq)
-{
- if (IsExecNodesReplicated(rq->exec_nodes) &&
- rq->base_tlist != NULL && /* Means DML has RETURNING */
- (rq->exec_nodes->accesstype == RELATION_ACCESS_UPDATE ||
- rq->exec_nodes->accesstype == RELATION_ACCESS_INSERT))
- return true;
-
- return false;
-}
-
-void
+#ifndef XCP
+static void
do_query(RemoteQueryState *node)
{
RemoteQuery *step = (RemoteQuery *) node->ss.ps.plan;
@@ -2857,18 +5008,12 @@ do_query(RemoteQueryState *node)
PGXCNodeHandle **connections = NULL;
PGXCNodeHandle *primaryconnection = NULL;
int i;
- int regular_conn_count = 0;
+ int regular_conn_count;
+ int total_conn_count;
bool need_tran_block;
PGXCNodeAllHandles *pgxc_connections;
/*
- * A Postgres-XC node cannot run transactions while in recovery as
- * this operation needs transaction IDs. This is more a safety guard than anything else.
- */
- if (RecoveryInProgress())
- elog(ERROR, "cannot run transaction to remote nodes during recovery");
-
- /*
* Remember if the remote query is accessing a temp object
*
* !! PGXC TODO Check if the is_temp flag is propogated correctly when a
@@ -2878,22 +5023,6 @@ do_query(RemoteQueryState *node)
ExecSetTempObjectIncluded();
/*
- * Consider a test case
- *
- * create table rf(a int, b int) distributed by replication;
- * insert into rf values(1,2),(3,4) returning ctid;
- *
- * While inserting the first row do_query works fine, receives the returned
- * row from the first connection and returns it. In this iteration the other
- * datanodes also had returned rows but they have not yet been read from the
- * network buffers. On Next Iteration do_query does not enter the data
- * receiving loop because it finds that node->connections is not null.
- * It is therefore required to set node->connections to null here.
- */
- if (node->conn_count == 0)
- node->connections = NULL;
-
- /*
* Get connections for Datanodes only, utilities and DDLs
* are launched in ExecRemoteUtility
*/
@@ -2902,17 +5031,19 @@ do_query(RemoteQueryState *node)
if (step->exec_type == EXEC_ON_DATANODES)
{
connections = pgxc_connections->datanode_handles;
- regular_conn_count = pgxc_connections->dn_conn_count;
+ total_conn_count = regular_conn_count = pgxc_connections->dn_conn_count;
}
else if (step->exec_type == EXEC_ON_COORDS)
{
connections = pgxc_connections->coord_handles;
- regular_conn_count = pgxc_connections->co_conn_count;
+ total_conn_count = regular_conn_count = pgxc_connections->co_conn_count;
}
primaryconnection = pgxc_connections->primary_handle;
- /* Primary connection is counted separately */
+ /*
+ * Primary connection is counted separately but is included in total_conn_count if used.
+ */
if (primaryconnection)
regular_conn_count--;
@@ -2951,6 +5082,9 @@ do_query(RemoteQueryState *node)
"need_tran_block = %s", primaryconnection ? "true" : "false",
regular_conn_count, need_tran_block ? "true" : "false");
+ stat_statement();
+ stat_transaction(total_conn_count);
+
gxid = GetCurrentTransactionId();
if (!GlobalTransactionIdIsValid(gxid))
@@ -2992,30 +5126,6 @@ do_query(RemoteQueryState *node)
res = handle_response(primaryconnection, node);
if (res == RESPONSE_COMPLETE)
break;
- else if (res == RESPONSE_TUPDESC)
- {
- ExecSetSlotDescriptor(scanslot, node->tuple_desc);
- /*
- * Now tuple table slot is responsible for freeing the
- * descriptor
- */
- node->tuple_desc = NULL;
- /*
- * RemoteQuery node doesn't support backward scan, so
- * randomAccess is false, neither we want this tuple store
- * persist across transactions.
- */
- node->tuplestorestate = tuplestore_begin_heap(false, false, work_mem);
- tuplestore_set_eflags(node->tuplestorestate, node->eflags);
- }
- else if (res == RESPONSE_DATAROW)
- {
- pfree(node->currentRow.msg);
- node->currentRow.msg = NULL;
- node->currentRow.msglen = 0;
- node->currentRow.msgnode = 0;
- continue;
- }
else if (res == RESPONSE_EOF)
continue;
else
@@ -3106,15 +5216,33 @@ do_query(RemoteQueryState *node)
* descriptor
*/
node->tuple_desc = NULL;
- if (node->rqs_for_sort)
+ if (step->sort)
{
+ SimpleSort *sort = step->sort;
+
+ node->connections = connections;
+ node->conn_count = regular_conn_count;
/*
* First message is already in the buffer
- * Further fetch will be under the control of Sort plan
- * above. So, don't wait till first row is fetched.
+ * Further fetch will be under tuplesort control
+ * If query does not produce rows tuplesort will not
+ * be initialized
+ */
+ node->tuplesortstate = tuplesort_begin_merge(
+ scanslot->tts_tupleDescriptor,
+ sort->numCols,
+ sort->sortColIdx,
+ sort->sortOperators,
+ sort->sortCollations,
+ sort->nullsFirst,
+ node,
+ work_mem);
+ /*
+ * Break the loop, do not wait for first row.
+ * Tuplesort module want to control node it is
+ * fetching rows from, while in this loop first
+ * row would be got from random node
*/
- node->connections = connections;
- node->conn_count = regular_conn_count;
break;
}
else
@@ -3145,15 +5273,6 @@ do_query(RemoteQueryState *node)
}
/* report error if any */
pgxc_node_report_error(node);
-
- if (node->rqs_for_sort)
- {
- /*
- * Break the loop, do not wait for first row. See comment above for
- * rqs_for_sort.
- */
- break;
- }
}
if (node->cursor_count)
@@ -3223,6 +5342,13 @@ RemoteQueryNext(ScanState *scan_node)
node->update_cursor = NULL;
pfree_pgxc_all_handles(all_dn_handles);
}
+
+ /* We can't have both tuplesortstate and tuplestorestate */
+ Assert(!(node->tuplesortstate && node->tuplestorestate));
+
+ if (node->tuplesortstate)
+ tuplesort_gettupleslot((Tuplesortstate *) node->tuplesortstate,
+ true, scanslot);
else if(node->tuplestorestate)
{
/*
@@ -3242,50 +5368,7 @@ RemoteQueryNext(ScanState *scan_node)
eof_tuplestore = true;
}
- /*
- * Consider a test case
- *
- * create table ta1 (v1 int, v2 int);
- * insert into ta1 values(1,2),(2,3),(3,4);
- *
- * create table ta2 (v1 int, v2 int);
- * insert into ta2 values(1,2),(2,3),(3,4);
- *
- * select t1.ctid, t2.ctid,* from ta1 t1, ta2 t2
- * where t2.v2<=3 order by t1.v1;
- * ctid | ctid | v1 | v2 | v1 | v2
- * -------+-------+----+----+----+----
- * Row_1 (0,1) | (0,1) | 1 | 2 | 1 | 2
- * Row_2 (0,1) | (0,2) | 1 | 2 | 2 | 3
- * Row_3 (0,2) | (0,1) | 2 | 3 | 1 | 2
- * Row_4 (0,2) | (0,2) | 2 | 3 | 2 | 3
- * Row_5 (0,1) | (0,1) | 3 | 4 | 1 | 2
- * Row_6 (0,1) | (0,2) | 3 | 4 | 2 | 3
- * (6 rows)
- *
- * Note that in the resulting join, we are getting one row of ta1 twice,
- * as shown by the ctid's in the results. Now consider this update
- *
- * update ta1 t1 set v2=t1.v2+10 from ta2 t2
- * where t2.v2<=3 returning t1.ctid,t1.v1 t1_v1, t1.v2 t1_v2;
- *
- * The first iteration of the update runs for Row_1, succeeds and
- * updates its ctid to say (0,3). In the second iteration for Row_2,
- * since the ctid of the row has already changed, fails to update any
- * row and hence do_query does not return any tuple. The FetchTuple
- * call in RemoteQueryNext hence fails and eof_underlying is set to true.
- * However in the third iteration for Row_3, the update succeeds and
- * returns a row, but since the eof_underlying is already set to true,
- * the RemoteQueryNext does not bother calling FetchTuple, we therefore
- * do not get more than one row returned as a result of the update
- * returning query. It is therefore required in RemoteQueryNext to call
- * FetchTuple in case do_query has copied a row in node->currentRow.msg.
- * Also we have to reset the eof_underlying flag every time
- * FetchTuple succeeds to clear any previously set status.
- */
- if (eof_tuplestore &&
- (!node->eof_underlying ||
- (node->currentRow.msg != NULL)))
+ if (eof_tuplestore && !node->eof_underlying)
{
/*
* If tuplestore has reached its end but the underlying RemoteQueryNext() hasn't
@@ -3293,15 +5376,13 @@ RemoteQueryNext(ScanState *scan_node)
*/
if (FetchTuple(node, scanslot))
{
- /* See comments a couple of lines above */
- node->eof_underlying = false;
- /*
- * Append a copy of the returned tuple to tuplestore. NOTE: because
- * the tuplestore is certainly in EOF state, its read position will
- * move forward over the added tuple. This is what we want.
- */
- if (tuplestorestate && !TupIsNull(scanslot))
- tuplestore_puttupleslot(tuplestorestate, scanslot);
+ /*
+ * Append a copy of the returned tuple to tuplestore. NOTE: because
+ * the tuplestore is certainly in EOF state, its read position will
+ * move forward over the added tuple. This is what we want.
+ */
+ if (tuplestorestate && !TupIsNull(scanslot))
+ tuplestore_puttupleslot(tuplestorestate, scanslot);
}
else
node->eof_underlying = true;
@@ -3310,8 +5391,7 @@ RemoteQueryNext(ScanState *scan_node)
if (eof_tuplestore && node->eof_underlying)
ExecClearTuple(scanslot);
}
- else if (node->rqs_for_sort)
- getrow_for_tapesort(node, scanslot);
+ /* No tuple store whatsoever, no result from the datanode */
else
ExecClearTuple(scanslot);
@@ -3368,9 +5448,13 @@ ExecEndRemoteQuery(RemoteQueryState *node)
if (res == RESPONSE_EOF)
{
struct timeval timeout;
+#ifdef XCP
+ timeout.tv_sec = END_QUERY_TIMEOUT / 1000;
+ timeout.tv_usec = (END_QUERY_TIMEOUT % 1000) * 1000;
+#else
timeout.tv_sec = END_QUERY_TIMEOUT;
timeout.tv_usec = 0;
-
+#endif
if (pgxc_node_receive(1, &conn, &timeout))
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
@@ -3378,9 +5462,15 @@ ExecEndRemoteQuery(RemoteQueryState *node)
}
}
- if (node->tuplestorestate != NULL)
+ if (node->tuplesortstate != NULL || node->tuplestorestate != NULL)
ExecClearTuple(node->ss.ss_ScanTupleSlot);
/*
+ * Release tuplesort resources
+ */
+ if (node->tuplesortstate != NULL)
+ tuplesort_end((Tuplesortstate *) node->tuplesortstate);
+ node->tuplesortstate = NULL;
+ /*
* Release tuplestore resources
*/
if (node->tuplestorestate != NULL)
@@ -3441,18 +5531,12 @@ ExecEndRemoteQuery(RemoteQueryState *node)
node->paramval_len = 0;
}
- /* Free the param types if they are newly allocated */
- if (node->rqs_param_types &&
- node->rqs_param_types != ((RemoteQuery*)node->ss.ps.plan)->rq_param_types)
- {
- pfree(node->rqs_param_types);
- node->rqs_param_types = NULL;
- node->rqs_num_params = 0;
- }
-
if (node->ss.ss_currentRelation)
ExecCloseScanRelation(node->ss.ss_currentRelation);
+ if (node->tmp_ctx)
+ MemoryContextDelete(node->tmp_ctx);
+
CloseCombiner(node);
}
@@ -3506,52 +5590,35 @@ close_node_cursors(PGXCNodeHandle **connections, int conn_count, char *cursor)
ValidateAndCloseCombiner(combiner);
}
+#endif
/*
* Encode parameter values to format of DataRow message (the same format is
* used in Bind) to prepare for sending down to Datanodes.
- * The data row is copied to RemoteQueryState.paramval_data.
+ * The buffer to store encoded value is palloc'ed and returned as the result
+ * parameter. The function returns the size of the result.
*/
-void
-SetDataRowForExtParams(ParamListInfo paraminfo, RemoteQueryState *rq_state)
+int
+ParamListToDataRow(ParamListInfo params, char** result)
{
StringInfoData buf;
uint16 n16;
int i;
int real_num_params = 0;
- RemoteQuery *node = (RemoteQuery*) rq_state->ss.ps.plan;
-
- /* If there are no parameters, there is no data to BIND. */
- if (!paraminfo)
- return;
-
- /*
- * If this query has been generated internally as a part of two-step DML
- * statement, it uses only the internal parameters for input values taken
- * from the source data, and it never uses external parameters. So even if
- * parameters were being set externally, they won't be present in this
- * statement (they might be present in the source data query). In such
- * case where parameters refer to the values returned by SELECT query, the
- * parameter data and parameter types would be set in SetDataRowForIntParams().
- */
- if (node->rq_params_internal)
- return;
-
- Assert(!rq_state->paramval_data);
/*
* It is necessary to fetch parameters
* before looking at the output value.
*/
- for (i = 0; i < paraminfo->numParams; i++)
+ for (i = 0; i < params->numParams; i++)
{
ParamExternData *param;
- param = &paraminfo->params[i];
+ param = &params->params[i];
- if (!OidIsValid(param->ptype) && paraminfo->paramFetch != NULL)
- (*paraminfo->paramFetch) (paraminfo, i + 1);
+ if (!OidIsValid(param->ptype) && params->paramFetch != NULL)
+ (*params->paramFetch) (params, i + 1);
/*
* This is the last parameter found as useful, so we need
@@ -3570,9 +5637,8 @@ SetDataRowForExtParams(ParamListInfo paraminfo, RemoteQueryState *rq_state)
*/
if (real_num_params == 0)
{
- rq_state->paramval_data = NULL;
- rq_state->paramval_len = 0;
- return;
+ *result = NULL;
+ return 0;
}
initStringInfo(&buf);
@@ -3584,7 +5650,7 @@ SetDataRowForExtParams(ParamListInfo paraminfo, RemoteQueryState *rq_state)
/* Parameter values */
for (i = 0; i < real_num_params; i++)
{
- ParamExternData *param = &paraminfo->params[i];
+ ParamExternData *param = &params->params[i];
uint32 n32;
/*
@@ -3627,38 +5693,15 @@ SetDataRowForExtParams(ParamListInfo paraminfo, RemoteQueryState *rq_state)
}
}
-
- /*
- * If parameter types are not already set, infer them from
- * the paraminfo.
- */
- if (node->rq_num_params > 0)
- {
- /*
- * Use the already known param types for BIND. Parameter types
- * can be already known when the same plan is executed multiple
- * times.
- */
- if (node->rq_num_params != real_num_params)
- elog(ERROR, "Number of user-supplied parameters do not match "
- "the number of remote parameters");
- rq_state->rqs_num_params = node->rq_num_params;
- rq_state->rqs_param_types = node->rq_param_types;
- }
- else
- {
- rq_state->rqs_num_params = real_num_params;
- rq_state->rqs_param_types = (Oid *) palloc(sizeof(Oid) * real_num_params);
- for (i = 0; i < real_num_params; i++)
- rq_state->rqs_param_types[i] = paraminfo->params[i].ptype;
- }
-
- /* Assign the newly allocated data row to paramval */
- rq_state->paramval_data = buf.data;
- rq_state->paramval_len = buf.len;
+ /* Take data from the buffer */
+ *result = palloc(buf.len);
+ memcpy(*result, buf.data, buf.len);
+ pfree(buf.data);
+ return buf.len;
}
+#ifndef XCP
/* ----------------------------------------------------------------
* ExecRemoteQueryReScan
*
@@ -3673,11 +5716,23 @@ ExecRemoteQueryReScan(RemoteQueryState *node, ExprContext *exprCtxt)
*/
ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
- if (!node->tuplestorestate)
- return;
+ if (((RemoteQuery *) node->ss.ps.plan)->sort)
+ {
+ if (!node->tuplesortstate)
+ return;
+
+ tuplesort_rescan(node->tuplesortstate);
+ }
+ else
+ {
+ if (!node->tuplestorestate)
+ return;
+
+ tuplestore_rescan(node->tuplestorestate);
+ }
- tuplestore_rescan(node->tuplestorestate);
}
+#endif
/*
@@ -3695,10 +5750,17 @@ void
ExecRemoteUtility(RemoteQuery *node)
{
RemoteQueryState *remotestate;
+#ifdef XCP
+ ResponseCombiner *combiner;
+#endif
bool force_autocommit = node->force_autocommit;
RemoteQueryExecType exec_type = node->exec_type;
GlobalTransactionId gxid = InvalidGlobalTransactionId;
+#ifdef XCP
+ Snapshot snapshot = NULL;
+#else
Snapshot snapshot = GetActiveSnapshot();
+#endif
PGXCNodeAllHandles *pgxc_connections;
int co_conn_count;
int dn_conn_count;
@@ -3709,6 +5771,11 @@ ExecRemoteUtility(RemoteQuery *node)
if (!force_autocommit)
RegisterTransactionLocalNode(true);
+#ifdef XCP
+ remotestate = makeNode(RemoteQueryState);
+ combiner = (ResponseCombiner *)remotestate;
+ InitResponseCombiner(combiner, 0, node->combine_type);
+#else
/*
* It is possible to invoke create table with inheritance on
* temporary objects. Remember that we might have accessed a temp object
@@ -3717,11 +5784,17 @@ ExecRemoteUtility(RemoteQuery *node)
ExecSetTempObjectIncluded();
remotestate = CreateResponseCombiner(0, node->combine_type);
+#endif
pgxc_connections = get_exec_connections(NULL, node->exec_nodes, exec_type);
dn_conn_count = pgxc_connections->dn_conn_count;
co_conn_count = pgxc_connections->co_conn_count;
+#ifdef XCP
+ /* exit right away if no nodes to run command on */
+ if (dn_conn_count == 0 && co_conn_count == 0)
+ return;
+#endif
if (force_autocommit)
need_tran_block = false;
@@ -3741,12 +5814,18 @@ ExecRemoteUtility(RemoteQuery *node)
}
gxid = GetCurrentTransactionId();
+#ifdef XCP
+ if (ActiveSnapshotSet())
+ snapshot = GetActiveSnapshot();
+#endif
if (!GlobalTransactionIdIsValid(gxid))
ereport(ERROR,
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg("Failed to get next transaction ID")));
+#ifndef XCP
if (exec_type == EXEC_ON_ALL_NODES || exec_type == EXEC_ON_DATANODES)
+#endif
{
if (pgxc_node_begin(dn_conn_count, pgxc_connections->datanode_handles,
gxid, need_tran_block, false, PGXC_NODE_DATANODE))
@@ -3774,7 +5853,9 @@ ExecRemoteUtility(RemoteQuery *node)
}
}
+#ifndef XCP
if (exec_type == EXEC_ON_ALL_NODES || exec_type == EXEC_ON_COORDS)
+#endif
{
if (pgxc_node_begin(co_conn_count, pgxc_connections->coord_handles,
gxid, need_tran_block, false, PGXC_NODE_COORDINATOR))
@@ -3803,8 +5884,10 @@ ExecRemoteUtility(RemoteQuery *node)
* Stop if all commands are completed or we got a data row and
* initialized state node for subsequent invocations
*/
+#ifndef XCP
if (exec_type == EXEC_ON_ALL_NODES ||
exec_type == EXEC_ON_DATANODES)
+#endif
{
while (dn_conn_count > 0)
{
@@ -3822,12 +5905,26 @@ ExecRemoteUtility(RemoteQuery *node)
while (i < dn_conn_count)
{
PGXCNodeHandle *conn = pgxc_connections->datanode_handles[i];
+#ifdef XCP
+ int res = handle_response(conn, combiner);
+#else
int res = handle_response(conn, remotestate);
+#endif
if (res == RESPONSE_EOF)
{
i++;
}
else if (res == RESPONSE_COMPLETE)
+#ifdef XCP
+ {
+ /* Ignore, wait for ReadyForQuery */
+ }
+ else if (res == RESPONSE_ERROR)
+ {
+ /* Ignore, wait for ReadyForQuery */
+ }
+ else if (res == RESPONSE_READY)
+#endif
{
if (i < --dn_conn_count)
pgxc_connections->datanode_handles[i] =
@@ -3850,8 +5947,10 @@ ExecRemoteUtility(RemoteQuery *node)
}
/* Make the same for Coordinators */
+#ifndef XCP
if (exec_type == EXEC_ON_ALL_NODES ||
exec_type == EXEC_ON_COORDS)
+#endif
{
while (co_conn_count > 0)
{
@@ -3862,12 +5961,26 @@ ExecRemoteUtility(RemoteQuery *node)
while (i < co_conn_count)
{
+#ifdef XCP
+ int res = handle_response(pgxc_connections->coord_handles[i], combiner);
+#else
int res = handle_response(pgxc_connections->coord_handles[i], remotestate);
+#endif
if (res == RESPONSE_EOF)
{
i++;
}
else if (res == RESPONSE_COMPLETE)
+#ifdef XCP
+ {
+ /* Ignore, wait for ReadyForQuery */
+ }
+ else if (res == RESPONSE_ERROR)
+ {
+ /* Ignore, wait for ReadyForQuery */
+ }
+ else if (res == RESPONSE_READY)
+#endif
{
if (i < --co_conn_count)
pgxc_connections->coord_handles[i] =
@@ -3893,7 +6006,11 @@ ExecRemoteUtility(RemoteQuery *node)
* error message pending we can report it. All connections should be in
* consistent state now and so they can be released to the pool after ROLLBACK.
*/
+#ifdef XCP
+ pgxc_node_report_error(combiner);
+#else
pgxc_node_report_error(remotestate);
+#endif
}
@@ -3903,19 +6020,26 @@ ExecRemoteUtility(RemoteQuery *node)
void
PGXCNodeCleanAndRelease(int code, Datum arg)
{
+#ifndef XCP
/* Clean up prepared transactions before releasing connections */
DropAllPreparedStatements();
/* Release Datanode connections */
release_handles();
+#endif
- /* Disconnect from Pooler */
+	/* Disconnect from Pooler; if any connection is still held, the Pooler closes it */
PoolManagerDisconnect();
/* Close connection with GTM */
CloseGTM();
+
+ /* Dump collected statistics to the log */
+ stat_log();
}
+
+#ifndef XCP
static int
pgxc_get_connections(PGXCNodeHandle *connections[], int size, List *connlist)
{
@@ -3939,13 +6063,19 @@ pgxc_get_transaction_nodes(PGXCNodeHandle *connections[], int size, bool write)
{
return pgxc_get_connections(connections, size, write ? XactWriteNodes : XactReadNodes);
}
+#endif
+
void
ExecCloseRemoteStatement(const char *stmt_name, List *nodelist)
{
PGXCNodeAllHandles *all_handles;
PGXCNodeHandle **connections;
+#ifdef XCP
+ ResponseCombiner combiner;
+#else
RemoteQueryState *combiner;
+#endif
int conn_count;
int i;
@@ -3983,7 +6113,15 @@ ExecCloseRemoteStatement(const char *stmt_name, List *nodelist)
}
}
+#ifdef XCP
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /*
+ * Make sure there are zeroes in unused fields
+ */
+ memset(&combiner, 0, sizeof(ScanState));
+#else
combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE);
+#endif
while (conn_count > 0)
{
@@ -3999,24 +6137,39 @@ ExecCloseRemoteStatement(const char *stmt_name, List *nodelist)
i = 0;
while (i < conn_count)
{
+#ifdef XCP
+ int res = handle_response(connections[i], &combiner);
+#else
int res = handle_response(connections[i], combiner);
+#endif
if (res == RESPONSE_EOF)
{
i++;
}
+#ifdef XCP
+ else if (res == RESPONSE_READY ||
+ connections[i]->state == DN_CONNECTION_STATE_ERROR_FATAL)
+#else
else if (res == RESPONSE_COMPLETE)
+#endif
{
if (--conn_count > i)
connections[i] = connections[conn_count];
}
+#ifndef XCP
else
{
connections[i]->state = DN_CONNECTION_STATE_ERROR_FATAL;
}
+#endif
}
}
+#ifdef XCP
+ ValidateAndCloseCombiner(&combiner);
+#else
ValidateAndCloseCombiner(combiner);
+#endif
pfree_pgxc_all_handles(all_handles);
}
@@ -4025,14 +6178,23 @@ ExecCloseRemoteStatement(const char *stmt_name, List *nodelist)
*
* In a COPY TO, send to all Datanodes PG_HEADER for a COPY TO in binary mode.
*/
+#ifdef XCP
+int
+DataNodeCopyInBinaryForAll(char *msg_buf, int len, int conn_count,
+ PGXCNodeHandle** connections)
+#else
int DataNodeCopyInBinaryForAll(char *msg_buf, int len, PGXCNodeHandle** copy_connections)
+#endif
{
int i;
+#ifndef XCP
int conn_count = 0;
PGXCNodeHandle *connections[NumDataNodes];
+#endif
int msgLen = 4 + len + 1;
int nLen = htonl(msgLen);
+#ifndef XCP
for (i = 0; i < NumDataNodes; i++)
{
PGXCNodeHandle *handle = copy_connections[i];
@@ -4042,6 +6204,7 @@ int DataNodeCopyInBinaryForAll(char *msg_buf, int len, PGXCNodeHandle** copy_con
connections[conn_count++] = handle;
}
+#endif
for (i = 0; i < conn_count; i++)
{
@@ -4073,6 +6236,7 @@ int DataNodeCopyInBinaryForAll(char *msg_buf, int len, PGXCNodeHandle** copy_con
return 0;
}
+#ifndef XCP
/*
* ExecSetTempObjectIncluded
*
@@ -4106,138 +6270,38 @@ ExecIsTempObjectIncluded(void)
}
/*
- * ExecProcNodeDMLInXC
- *
- * This function is used by ExecInsert/Update/Delete to execute the
- * Insert/Update/Delete on the datanode using RemoteQuery plan.
- *
- * In XC, a non-FQSed UPDATE/DELETE is planned as a two step process
- * The first step selects the ctid & node id of the row to be modified and the
- * second step creates a parameterized query that is supposed to take the data
- * row returned by the lower plan node as the parameters to modify the affected
- * row. In case of an INSERT however the first step is used to get the new
- * column values to be inserted in the target table and the second step uses
- * those values as parameters of the INSERT query.
- *
- * We use extended query protocol to avoid repeated planning of the query and
- * pass the column values(in case of an INSERT) and ctid & xc_node_id
- * (in case of UPDATE/DELETE) as parameters while executing the query.
- *
- * Parameters:
- * resultRemoteRel: The RemoteQueryState containing DML statement to be
- * executed
- * previousStepSlot: The tuple returned by the first step (described above)
- * to be used as parameters in the second step.
- *
- * Returns the result of RETURNING clause if any
+ * Execute given tuple in the remote relation. We use extended query protocol
+ * to avoid repeated planning of the query. So we must pass the column values
+ * as parameters while executing the query.
+ * This is used by queries using a remote query planning of standard planner.
*/
-TupleTableSlot *
-ExecProcNodeDMLInXC(RemoteQueryState *resultRemoteRel,
- TupleTableSlot *previousStepSlot)
+void
+ExecRemoteQueryStandard(Relation resultRelationDesc,
+ RemoteQueryState *resultRemoteRel,
+ TupleTableSlot *slot)
{
- ExprContext *econtext = resultRemoteRel->ss.ps.ps_ExprContext;
- TupleTableSlot *returningResultSlot = NULL; /* RETURNING clause result */
- TupleTableSlot *temp_slot;
- bool dml_returning_on_replicated = false;
- RemoteQuery *step = (RemoteQuery *) resultRemoteRel->ss.ps.plan;
-
- /*
- * If the tuple returned by the previous step was null,
- * simply return null tuple, no need to execute the DML
- */
- if (TupIsNull(previousStepSlot))
- return NULL;
-
- /*
- * The current implementation of DMLs with RETURNING when run on replicated
- * tables returns row from one of the datanodes. In order to achieve this
- * ExecProcNode is repeatedly called saving one tuple and rejecting the rest.
- * Do we have a DML on replicated table with RETURNING?
- */
- dml_returning_on_replicated = IsReturningDMLOnReplicatedTable(step);
-
- /*
- * Use data row returned by the previous step as parameter for
- * the DML to be executed in this step.
- */
- SetDataRowForIntParams(previousStepSlot, resultRemoteRel);
+ ExprContext *econtext = resultRemoteRel->ss.ps.ps_ExprContext;
/*
- * do_query calls get_exec_connections to determine target nodes
- * at execution time. The function get_exec_connections can decide
- * to evaluate en_expr to determine the target nodes. To evaluate en_expr,
- * ExecEvalVar is called which picks up values from ecxt_scantuple if Var
- * does not refer either OUTER or INNER varno. Hence we should copy the
- * tuple returned by previous step in ecxt_scantuple if econtext is set.
- * The econtext is set only when en_expr is set for execution time
- * determination of the target nodes.
+	 * Use data row returned by the previous step as parameters for
+ * the main query.
*/
- if (econtext)
- econtext->ecxt_scantuple = previousStepSlot;
-
- /*
- * Consider the case of a non FQSed INSERT for example. The executor keeps
- * track of # of tuples processed in es_processed member of EState structure.
- * When a non-FQSed INSERT completes this member is increased once due to
- * estate->es_processed += rowcount
- * in HandleCommandComplete and once due to
- * (estate->es_processed)++
- * in ExecInsert. The result is that although only one row is inserted we
- * get message as if two rows got inserted INSERT 0 2. Now consider the
- * same INSERT case when it is FQSed. In this case the # of tuples processed
- * is increased just once in HandleCommandComplete since ExecInsert is never
- * called in this case and hence we get correct output i.e. INSERT 0 1
- * To handle this error in processed tuple counting we use a variable
- * non_fqs_dml which indicates whether this DML is FQSed or not. To indicate
- * that this DML is not FQSed non_fqs_dml is set to true here and then if
- * it is found true in HandleCommandComplete we skip handling of
- * es_processed there and let ExecInsert do the processed tuple counting.
- */
- resultRemoteRel->non_fqs_dml = true;
-
- /*
- * This loop would be required to reject tuples received from datanodes
- * when a DML with RETURNING is run on a replicated table otherwise it
- * would run once.
- * PGXC_TODO: This approach is error prone if the DML statement constructed
- * by the planner is such that it updates more than one row (even in case of
- * non-replicated data). Fix it.
- */
- do
+ if (!TupIsNull(slot))
{
- temp_slot = ExecProcNode((PlanState *)resultRemoteRel);
- if (!TupIsNull(temp_slot))
- {
- /* Have we already copied the returned tuple? */
- if (returningResultSlot == NULL)
- {
- /* Copy the received tuple to be returned later */
- returningResultSlot = MakeSingleTupleTableSlot(temp_slot->tts_tupleDescriptor);
- returningResultSlot = ExecCopySlot(returningResultSlot, temp_slot);
- }
- /* Clear the received tuple, the copy required has already been saved */
- ExecClearTuple(temp_slot);
- }
- else
- {
- /* Null tuple received, so break the loop */
- ExecClearTuple(temp_slot);
- break;
- }
- } while (dml_returning_on_replicated);
+ resultRemoteRel->paramval_len = ExecCopySlotDatarow(slot,
+ &resultRemoteRel->paramval_data);
- /*
- * A DML can impact more than one row, e.g. an update without any where
- * clause on a table with more than one row. We need to make sure that
- * RemoteQueryNext calls do_query for each affected row, hence we reset
- * the flag here and finish the DML being executed only when we return
- * NULL from ExecModifyTable
- */
- resultRemoteRel->query_Done = false;
-
- return returningResultSlot;
+ /*
+ * The econtext is set only when en_expr is set for execution time
+	 * evaluation of the target node.
+ */
+ if (econtext)
+ econtext->ecxt_scantuple = slot;
+ do_query(resultRemoteRel);
+ }
}
+
void
RegisterTransactionNodes(int count, void **connections, bool write)
{
@@ -4277,6 +6341,7 @@ ForgetTransactionNodes(void)
list_free(XactWriteNodes);
XactWriteNodes = NIL;
}
+#endif
/*
* Clear per transaction remote information
@@ -4284,11 +6349,57 @@ ForgetTransactionNodes(void)
void
AtEOXact_Remote(void)
{
+#ifdef XCP
+ PGXCNodeResetParams(true);
+#else
ExecClearTempObjectIncluded();
ForgetTransactionNodes();
clear_RemoteXactState();
+#endif
}
+#ifdef XCP
+/*
+ * Invoked when local transaction is about to be committed.
+ * If nodestring is specified commit specified prepared transaction on remote
+ * nodes, otherwise commit remote nodes which are in transaction.
+ */
+void
+PreCommit_Remote(char *prepareGID, char *nodestring, bool preparedLocalNode)
+{
+ /*
+	 * Make node connections persistent if we are committing a transaction
+ * that touched temporary tables. We never drop that flag, so after some
+ * transaction has created a temp table the session's remote connections
+ * become persistent.
+ * We do not need to set that flag if transaction that has created a temp
+ * table finally aborts - remote connections are not holding temporary
+ * objects in this case.
+ */
+ if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && MyXactAccessedTempRel)
+ temp_object_included = true;
+
+
+ /*
+ * OK, everything went fine. At least one remote node is in PREPARED state
+ * and the transaction is successfully prepared on all the involved nodes.
+ * Now we are ready to commit the transaction. We need a new GXID to send
+ * down the remote nodes to execute the forthcoming COMMIT PREPARED
+ * command. So grab one from the GTM and track it. It will be closed along
+ * with the main transaction at the end.
+ */
+ if (nodestring)
+ {
+ Assert(preparedLocalNode);
+ pgxc_node_remote_finish(prepareGID, true, nodestring,
+ GetAuxilliaryTransactionId(),
+ GetTopGlobalTransactionId());
+
+ }
+ else
+ pgxc_node_remote_commit();
+}
+#else
/*
* Do pre-commit processing for remote nodes which includes Datanodes and
* Coordinators. If more than one nodes are involved in the transaction write
@@ -4354,6 +6465,7 @@ PreCommit_Remote(char *prepareGID, bool preparedLocalNode)
if (!PersistentConnections)
release_handles();
}
+#endif
/*
* Do abort processing for the transaction. We must abort the transaction on
@@ -4371,6 +6483,104 @@ PreCommit_Remote(char *prepareGID, bool preparedLocalNode)
bool
PreAbort_Remote(void)
{
+#ifdef XCP
+ /*
+ * We are about to abort current transaction, and there could be an
+ * unexpected error leaving the node connection in some state requiring
+ * clean up, like COPY or pending query results.
+ * If we are running copy we should send down CopyFail message and read
+ * all possible incoming messages, there could be copy rows (if running
+ * COPY TO) ErrorResponse, ReadyForQuery.
+ * If there are pending results (connection state is DN_CONNECTION_STATE_QUERY)
+ * we just need to read them in and discard, all necessary commands are
+ * already sent. The end of input could be CommandComplete or
+ * PortalSuspended, in either case subsequent ROLLBACK closes the portal.
+ */
+ PGXCNodeAllHandles *all_handles;
+ PGXCNodeHandle *clean_nodes[NumCoords + NumDataNodes];
+ int node_count = 0;
+ int i;
+
+ all_handles = get_current_handles();
+ /*
+ * Find "dirty" coordinator connections.
+ * COPY is never running on a coordinator connections, we just check for
+ * pending data.
+ */
+ for (i = 0; i < all_handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *handle = all_handles->coord_handles[i];
+
+ if (handle->state == DN_CONNECTION_STATE_QUERY)
+ {
+ /*
+ * Forget previous combiner if any since input will be handled by
+ * different one.
+ */
+ handle->combiner = NULL;
+ clean_nodes[node_count++] = handle;
+ }
+ }
+
+ /*
+ * The same for data nodes, but cancel COPY if it is running.
+ */
+ for (i = 0; i < all_handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *handle = all_handles->datanode_handles[i];
+
+ if (handle->state == DN_CONNECTION_STATE_QUERY)
+ {
+ /*
+ * Forget previous combiner if any since input will be handled by
+ * different one.
+ */
+ handle->combiner = NULL;
+ clean_nodes[node_count++] = handle;
+ }
+ else if (handle->state == DN_CONNECTION_STATE_COPY_IN ||
+ handle->state == DN_CONNECTION_STATE_COPY_OUT)
+ {
+ DataNodeCopyEnd(handle, true);
+ clean_nodes[node_count++] = handle;
+ }
+ }
+
+ pfree_pgxc_all_handles(all_handles);
+
+ /*
+ * Now read and discard any data from the connections found "dirty"
+ */
+ if (node_count > 0)
+ {
+ ResponseCombiner combiner;
+
+ InitResponseCombiner(&combiner, node_count, COMBINE_TYPE_NONE);
+ /*
+ * Make sure there are zeroes in unused fields
+ */
+ memset(&combiner, 0, sizeof(ScanState));
+ combiner.connections = clean_nodes;
+ combiner.conn_count = node_count;
+ combiner.request_type = REQUEST_TYPE_ERROR;
+
+ pgxc_connections_cleanup(&combiner);
+
+ /* prevent pfree'ing local variable */
+ combiner.connections = NULL;
+
+ CloseCombiner(&combiner);
+ }
+
+ pgxc_node_remote_abort();
+
+ if (!temp_object_included && !PersistentConnections)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+#else
if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
{
cancel_query();
@@ -4423,13 +6633,62 @@ PreAbort_Remote(void)
if (!PersistentConnections)
release_handles();
+#endif
return true;
}
+
+/*
+ * Invoked when local transaction is about to be prepared.
+ * If invoked on a Datanode just commit transaction on remote connections,
+ * since secondary sessions are read only and never need to be prepared.
+ * Otherwise run PREPARE on remote connections, where writable commands were
+ * sent (connections marked as not read-only).
+ * If that is explicit PREPARE (issued by client) notify GTM.
+ * In case of implicit PREPARE not involving local node (ex. caused by
+ * INSERT, UPDATE or DELETE) commit prepared transaction immediately.
+ * Return list of node names where transaction was actually prepared, include
+ * the name of the local node if localNode is true.
+ */
char *
PrePrepare_Remote(char *prepareGID, bool localNode, bool implicit)
{
+#ifdef XCP
+ /* Always include local node if running explicit prepare */
+ char *nodestring;
+
+ /*
+ * Primary session is doing 2PC, just commit secondary processes and exit
+ */
+ if (IS_PGXC_DATANODE)
+ {
+ pgxc_node_remote_commit();
+ return NULL;
+ }
+
+ nodestring = pgxc_node_remote_prepare(prepareGID,
+ !implicit || localNode);
+
+ if (!implicit && IS_PGXC_COORDINATOR && !IsConnFromCoord())
+ /* Save the node list and gid on GTM. */
+ StartPreparedTranGTM(GetTopGlobalTransactionId(), prepareGID,
+ nodestring);
+
+ /*
+ * If no need to commit on local node go ahead and commit prepared
+ * transaction right away.
+ */
+ if (implicit && !localNode && nodestring)
+ {
+ pgxc_node_remote_finish(prepareGID, true, nodestring,
+ GetAuxilliaryTransactionId(),
+ GetTopGlobalTransactionId());
+ pfree(nodestring);
+ return NULL;
+ }
+ return nodestring;
+#else
init_RemoteXactState(false);
/*
* PREPARE the transaction on all nodes including remote nodes as well as
@@ -4446,8 +6705,21 @@ PrePrepare_Remote(char *prepareGID, bool localNode, bool implicit)
preparedNodes = pgxc_node_get_nodelist(true);
return preparedNodes;
+#endif
}
+#ifdef XCP
+/*
+ * Invoked immediately after local node is prepared.
+ * Notify GTM about completed prepare.
+ */
+void
+PostPrepare_Remote(char *prepareGID, bool implicit)
+{
+ if (!implicit)
+ PrepareTranGTM(GetTopGlobalTransactionId());
+}
+#else
void
PostPrepare_Remote(char *prepareGID, char *nodestring, bool implicit)
{
@@ -4472,7 +6744,9 @@ PostPrepare_Remote(char *prepareGID, char *nodestring, bool implicit)
/* Now forget the transaction nodes */
ForgetTransactionNodes();
}
+#endif
+#ifndef XCP
/*
* Return the list of nodes where the prepared transaction is not yet committed
*/
@@ -4522,10 +6796,65 @@ pgxc_node_get_nodelist(bool localNode)
return nodestring;
}
+#endif
+
+#ifdef XCP
+/*
+ * Returns true if 2PC is required for consistent commit: if there was write
+ * activity on two or more nodes within current transaction.
+ */
bool
IsTwoPhaseCommitRequired(bool localWrite)
{
+ PGXCNodeAllHandles *handles;
+ bool found = localWrite;
+ int i;
+
+ /* Never run 2PC on Datanode-to-Datanode connection */
+ if (IS_PGXC_DATANODE)
+ return false;
+
+ if (MyXactAccessedTempRel)
+ {
+ elog(DEBUG1, "Transaction accessed temporary objects - "
+ "2PC will not be used and that can lead to data inconsistencies "
+ "in case of failures");
+ return false;
+ }
+
+ handles = get_current_handles();
+ for (i = 0; i < handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->datanode_handles[i];
+ if (conn->sock != NO_SOCKET && !conn->read_only &&
+ conn->transaction_status == 'T')
+ {
+ if (found)
+ return true; /* second found */
+ else
+ found = true; /* first found */
+ }
+ }
+ for (i = 0; i < handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = handles->coord_handles[i];
+ if (conn->sock != NO_SOCKET && !conn->read_only &&
+ conn->transaction_status == 'T')
+ {
+ if (found)
+ return true; /* second found */
+ else
+ found = true; /* first found */
+ }
+ }
+ return false;
+}
+#else
+bool
+IsTwoPhaseCommitRequired(bool localWrite)
+{
+
if ((list_length(XactWriteNodes) > 1) ||
((list_length(XactWriteNodes) == 1) && localWrite))
{
@@ -4555,21 +6884,12 @@ clear_RemoteXactState(void)
if ((remoteXactState.remoteNodeHandles == NULL) ||
(remoteXactState.maxRemoteNodes < (NumDataNodes + NumCoords)))
{
- if (!remoteXactState.remoteNodeHandles)
- remoteXactState.remoteNodeHandles = (PGXCNodeHandle **)
- malloc(sizeof(PGXCNodeHandle *) * (MaxDataNodes + MaxCoords));
- else
- remoteXactState.remoteNodeHandles = (PGXCNodeHandle **)
- realloc(remoteXactState.remoteNodeHandles,
- sizeof(PGXCNodeHandle *) * (NumDataNodes + NumCoords));
- if (!remoteXactState.remoteNodeStatus)
- remoteXactState.remoteNodeStatus = (RemoteXactNodeStatus *)
- malloc(sizeof(RemoteXactNodeStatus) * (MaxDataNodes + MaxCoords));
- else
- remoteXactState.remoteNodeStatus = (RemoteXactNodeStatus *)
- realloc (remoteXactState.remoteNodeStatus,
- sizeof(RemoteXactNodeStatus) * (NumDataNodes + NumCoords));
-
+ remoteXactState.remoteNodeHandles = (PGXCNodeHandle **)
+ realloc (remoteXactState.remoteNodeHandles,
+ sizeof (PGXCNodeHandle *) * (NumDataNodes + NumCoords));
+ remoteXactState.remoteNodeStatus = (RemoteXactNodeStatus *)
+ realloc (remoteXactState.remoteNodeStatus,
+ sizeof (RemoteXactNodeStatus) * (NumDataNodes + NumCoords));
remoteXactState.maxRemoteNodes = NumDataNodes + NumCoords;
}
@@ -4606,16 +6926,30 @@ init_RemoteXactState(bool preparedLocalNode)
remoteXactState.numReadRemoteNodes = read_conn_count;
}
+#endif
+
+/*
+ * Execute COMMIT/ABORT PREPARED issued by the remote client on remote nodes.
+ * Contacts GTM for the list of involved nodes and for work complete
+ * notification. Returns true if prepared transaction on local node needs to be
+ * finished too.
+ */
bool
FinishRemotePreparedTransaction(char *prepareGID, bool commit)
{
+#ifdef XCP
+ char *nodestring;
+ GlobalTransactionId gxid, prepare_gxid;
+ bool prepared_local = false;
+#else
char *nodename, *nodestring;
List *nodelist = NIL, *coordlist = NIL;
GlobalTransactionId gxid, prepare_gxid;
PGXCNodeAllHandles *pgxc_handles;
bool prepared_local = false;
int i;
+#endif
/*
* Please note that with xc_maintenance_mode = on, COMMIT/ROLLBACK PREPARED will not
@@ -4655,7 +6989,44 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit)
(errcode(ERRCODE_INTERNAL_ERROR),
errmsg("prepared transaction with identifier \"%s\" does not exist",
prepareGID)));
+#ifdef XCP
+ prepared_local = pgxc_node_remote_finish(prepareGID, commit, nodestring,
+ gxid, prepare_gxid);
+ if (commit)
+ {
+ CommitPreparedTranGTM(prepare_gxid, gxid);
+ }
+ else
+ {
+ RollbackTranGTM(prepare_gxid);
+ RollbackTranGTM(gxid);
+ }
+
+ return prepared_local;
+}
+
+
+/*
+ * Complete previously prepared transactions on remote nodes.
+ * Release remote connection after completion.
+ */
+static bool
+pgxc_node_remote_finish(char *prepareGID, bool commit,
+ char *nodestring, GlobalTransactionId gxid,
+ GlobalTransactionId prepare_gxid)
+{
+ char finish_cmd[256];
+ PGXCNodeHandle *connections[MaxCoords + MaxDataNodes];
+ int conn_count = 0;
+ ResponseCombiner combiner;
+ PGXCNodeAllHandles *pgxc_handles;
+ bool prepared_local = false;
+ char *nodename;
+ List *nodelist = NIL;
+ List *coordlist = NIL;
+ int i;
+#endif
/*
* Now based on the nodestring, run COMMIT/ROLLBACK PREPARED command on the
* remote nodes and also finish the transaction locally is required
@@ -4663,6 +7034,19 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit)
nodename = strtok(nodestring, ",");
while (nodename != NULL)
{
+#ifdef XCP
+ int nodeIndex;
+ char nodetype;
+
+ /* Get node type and index */
+ nodetype = PGXC_NODE_NONE;
+ nodeIndex = PGXCNodeGetNodeIdFromName(nodename, &nodetype);
+ if (nodetype == PGXC_NODE_NONE)
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("PGXC Node %s: object not defined",
+ nodename)));
+#else
Oid nodeoid;
int nodeIndex;
char nodetype;
@@ -4678,6 +7062,7 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit)
/* Get node type and index */
nodetype = get_pgxc_nodetype(nodeoid);
nodeIndex = PGXCNodeGetNodeId(nodeoid, get_pgxc_nodetype(nodeoid));
+#endif
/* Check if node is requested is the self-node or not */
if (nodetype == PGXC_NODE_COORDINATOR)
@@ -4693,6 +7078,104 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit)
nodename = strtok(NULL, ",");
}
+#ifdef XCP
+ if (nodelist == NIL && coordlist == NIL)
+ return prepared_local;
+
+ pgxc_handles = get_handles(nodelist, coordlist, false);
+
+ if (commit)
+ sprintf(finish_cmd, "COMMIT PREPARED '%s'", prepareGID);
+ else
+ sprintf(finish_cmd, "ROLLBACK PREPARED '%s'", prepareGID);
+
+ for (i = 0; i < pgxc_handles->dn_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = pgxc_handles->datanode_handles[i];
+
+ if (pgxc_node_send_gxid(conn, gxid))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send GXID for %s PREPARED command",
+ commit ? "COMMIT" : "ROLLBACK")));
+ }
+
+ if (pgxc_node_send_query(conn, finish_cmd))
+ {
+ /*
+ * Do not bother with clean up, just bomb out. The error handler
+ * will invoke RollbackTransaction which will do the work.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send %s PREPARED command to the node %u",
+ commit ? "COMMIT" : "ROLLBACK", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+
+ for (i = 0; i < pgxc_handles->co_conn_count; i++)
+ {
+ PGXCNodeHandle *conn = pgxc_handles->coord_handles[i];
+
+ if (pgxc_node_send_gxid(conn, gxid))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send GXID for %s PREPARED command",
+ commit ? "COMMIT" : "ROLLBACK")));
+ }
+
+ if (pgxc_node_send_query(conn, finish_cmd))
+ {
+ /*
+ * Do not bother with clean up, just bomb out. The error handler
+ * will invoke RollbackTransaction which will do the work.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to send %s PREPARED command to the node %u",
+ commit ? "COMMIT" : "ROLLBACK", conn->nodeoid)));
+ }
+ else
+ {
+ /* Read responses from these */
+ connections[conn_count++] = conn;
+ }
+ }
+
+ if (conn_count)
+ {
+ InitResponseCombiner(&combiner, conn_count, COMBINE_TYPE_NONE);
+ /* Receive responses */
+ if (pgxc_node_receive_responses(conn_count, connections, NULL, &combiner) ||
+ !validate_combiner(&combiner))
+ {
+ if (combiner.errorMessage)
+ pgxc_node_report_error(&combiner);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to COMMIT the transaction on one or more nodes")));
+ }
+ else
+ CloseCombiner(&combiner);
+ }
+
+ pfree_pgxc_all_handles(pgxc_handles);
+
+ if (!temp_object_included && !PersistentConnections)
+ {
+ /* Clean up remote sessions */
+ pgxc_node_remote_cleanup_all();
+ release_handles();
+ }
+#else
/*
* Now get handles for all the involved Datanodes and the Coordinators
*/
@@ -4756,16 +7239,1800 @@ FinishRemotePreparedTransaction(char *prepareGID, bool commit)
*/
clear_RemoteXactState();
ForgetTransactionNodes();
+#endif
return prepared_local;
}
+
+#ifdef XCP
+/*****************************************************************************
+ *
+ * Simplified versions of ExecInitRemoteQuery, ExecRemoteQuery and
+ * ExecEndRemoteQuery: in XCP they are only used to execute simple queries.
+ *
+ *****************************************************************************/
+/*
+ * ExecInitRemoteQuery
+ *		Initialize executor state for a RemoteQuery plan node.
+ *
+ * node   - the RemoteQuery plan node to initialize
+ * estate - executor state for the current query
+ * eflags - executor flags (unused here; kept for the standard ExecInit* signature)
+ *
+ * Returns a freshly allocated RemoteQueryState.  Note that RemoteQueryState
+ * embeds ResponseCombiner as its first member, so the same pointer is used
+ * for both throughout.
+ */
+RemoteQueryState *
+ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags)
+{
+ RemoteQueryState *remotestate;
+ ResponseCombiner *combiner;
+
+ remotestate = makeNode(RemoteQueryState);
+ /* The combiner view of the same node; combiner state starts empty */
+ combiner = (ResponseCombiner *) remotestate;
+ InitResponseCombiner(combiner, 0, COMBINE_TYPE_NONE);
+ combiner->ss.ps.plan = (Plan *) node;
+ combiner->ss.ps.state = estate;
+
+ combiner->ss.ps.qual = NIL;
+
+ combiner->request_type = REQUEST_TYPE_QUERY;
+
+ ExecInitResultTupleSlot(estate, &combiner->ss.ps);
+ /* Only assign a result type if the plan actually produces tuples */
+ if (node->scan.plan.targetlist)
+ ExecAssignResultTypeFromTL((PlanState *) remotestate);
+
+ /*
+ * If there are parameters supplied, get them into a form to be sent to the
+ * datanodes with bind message. We should not have had done this before.
+ */
+ if (estate->es_param_list_info)
+ {
+ Assert(!remotestate->paramval_data);
+ remotestate->paramval_len = ParamListToDataRow(estate->es_param_list_info,
+ &remotestate->paramval_data);
+ }
+
+ /* We need expression context to evaluate */
+ if (node->exec_nodes && node->exec_nodes->en_expr)
+ {
+ Expr *expr = node->exec_nodes->en_expr;
+
+ if (IsA(expr, Var) && ((Var *) expr)->vartype == TIDOID)
+ {
+ /* Special case if expression does not need to be evaluated */
+ }
+ else
+ {
+ /* prepare expression evaluation */
+ ExecAssignExprContext(estate, &combiner->ss.ps);
+ }
+ }
+
+ return remotestate;
+}
+
+
+/*
+ * Execute step of PGXC plan.
+ * The step specifies a command to be executed on specified nodes.
+ * On first invocation connections to the data nodes are initialized and
+ * command is executed. Further, as well as within subsequent invocations,
+ * responses are received until step is completed or there is a tuple to emit.
+ * If there is a tuple it is returned, otherwise returned NULL. The NULL result
+ * from the function indicates completed step.
+ * The function returns at most one tuple per invocation.
+ */
+TupleTableSlot *
+ExecRemoteQuery(RemoteQueryState *node)
+{
+ ResponseCombiner *combiner = (ResponseCombiner *) node;
+ RemoteQuery *step = (RemoteQuery *) combiner->ss.ps.plan;
+ TupleTableSlot *resultslot = combiner->ss.ps.ps_ResultTupleSlot;
+ /* First invocation: acquire connections and send the command */
+ if (!node->query_Done)
+ {
+ GlobalTransactionId gxid = InvalidGlobalTransactionId;
+ Snapshot snapshot = GetActiveSnapshot();
+ PGXCNodeHandle **connections = NULL;
+ PGXCNodeHandle *primaryconnection = NULL;
+ int i;
+ int regular_conn_count = 0;
+ int total_conn_count = 0;
+ bool need_tran_block;
+ PGXCNodeAllHandles *pgxc_connections;
+
+ /*
+ * Get connections for Datanodes only, utilities and DDLs
+ * are launched in ExecRemoteUtility
+ */
+ pgxc_connections = get_exec_connections(node, step->exec_nodes,
+ step->exec_type);
+
+ if (step->exec_type == EXEC_ON_DATANODES)
+ {
+ connections = pgxc_connections->datanode_handles;
+ total_conn_count = regular_conn_count = pgxc_connections->dn_conn_count;
+ }
+ else if (step->exec_type == EXEC_ON_COORDS)
+ {
+ connections = pgxc_connections->coord_handles;
+ total_conn_count = regular_conn_count = pgxc_connections->co_conn_count;
+ }
+
+ primaryconnection = pgxc_connections->primary_handle;
+
+ /*
+ * Primary connection is counted separately but is included in total_conn_count if used.
+ */
+ if (primaryconnection)
+ regular_conn_count--;
+
+ /* Only the container is freed; the handle arrays remain in use below */
+ pfree(pgxc_connections);
+
+ /*
+ * We save only regular connections, at the time we exit the function
+ * we finish with the primary connection and deal only with regular
+ * connections on subsequent invocations
+ */
+ combiner->node_count = regular_conn_count;
+
+ /*
+ * Start transaction on data nodes if we are in explicit transaction
+ * or going to use extended query protocol or write to multiple nodes
+ */
+ if (step->force_autocommit)
+ need_tran_block = false;
+ else
+ need_tran_block = step->cursor ||
+ (!step->read_only && total_conn_count > 1) ||
+ (TransactionBlockStatusCode() == 'T');
+
+ stat_statement();
+ stat_transaction(total_conn_count);
+
+ gxid = GetCurrentTransactionId();
+
+ if (!GlobalTransactionIdIsValid(gxid))
+ {
+ if (primaryconnection)
+ pfree(primaryconnection);
+ pfree(connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to get next transaction ID")));
+ }
+
+ /* See if we have a primary node, execute on it first before the others */
+ if (primaryconnection)
+ {
+ if (pgxc_node_begin(1, &primaryconnection, gxid, need_tran_block,
+ step->read_only, PGXC_NODE_DATANODE))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Could not begin transaction on data node.")));
+
+ /* If explicit transaction is needed gxid is already sent */
+ if (!pgxc_start_command_on_connection(primaryconnection, node, snapshot))
+ {
+ pfree(connections);
+ pfree(primaryconnection);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command to data nodes")));
+ }
+ Assert(combiner->combine_type == COMBINE_TYPE_SAME);
+
+ pgxc_node_receive(1, &primaryconnection, NULL);
+ /* Make sure the command is completed on the primary node */
+ while (true)
+ {
+ int res = handle_response(primaryconnection, combiner);
+ if (res == RESPONSE_READY)
+ break;
+ else if (res == RESPONSE_EOF)
+ pgxc_node_receive(1, &primaryconnection, NULL);
+ else if (res == RESPONSE_COMPLETE || res == RESPONSE_ERROR)
+ /* Get ReadyForQuery */
+ continue;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Unexpected response from data node")));
+ }
+ /* Re-raise any error reported by the primary node locally */
+ if (combiner->errorMessage)
+ {
+ char *code = combiner->errorCode;
+ if (combiner->errorDetail != NULL)
+ ereport(ERROR,
+ (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+ errmsg("%s", combiner->errorMessage), errdetail("%s", combiner->errorDetail) ));
+ else
+ ereport(ERROR,
+ (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+ errmsg("%s", combiner->errorMessage)));
+ }
+ }
+
+ /* Now start the command on the remaining (regular) connections */
+ for (i = 0; i < regular_conn_count; i++)
+ {
+ if (pgxc_node_begin(1, &connections[i], gxid, need_tran_block,
+ step->read_only, PGXC_NODE_DATANODE))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Could not begin transaction on data node.")));
+
+ /* If explicit transaction is needed gxid is already sent */
+ if (!pgxc_start_command_on_connection(connections[i], node, snapshot))
+ {
+ pfree(connections);
+ if (primaryconnection)
+ pfree(primaryconnection);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command to data nodes")));
+ }
+ connections[i]->combiner = combiner;
+ }
+
+ if (step->cursor)
+ {
+ /* Remember connections so the cursor can be fetched/closed later */
+ combiner->cursor = step->cursor;
+ combiner->cursor_count = regular_conn_count;
+ combiner->cursor_connections = (PGXCNodeHandle **) palloc(regular_conn_count * sizeof(PGXCNodeHandle *));
+ memcpy(combiner->cursor_connections, connections, regular_conn_count * sizeof(PGXCNodeHandle *));
+ }
+
+ combiner->connections = connections;
+ combiner->conn_count = regular_conn_count;
+ combiner->current_conn = 0;
+
+ if (combiner->cursor_count)
+ {
+ combiner->conn_count = combiner->cursor_count;
+ memcpy(connections, combiner->cursor_connections,
+ combiner->cursor_count * sizeof(PGXCNodeHandle *));
+ combiner->connections = connections;
+ }
+
+ node->query_Done = true;
+
+ if (step->sort)
+ {
+ SimpleSort *sort = step->sort;
+
+ /*
+ * First message is already in the buffer
+ * Further fetch will be under tuplesort control
+ * If query does not produce rows tuplesort will not
+ * be initialized
+ */
+ combiner->tuplesortstate = tuplesort_begin_merge(
+ resultslot->tts_tupleDescriptor,
+ sort->numCols,
+ sort->sortColIdx,
+ sort->sortOperators,
+ sort->sortCollations,
+ sort->nullsFirst,
+ combiner,
+ work_mem);
+ }
+ }
+
+ /* Emit at most one tuple per call, merge-sorted if a sort was requested */
+ if (combiner->tuplesortstate)
+ {
+ if (tuplesort_gettupleslot((Tuplesortstate *) combiner->tuplesortstate,
+ true, resultslot))
+ return resultslot;
+ else
+ ExecClearTuple(resultslot);
+ }
+ else
+ {
+ TupleTableSlot *slot = FetchTuple(combiner);
+ if (!TupIsNull(slot))
+ return slot;
+ }
+
+ /* No more tuples; surface any error collected while fetching */
+ if (combiner->errorMessage)
+ {
+ char *code = combiner->errorCode;
+ if (combiner->errorDetail != NULL)
+ ereport(ERROR,
+ (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+ errmsg("%s", combiner->errorMessage), errdetail("%s", combiner->errorDetail) ));
+ else
+ ereport(ERROR,
+ (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+ errmsg("%s", combiner->errorMessage)));
+ }
+
+ /* NULL result signals the step is complete */
+ return NULL;
+}
+
+
+/*
+ * Clean up and discard any data on the data node connections that might not
+ * handled yet, including pending on the remote connection.
+ */
+static void
+pgxc_connections_cleanup(ResponseCombiner *combiner)
+{
+ /* clean up the buffer */
+ list_free_deep(combiner->rowBuffer);
+ combiner->rowBuffer = NIL;
+
+ /*
+ * Read in and discard remaining data from the connections, if any
+ */
+ combiner->current_conn = 0;
+ while (combiner->conn_count > 0)
+ {
+ int res;
+ PGXCNodeHandle *conn = combiner->connections[combiner->current_conn];
+
+ /*
+ * Possible if we are doing merge sort.
+ * We can do usual procedure and move connections around since we are
+ * cleaning up and do not care what connection at what position
+ */
+ if (conn == NULL)
+ {
+ REMOVE_CURR_CONN(combiner);
+ continue;
+ }
+
+ /* throw away current message that may be in the buffer */
+ if (combiner->currentRow)
+ {
+ pfree(combiner->currentRow);
+ combiner->currentRow = NULL;
+ }
+
+ /* no data is expected */
+ if (conn->state == DN_CONNECTION_STATE_IDLE ||
+ conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
+ {
+ REMOVE_CURR_CONN(combiner);
+ continue;
+ }
+
+ /*
+ * Connection owner is different, so no our data pending at
+ * the connection, nothing to read in.
+ */
+ if (conn->combiner && conn->combiner != combiner)
+ {
+ REMOVE_CURR_CONN(combiner);
+ continue;
+ }
+
+ /* Consume and discard whatever the node still has to send us */
+ res = handle_response(conn, combiner);
+ if (res == RESPONSE_EOF)
+ {
+ /* Incomplete message in the buffer: wait (bounded) for the rest */
+ struct timeval timeout;
+#ifdef XCP
+ /* NOTE(review): here END_QUERY_TIMEOUT appears to be in milliseconds */
+ timeout.tv_sec = END_QUERY_TIMEOUT / 1000;
+ timeout.tv_usec = (END_QUERY_TIMEOUT % 1000) * 1000;
+#else
+ timeout.tv_sec = END_QUERY_TIMEOUT;
+ timeout.tv_usec = 0;
+#endif
+
+ if (pgxc_node_receive(1, &conn, &timeout))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to read response from data nodes when ending query")));
+ }
+ }
+
+ /*
+ * Release tuplesort resources
+ */
+ if (combiner->tuplesortstate)
+ {
+ /*
+ * Free these before tuplesort_end, because these arrays may appear
+ * in the tuplesort's memory context, tuplesort_end deletes this
+ * context and may invalidate the memory.
+ * We still want to free them here, because these may be in different
+ * context.
+ */
+ if (combiner->tapenodes)
+ {
+ pfree(combiner->tapenodes);
+ combiner->tapenodes = NULL;
+ }
+ if (combiner->tapemarks)
+ {
+ pfree(combiner->tapemarks);
+ combiner->tapemarks = NULL;
+ }
+ /*
+ * tuplesort_end invalidates minimal tuple if it is in the slot because
+ * deletes the TupleSort memory context, causing seg fault later when
+ * releasing tuple table
+ */
+ ExecClearTuple(combiner->ss.ps.ps_ResultTupleSlot);
+ tuplesort_end((Tuplesortstate *) combiner->tuplesortstate);
+ combiner->tuplesortstate = NULL;
+ }
+}
+
+
+/*
+ * End the remote query
+ */
+void
+ExecEndRemoteQuery(RemoteQueryState *node)
+{
+ ResponseCombiner *combiner = (ResponseCombiner *) node;
+
+ /*
+ * Clean up remote connections
+ */
+ pgxc_connections_cleanup(combiner);
+
+ /*
+ * Clean up parameters if they were set, since plan may be reused
+ */
+ if (node->paramval_data)
+ {
+ pfree(node->paramval_data);
+ node->paramval_data = NULL;
+ node->paramval_len = 0;
+ }
+
+ /* Release combiner resources, then the node itself */
+ CloseCombiner(combiner);
+ pfree(node);
+}
+
+
+/**********************************************
+ *
+ * Routines to support RemoteSubplan plan node
+ *
+ **********************************************/
+
+
+/*
+ * The routine walks recursively over the plan tree and changes cursor names of
+ * RemoteSubplan nodes to make them different from launched from the other
+ * datanodes. The routine changes cursor names in place, so caller should
+ * take writable copy of the plan tree.
+ */
+void
+RemoteSubplanMakeUnique(Node *plan, int unique)
+{
+ if (plan == NULL)
+ return;
+
+ /* A List argument: recurse into each member */
+ if (IsA(plan, List))
+ {
+ ListCell *lc;
+ foreach(lc, (List *) plan)
+ {
+ RemoteSubplanMakeUnique(lfirst(lc), unique);
+ }
+ return;
+ }
+
+ /*
+ * Transform SharedQueue name
+ */
+ if (IsA(plan, RemoteSubplan))
+ {
+ ((RemoteSubplan *)plan)->unique = unique;
+ }
+ /* Otherwise it is a Plan descendant */
+ /*
+ * NOTE(review): initPlan is a List whose members may not be Plan nodes
+ * (in stock PostgreSQL it holds SubPlan expression nodes) — confirm the
+ * recursion through it is intentional here.
+ */
+ RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->initPlan, unique);
+ RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->lefttree, unique);
+ RemoteSubplanMakeUnique((Node *) ((Plan *) plan)->righttree, unique);
+ /* Tranform special cases */
+ switch (nodeTag(plan))
+ {
+ case T_Append:
+ RemoteSubplanMakeUnique((Node *) ((Append *) plan)->appendplans,
+ unique);
+ break;
+ case T_MergeAppend:
+ RemoteSubplanMakeUnique((Node *) ((MergeAppend *) plan)->mergeplans,
+ unique);
+ break;
+ case T_BitmapAnd:
+ RemoteSubplanMakeUnique((Node *) ((BitmapAnd *) plan)->bitmapplans,
+ unique);
+ break;
+ case T_BitmapOr:
+ RemoteSubplanMakeUnique((Node *) ((BitmapOr *) plan)->bitmapplans,
+ unique);
+ break;
+ case T_SubqueryScan:
+ RemoteSubplanMakeUnique((Node *) ((SubqueryScan *) plan)->subplan,
+ unique);
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * Walker context used by determine_param_types()/determine_param_types_walker()
+ * to resolve data types of PARAM_EXEC parameters from the plan tree.
+ */
+struct find_params_context
+{
+ RemoteParam *rparams; /* array of remote params being filled in */
+ Bitmapset *defineParams; /* param ids whose type is still unknown */
+};
+
+/*
+ * Expression tree walker: when a PARAM_EXEC Param whose type we still need
+ * is found, copy its type into the matching RemoteParam entry and remove it
+ * from the pending set.  Returns true (stopping the walk) once all pending
+ * params have been resolved.
+ */
+static bool
+determine_param_types_walker(Node *node, struct find_params_context *context)
+{
+ if (node == NULL)
+ return false;
+
+ if (IsA(node, Param))
+ {
+ Param *param = (Param *) node;
+ int paramno = param->paramid;
+
+ if (param->paramkind == PARAM_EXEC &&
+ bms_is_member(paramno, context->defineParams))
+ {
+ /* Linear scan of rparams for the entry with this paramid */
+ RemoteParam *cur = context->rparams;
+ while (cur->paramkind != PARAM_EXEC || cur->paramid != paramno)
+ cur++;
+ cur->paramtype = param->paramtype;
+ context->defineParams = bms_del_member(context->defineParams,
+ paramno);
+ /* stop early if nothing is left to resolve */
+ return bms_is_empty(context->defineParams);
+ }
+ }
+ return expression_tree_walker(node, determine_param_types_walker,
+ (void *) context);
+
+}
+
+/*
+ * Scan expressions in the plan tree to find Param nodes and get data types
+ * from them
+ */
+/*
+ * Recursively scan expressions in the plan tree to find Param nodes and get
+ * data types from them.  Returns true as soon as all params listed in
+ * context->defineParams have been resolved; false means some remain unknown.
+ */
+static bool
+determine_param_types(Plan *plan, struct find_params_context *context)
+{
+ Bitmapset *intersect;
+
+ if (plan == NULL)
+ return false;
+
+ intersect = bms_intersect(plan->allParam, context->defineParams);
+ if (bms_is_empty(intersect))
+ {
+ /* the subplan does not depend on params we are interested in */
+ bms_free(intersect);
+ return false;
+ }
+ bms_free(intersect);
+
+ /* scan target list */
+ if (expression_tree_walker((Node *) plan->targetlist,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ /* scan qual */
+ if (expression_tree_walker((Node *) plan->qual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+
+ /* Check additional node-type-specific fields */
+ switch (nodeTag(plan))
+ {
+ case T_Result:
+ if (expression_tree_walker((Node *) ((Result *) plan)->resconstantqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_SeqScan:
+ break;
+
+ case T_IndexScan:
+ if (expression_tree_walker((Node *) ((IndexScan *) plan)->indexqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_IndexOnlyScan:
+ if (expression_tree_walker((Node *) ((IndexOnlyScan *) plan)->indexqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_BitmapIndexScan:
+ if (expression_tree_walker((Node *) ((BitmapIndexScan *) plan)->indexqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_BitmapHeapScan:
+ if (expression_tree_walker((Node *) ((BitmapHeapScan *) plan)->bitmapqualorig,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_TidScan:
+ if (expression_tree_walker((Node *) ((TidScan *) plan)->tidquals,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_SubqueryScan:
+ if (determine_param_types(((SubqueryScan *) plan)->subplan, context))
+ return true;
+ break;
+
+ case T_FunctionScan:
+ if (expression_tree_walker((Node *) ((FunctionScan *) plan)->funcexpr,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_ValuesScan:
+ if (expression_tree_walker((Node *) ((ValuesScan *) plan)->values_lists,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_ModifyTable:
+ {
+ ListCell *l;
+
+ foreach(l, ((ModifyTable *) plan)->plans)
+ {
+ if (determine_param_types((Plan *) lfirst(l), context))
+ return true;
+ }
+ }
+ break;
+
+ case T_RemoteSubplan:
+ break;
+
+ case T_Append:
+ {
+ ListCell *l;
+
+ foreach(l, ((Append *) plan)->appendplans)
+ {
+ if (determine_param_types((Plan *) lfirst(l), context))
+ return true;
+ }
+ }
+ break;
+
+ case T_BitmapAnd:
+ {
+ ListCell *l;
+
+ foreach(l, ((BitmapAnd *) plan)->bitmapplans)
+ {
+ if (determine_param_types((Plan *) lfirst(l), context))
+ return true;
+ }
+ }
+ break;
+
+ case T_BitmapOr:
+ {
+ ListCell *l;
+
+ foreach(l, ((BitmapOr *) plan)->bitmapplans)
+ {
+ if (determine_param_types((Plan *) lfirst(l), context))
+ return true;
+ }
+ }
+ break;
+
+ case T_NestLoop:
+ if (expression_tree_walker((Node *) ((Join *) plan)->joinqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_MergeJoin:
+ if (expression_tree_walker((Node *) ((Join *) plan)->joinqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ if (expression_tree_walker((Node *) ((MergeJoin *) plan)->mergeclauses,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_HashJoin:
+ if (expression_tree_walker((Node *) ((Join *) plan)->joinqual,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ if (expression_tree_walker((Node *) ((HashJoin *) plan)->hashclauses,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_Limit:
+ if (expression_tree_walker((Node *) ((Limit *) plan)->limitOffset,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ if (expression_tree_walker((Node *) ((Limit *) plan)->limitCount,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_RecursiveUnion:
+ break;
+
+ case T_LockRows:
+ break;
+
+ case T_WindowAgg:
+ /*
+ * Fixed: the two offset-expression walks were nested (endOffset was
+ * only scanned when the startOffset walk matched) and neither walk
+ * returned true, so window frame offset Params were never resolved.
+ * Walk each offset expression independently, like the other cases.
+ */
+ if (expression_tree_walker((Node *) ((WindowAgg *) plan)->startOffset,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ if (expression_tree_walker((Node *) ((WindowAgg *) plan)->endOffset,
+ determine_param_types_walker,
+ (void *) context))
+ return true;
+ break;
+
+ case T_Hash:
+ case T_Agg:
+ case T_Material:
+ case T_Sort:
+ case T_Unique:
+ case T_SetOp:
+ case T_Group:
+ break;
+
+ default:
+ elog(ERROR, "unrecognized node type: %d",
+ (int) nodeTag(plan));
+ }
+
+ /* recurse into subplans */
+ return determine_param_types(plan->lefttree, context) ||
+ determine_param_types(plan->righttree, context);
+}
+
+
+RemoteSubplanState *
+ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags)
+{
+ RemoteStmt rstmt;
+ RemoteSubplanState *remotestate;
+ ResponseCombiner *combiner;
+ CombineType combineType;
+
+ remotestate = makeNode(RemoteSubplanState);
+ combiner = (ResponseCombiner *) remotestate;
+ /*
+ * We do not need to combine row counts if we will receive intermediate
+ * results or if we won't return row count.
+ */
+ if (IS_PGXC_DATANODE || estate->es_plannedstmt->commandType == CMD_SELECT)
+ {
+ combineType = COMBINE_TYPE_NONE;
+ remotestate->execOnAll = node->execOnAll;
+ }
+ else
+ {
+ if (node->execOnAll)
+ combineType = COMBINE_TYPE_SUM;
+ else
+ combineType = COMBINE_TYPE_SAME;
+ /*
+ * If we are updating replicated table we should run plan on all nodes.
+ * We are choosing single node only to read
+ */
+ remotestate->execOnAll = true;
+ }
+ remotestate->execNodes = list_copy(node->nodeList);
+ InitResponseCombiner(combiner, 0, combineType);
+ combiner->ss.ps.plan = (Plan *) node;
+ combiner->ss.ps.state = estate;
+
+ combiner->ss.ps.qual = NIL;
+
+ combiner->request_type = REQUEST_TYPE_QUERY;
+
+ ExecInitResultTupleSlot(estate, &combiner->ss.ps);
+ ExecAssignResultTypeFromTL((PlanState *) remotestate);
+
+ /*
+ * We optimize execution if we going to send down query to next level
+ */
+ remotestate->local_exec = false;
+ if (IS_PGXC_DATANODE)
+ {
+ if (remotestate->execNodes == NIL)
+ {
+ /*
+ * Special case, if subplan is not distributed, like Result, or
+ * query against catalog tables only.
+ * We are only interested in filtering out the subplan results and
+ * get only those we are interested in.
+ * XXX we may want to prevent multiple executions in this case
+ * either, to achieve this we will set single execNode on planning
+ * time and this case would never happen, this code branch could
+ * be removed.
+ */
+ remotestate->local_exec = true;
+ }
+ else if (!remotestate->execOnAll)
+ {
+ /*
+ * XXX We should change planner and remove this flag.
+ * We want only one node is producing the replicated result set,
+ * and planner should choose that node - it is too hard to determine
+ * right node at execution time, because it should be guaranteed
+ * that all consumers make the same decision.
+ * For now always execute replicated plan on local node to save
+ * resources.
+ */
+
+ /*
+ * Make sure local node is in execution list
+ */
+ if (list_member_int(remotestate->execNodes, PGXCNodeId-1))
+ {
+ list_free(remotestate->execNodes);
+ remotestate->execNodes = NIL;
+ remotestate->local_exec = true;
+ }
+ else
+ {
+ /*
+ * To support, we need to connect to some producer, so
+ * each producer should be prepared to serve rows for random
+ * number of consumers. It is hard, because new consumer may
+ * connect after producing is started, on the other hand,
+ * absence of expected consumer is a problem too.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("Getting replicated results from remote node is not supported")));
+ }
+ }
+ }
+
+ /*
+ * If we are going to execute subplan locally or doing explain initialize
+ * the subplan. Otherwise have remote node doing that.
+ */
+ if (remotestate->local_exec || (eflags & EXEC_FLAG_EXPLAIN_ONLY))
+ {
+ outerPlanState(remotestate) = ExecInitNode(outerPlan(node), estate,
+ eflags);
+ if (node->distributionNodes)
+ {
+ Oid distributionType = InvalidOid;
+ TupleDesc typeInfo;
+
+ typeInfo = combiner->ss.ps.ps_ResultTupleSlot->tts_tupleDescriptor;
+ if (node->distributionKey != InvalidAttrNumber)
+ {
+ Form_pg_attribute attr;
+ attr = typeInfo->attrs[node->distributionKey - 1];
+ distributionType = attr->atttypid;
+ }
+ /* Set up locator */
+ remotestate->locator = createLocator(node->distributionType,
+ RELATION_ACCESS_INSERT,
+ distributionType,
+ LOCATOR_LIST_LIST,
+ 0,
+ (void *) node->distributionNodes,
+ (void **) &remotestate->dest_nodes,
+ false);
+ }
+ else
+ remotestate->locator = NULL;
+ }
+
+ /*
+ * Encode subplan if it will be sent to remote nodes
+ */
+ if (remotestate->execNodes && !(eflags & EXEC_FLAG_EXPLAIN_ONLY))
+ {
+ ParamListInfo ext_params;
+ /* Encode plan if we are going to execute it on other nodes */
+ rstmt.type = T_RemoteStmt;
+ if (node->distributionType == LOCATOR_TYPE_NONE && IS_PGXC_DATANODE)
+ {
+ /*
+ * There are cases when planner can not determine distribution of a
+ * subplan, in particular it does not determine distribution of
+ * subquery nodes. Such subplans executed from current location
+ * (node) and combine all results, like from coordinator nodes.
+ * However, if there are multiple locations where distributed
+ * executor is running this node, and there are more of
+ * RemoteSubplan plan nodes in the subtree there will be a problem -
+ * Instances of the inner RemoteSubplan nodes will be using the same
+ * SharedQueue, causing error. To avoid this problem we should
+ * traverse the subtree and change SharedQueue name to make it
+ * unique.
+ */
+ RemoteSubplanMakeUnique((Node *) outerPlan(node), PGXCNodeId);
+ }
+ rstmt.planTree = outerPlan(node);
+ /*
+ * If datanode launch further execution of a command it should tell
+ * it is a SELECT, otherwise secondary data nodes won't return tuples
+ * expecting there will be nothing to return.
+ */
+ if (IsA(outerPlan(node), ModifyTable))
+ {
+ rstmt.commandType = estate->es_plannedstmt->commandType;
+ rstmt.hasReturning = estate->es_plannedstmt->hasReturning;
+ rstmt.resultRelations = estate->es_plannedstmt->resultRelations;
+ }
+ else
+ {
+ rstmt.commandType = CMD_SELECT;
+ rstmt.hasReturning = false;
+ rstmt.resultRelations = NIL;
+ }
+ rstmt.rtable = estate->es_range_table;
+ rstmt.subplans = estate->es_plannedstmt->subplans;
+ rstmt.nParamExec = estate->es_plannedstmt->nParamExec;
+ ext_params = estate->es_param_list_info;
+ rstmt.nParamRemote = (ext_params ? ext_params->numParams : 0) +
+ bms_num_members(node->scan.plan.allParam);
+ if (rstmt.nParamRemote > 0)
+ {
+ Bitmapset *tmpset;
+ int i;
+ int paramno;
+
+ /* Allocate enough space */
+ rstmt.remoteparams = (RemoteParam *) palloc(rstmt.nParamRemote *
+ sizeof(RemoteParam));
+ paramno = 0;
+ if (ext_params)
+ {
+ for (i = 0; i < ext_params->numParams; i++)
+ {
+ ParamExternData *param = &ext_params->params[i];
+ /*
+ * If parameter type is not yet defined but can be defined
+ * do that
+ */
+ if (!OidIsValid(param->ptype) && ext_params->paramFetch)
+ (*ext_params->paramFetch) (ext_params, i + 1);
+ /*
+ * If parameter type is still not defined assume it is
+ * unused
+ */
+ if (!OidIsValid(param->ptype))
+ continue;
+
+ rstmt.remoteparams[paramno].paramkind = PARAM_EXTERN;
+ rstmt.remoteparams[paramno].paramid = i + 1;
+ rstmt.remoteparams[paramno].paramtype = param->ptype;
+ paramno++;
+ }
+ /* store actual number of parameters */
+ rstmt.nParamRemote = paramno;
+ }
+
+ if (!bms_is_empty(node->scan.plan.allParam))
+ {
+ Bitmapset *defineParams = NULL;
+ tmpset = bms_copy(node->scan.plan.allParam);
+ while ((i = bms_first_member(tmpset)) >= 0)
+ {
+ ParamExecData *prmdata;
+
+ prmdata = &(estate->es_param_exec_vals[i]);
+ rstmt.remoteparams[paramno].paramkind = PARAM_EXEC;
+ rstmt.remoteparams[paramno].paramid = i;
+ rstmt.remoteparams[paramno].paramtype = prmdata->ptype;
+ /* Will scan plan tree to find out data type of the param */
+ if (prmdata->ptype == InvalidOid)
+ defineParams = bms_add_member(defineParams, i);
+ paramno++;
+ }
+ /* store actual number of parameters */
+ rstmt.nParamRemote = paramno;
+ bms_free(tmpset);
+ if (!bms_is_empty(defineParams))
+ {
+ struct find_params_context context;
+ bool all_found;
+
+ context.rparams = rstmt.remoteparams;
+ context.defineParams = defineParams;
+
+ all_found = determine_param_types(node->scan.plan.lefttree,
+ &context);
+ /*
+ * Remove not defined params from the list of remote params.
+ * If they are not referenced no need to send them down
+ */
+ if (!all_found)
+ {
+ for (i = 0; i < rstmt.nParamRemote; i++)
+ {
+ if (rstmt.remoteparams[i].paramkind == PARAM_EXEC &&
+ bms_is_member(rstmt.remoteparams[i].paramid,
+ context.defineParams))
+ {
+ /* Copy last parameter inplace */
+ rstmt.nParamRemote--;
+ if (i < rstmt.nParamRemote)
+ rstmt.remoteparams[i] =
+ rstmt.remoteparams[rstmt.nParamRemote];
+ /* keep current in the same position */
+ i--;
+ }
+ }
+ }
+ bms_free(context.defineParams);
+ }
+ }
+ remotestate->nParamRemote = rstmt.nParamRemote;
+ remotestate->remoteparams = rstmt.remoteparams;
+ }
+ else
+ rstmt.remoteparams = NULL;
+ rstmt.rowMarks = estate->es_plannedstmt->rowMarks;
+ rstmt.distributionKey = node->distributionKey;
+ rstmt.distributionType = node->distributionType;
+ rstmt.distributionNodes = node->distributionNodes;
+ rstmt.distributionRestrict = node->distributionRestrict;
+
+ set_portable_output(true);
+ remotestate->subplanstr = nodeToString(&rstmt);
+ set_portable_output(false);
+
+ /*
+ * Connect to remote nodes and send down subplan
+ */
+ if (!(eflags & EXEC_FLAG_SUBPLAN))
+ ExecFinishInitRemoteSubplan(remotestate);
+ }
+ remotestate->bound = false;
+ /*
+ * It does not make sense to merge sort if there is only one tuple source.
+ * By the contract it is already sorted
+ */
+ if (node->sort && remotestate->execOnAll &&
+ list_length(remotestate->execNodes) > 1)
+ combiner->merge_sort = true;
+
+ return remotestate;
+}
+
+
+void
+ExecFinishInitRemoteSubplan(RemoteSubplanState *node)
+{
+ ResponseCombiner *combiner = (ResponseCombiner *) node;
+ RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan;
+ Oid *paramtypes = NULL;
+ GlobalTransactionId gxid = InvalidGlobalTransactionId;
+ Snapshot snapshot;
+ TimestampTz timestamp;
+ int i;
+ bool is_read_only;
+ char cursor[NAMEDATALEN];
+
+ /*
+ * Name is required to store plan as a statement
+ */
+ Assert(plan->cursor);
+
+ if (plan->unique)
+ snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique);
+ else
+ strncpy(cursor, plan->cursor, NAMEDATALEN);
+
+ /* If it is already fully initialized nothing to do */
+ if (combiner->connections)
+ return;
+
+ /* local only or explain only execution */
+ if (node->subplanstr == NULL)
+ return;
+
+ /*
+ * Acquire connections and send down subplan where it will be stored
+ * as a prepared statement.
+ * That does not require transaction id or snapshot, so do not send them
+ * here, postpone till bind.
+ */
+ if (node->execOnAll)
+ {
+ PGXCNodeAllHandles *pgxc_connections;
+ pgxc_connections = get_handles(node->execNodes, NIL, false);
+ combiner->conn_count = pgxc_connections->dn_conn_count;
+ combiner->connections = pgxc_connections->datanode_handles;
+ combiner->current_conn = 0;
+ pfree(pgxc_connections);
+ }
+ else
+ {
+ combiner->connections = (PGXCNodeHandle **) palloc(sizeof(PGXCNodeHandle *));
+ combiner->connections[0] = get_any_handle(node->execNodes);
+ combiner->conn_count = 1;
+ combiner->current_conn = 0;
+ }
+
+ gxid = GetCurrentTransactionId();
+ if (!GlobalTransactionIdIsValid(gxid))
+ {
+ combiner->conn_count = 0;
+ pfree(combiner->connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to get next transaction ID")));
+ }
+
+ /* extract parameter data types */
+ if (node->nParamRemote > 0)
+ {
+ paramtypes = (Oid *) palloc(node->nParamRemote * sizeof(Oid));
+ for (i = 0; i < node->nParamRemote; i++)
+ paramtypes[i] = node->remoteparams[i].paramtype;
+ }
+ /* send down subplan */
+ snapshot = GetActiveSnapshot();
+ timestamp = GetCurrentGTMStartTimestamp();
+ /*
+ * Datanode should not send down statements that may modify
+ * the database. Postgres assumes that all sessions under the same
+ * postmaster have different xids. That may cause a locking problem.
+ * Shared locks acquired for reading still work fine.
+ */
+ is_read_only = IS_PGXC_DATANODE ||
+ !IsA(outerPlan(plan), ModifyTable);
+
+ for (i = 0; i < combiner->conn_count; i++)
+ {
+ PGXCNodeHandle *connection = combiner->connections[i];
+
+ if (pgxc_node_begin(1, &connection, gxid, true,
+ is_read_only, PGXC_NODE_DATANODE))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Could not begin transaction on data node.")));
+
+ if (pgxc_node_send_timestamp(connection, timestamp))
+ {
+ combiner->conn_count = 0;
+ pfree(combiner->connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command to data nodes")));
+ }
+ if (snapshot && pgxc_node_send_snapshot(connection, snapshot))
+ {
+ combiner->conn_count = 0;
+ pfree(combiner->connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command to data nodes")));
+ }
+ pgxc_node_send_plan(connection, cursor, "Remote Subplan",
+ node->subplanstr, node->nParamRemote, paramtypes);
+ if (pgxc_node_flush(connection))
+ {
+ combiner->conn_count = 0;
+ pfree(combiner->connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send subplan to data nodes")));
+ }
+ }
+}
+
+
+static void
+append_param_data(StringInfo buf, Oid ptype, Datum value, bool isnull)
+{
+ uint32 n32;
+
+ if (isnull)
+ {
+ n32 = htonl(-1);
+ appendBinaryStringInfo(buf, (char *) &n32, 4);
+ }
+ else
+ {
+ Oid typOutput;
+ bool typIsVarlena;
+ Datum pval;
+ char *pstring;
+ int len;
+
+ /* Get info needed to output the value */
+ getTypeOutputInfo(ptype, &typOutput, &typIsVarlena);
+
+ /*
+ * If we have a toasted datum, forcibly detoast it here to avoid
+ * memory leakage inside the type's output routine.
+ */
+ if (typIsVarlena)
+ pval = PointerGetDatum(PG_DETOAST_DATUM(value));
+ else
+ pval = value;
+
+ /* Convert Datum to string */
+ pstring = OidOutputFunctionCall(typOutput, pval);
+
+ /* copy data to the buffer */
+ len = strlen(pstring);
+ n32 = htonl(len);
+ appendBinaryStringInfo(buf, (char *) &n32, 4);
+ appendBinaryStringInfo(buf, pstring, len);
+ }
+}
+
+
+static int encode_parameters(int nparams, RemoteParam *remoteparams,
+ PlanState *planstate, char** result)
+{
+ EState *estate = planstate->state;
+ StringInfoData buf;
+ uint16 n16;
+ int i;
+ ExprContext *econtext;
+ MemoryContext oldcontext;
+
+ if (planstate->ps_ExprContext == NULL)
+ ExecAssignExprContext(estate, planstate);
+
+ econtext = planstate->ps_ExprContext;
+ oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);
+ MemoryContextReset(econtext->ecxt_per_tuple_memory);
+
+ initStringInfo(&buf);
+
+ /* Number of parameter values */
+ n16 = htons(nparams);
+ appendBinaryStringInfo(&buf, (char *) &n16, 2);
+
+ /* Parameter values */
+ for (i = 0; i < nparams; i++)
+ {
+ RemoteParam *rparam = &remoteparams[i];
+ int ptype = rparam->paramtype;
+ if (rparam->paramkind == PARAM_EXTERN)
+ {
+ ParamExternData *param;
+ param = &(estate->es_param_list_info->params[rparam->paramid - 1]);
+ append_param_data(&buf, ptype, param->value, param->isnull);
+ }
+ else
+ {
+ ParamExecData *param;
+ param = &(estate->es_param_exec_vals[rparam->paramid]);
+ if (param->execPlan)
+ {
+ /* Parameter not evaluated yet, so go do it */
+ ExecSetParamPlan((SubPlanState *) param->execPlan,
+ planstate->ps_ExprContext);
+ /* ExecSetParamPlan should have processed this param... */
+ Assert(param->execPlan == NULL);
+ }
+ append_param_data(&buf, ptype, param->value, param->isnull);
+ }
+ }
+
+ /* Take data from the buffer */
+ *result = palloc(buf.len);
+ memcpy(*result, buf.data, buf.len);
+ MemoryContextSwitchTo(oldcontext);
+ return buf.len;
+}
+
+
+TupleTableSlot *
+ExecRemoteSubplan(RemoteSubplanState *node)
+{
+ ResponseCombiner *combiner = (ResponseCombiner *) node;
+ RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan;
+ EState *estate = combiner->ss.ps.state;
+ TupleTableSlot *resultslot = combiner->ss.ps.ps_ResultTupleSlot;
+
+primary_mode_phase_two:
+ if (!node->bound)
+ {
+ int fetch = 0;
+ int paramlen = 0;
+ char *paramdata = NULL;
+ /*
+ * Conditions when we want to execute query on the primary node first:
+ * Coordinator running replicated ModifyTable on multiple nodes
+ */
+ bool primary_mode = combiner->probing_primary ||
+ (IS_PGXC_COORDINATOR &&
+ combiner->combine_type == COMBINE_TYPE_SAME &&
+ OidIsValid(primary_data_node) &&
+ combiner->conn_count > 1);
+ char cursor[NAMEDATALEN];
+
+ if (plan->cursor)
+ {
+ fetch = 1000;
+ if (plan->unique)
+ snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique);
+ else
+ strncpy(cursor, plan->cursor, NAMEDATALEN);
+ }
+ else
+ cursor[0] = '\0';
+
+ /*
+ * Send down all available parameters, if any is used by the plan
+ */
+ if (estate->es_param_list_info ||
+ !bms_is_empty(plan->scan.plan.allParam))
+ paramlen = encode_parameters(node->nParamRemote,
+ node->remoteparams,
+ &combiner->ss.ps,
+ &paramdata);
+
+ /*
+ * The subplan being rescanned, need to restore connections and
+ * re-bind the portal
+ */
+ if (combiner->cursor)
+ {
+ int i;
+
+ /*
+ * On second phase of primary mode connections are properly set,
+ * so do not copy.
+ */
+ if (!combiner->probing_primary)
+ {
+ combiner->conn_count = combiner->cursor_count;
+ memcpy(combiner->connections, combiner->cursor_connections,
+ combiner->cursor_count * sizeof(PGXCNodeHandle *));
+ }
+
+ for (i = 0; i < combiner->conn_count; i++)
+ {
+ PGXCNodeHandle *conn = combiner->connections[i];
+
+ CHECK_OWNERSHIP(conn, combiner);
+
+ /* close previous cursor only on phase 1 */
+ if (!primary_mode || !combiner->probing_primary)
+ pgxc_node_send_close(conn, false, combiner->cursor);
+
+ /*
+ * If we now should probe primary, skip execution on non-primary
+ * nodes
+ */
+ if (primary_mode && !combiner->probing_primary &&
+ conn->nodeoid != primary_data_node)
+ continue;
+
+ /* rebind */
+ pgxc_node_send_bind(conn, combiner->cursor, combiner->cursor,
+ paramlen, paramdata);
+ /* execute */
+ pgxc_node_send_execute(conn, combiner->cursor, fetch);
+ /* submit */
+ if (pgxc_node_send_flush(conn))
+ {
+ combiner->conn_count = 0;
+ pfree(combiner->connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command to data nodes")));
+ }
+
+ /*
+ * There could be only one primary node, but can not leave the
+ * loop now, because we need to close cursors.
+ */
+ if (primary_mode && !combiner->probing_primary)
+ {
+ combiner->current_conn = i;
+ }
+ }
+ }
+ else if (node->execNodes)
+ {
+ CommandId cid;
+ int i;
+
+ /*
+ * There is a prepared statement, connections should be already here
+ */
+ Assert(combiner->conn_count > 0);
+
+ combiner->extended_query = true;
+ cid = estate->es_snapshot->curcid;
+
+ for (i = 0; i < combiner->conn_count; i++)
+ {
+ PGXCNodeHandle *conn = combiner->connections[i];
+
+ CHECK_OWNERSHIP(conn, combiner);
+
+ /*
+ * If we now should probe primary, skip execution on non-primary
+ * nodes
+ */
+ if (primary_mode && !combiner->probing_primary &&
+ conn->nodeoid != primary_data_node)
+ continue;
+
+ /*
+ * Update Command Id. Other command may be executed after we
+ * prepare and advanced Command Id. We should use one that
+ * was active at the moment when command started.
+ */
+ if (pgxc_node_send_cmd_id(conn, cid))
+ {
+ combiner->conn_count = 0;
+ pfree(combiner->connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command to data nodes")));
+ }
+
+ /* bind */
+ pgxc_node_send_bind(conn, cursor, cursor, paramlen, paramdata);
+ /* execute */
+ pgxc_node_send_execute(conn, cursor, fetch);
+ /* submit */
+ if (pgxc_node_send_flush(conn))
+ {
+ combiner->conn_count = 0;
+ pfree(combiner->connections);
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to send command to data nodes")));
+ }
+
+ /*
+ * There could be only one primary node, so if we executed
+ * subquery on the phase one of primary mode we can leave the
+ * loop now.
+ */
+ if (primary_mode && !combiner->probing_primary)
+ {
+ combiner->current_conn = i;
+ break;
+ }
+ }
+
+ /*
+ * On second phase of primary mode connections are backed up
+ * already, so do not copy.
+ */
+ if (primary_mode)
+ {
+ if (combiner->probing_primary)
+ {
+ combiner->cursor = pstrdup(cursor);
+ }
+ else
+ {
+ combiner->cursor_count = combiner->conn_count;
+ combiner->cursor_connections = (PGXCNodeHandle **) palloc(
+ combiner->conn_count * sizeof(PGXCNodeHandle *));
+ memcpy(combiner->cursor_connections, combiner->connections,
+ combiner->conn_count * sizeof(PGXCNodeHandle *));
+ }
+ }
+ else
+ {
+ combiner->cursor = pstrdup(cursor);
+ combiner->cursor_count = combiner->conn_count;
+ combiner->cursor_connections = (PGXCNodeHandle **) palloc(
+ combiner->conn_count * sizeof(PGXCNodeHandle *));
+ memcpy(combiner->cursor_connections, combiner->connections,
+ combiner->conn_count * sizeof(PGXCNodeHandle *));
+ }
+ }
+
+ if (combiner->merge_sort)
+ {
+ /*
+ * Requests are already made and sorter can fetch tuples to populate
+ * sort buffer.
+ */
+ combiner->tuplesortstate = tuplesort_begin_merge(
+ resultslot->tts_tupleDescriptor,
+ plan->sort->numCols,
+ plan->sort->sortColIdx,
+ plan->sort->sortOperators,
+ plan->sort->sortCollations,
+ plan->sort->nullsFirst,
+ combiner,
+ work_mem);
+ }
+ if (primary_mode)
+ {
+ if (combiner->probing_primary)
+ {
+ combiner->probing_primary = false;
+ node->bound = true;
+ }
+ else
+ combiner->probing_primary = true;
+ }
+ else
+ node->bound = true;
+ }
+
+ if (combiner->tuplesortstate)
+ {
+ if (tuplesort_gettupleslot((Tuplesortstate *) combiner->tuplesortstate,
+ true, resultslot))
+ return resultslot;
+ }
+ else
+ {
+ TupleTableSlot *slot = FetchTuple(combiner);
+ if (!TupIsNull(slot))
+ return slot;
+ else if (combiner->probing_primary)
+ /* phase1 is successfully completed, run on other nodes */
+ goto primary_mode_phase_two;
+ }
+ if (combiner->errorMessage)
+ {
+ char *code = combiner->errorCode;
+ if (combiner->errorDetail)
+ ereport(ERROR,
+ (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+ errmsg("%s", combiner->errorMessage), errdetail("%s", combiner->errorDetail) ));
+ else
+ ereport(ERROR,
+ (errcode(MAKE_SQLSTATE(code[0], code[1], code[2], code[3], code[4])),
+ errmsg("%s", combiner->errorMessage)));
+ }
+ return NULL;
+}
+
+
+void
+ExecReScanRemoteSubplan(RemoteSubplanState *node)
+{
+ ResponseCombiner *combiner = (ResponseCombiner *)node;
+
+ /*
+ * If we haven't queried remote nodes yet, just return. If outerplan's
+ * chgParam is not NULL then it will be re-scanned by ExecProcNode,
+ * else - no reason to re-scan it at all.
+ */
+ if (!node->bound)
+ return;
+
+ /*
+ * If we execute locally rescan local copy of the plan
+ */
+ if (outerPlanState(node))
+ ExecReScan(outerPlanState(node));
+
+ /*
+ * Consume any possible pending input
+ */
+ pgxc_connections_cleanup(combiner);
+
+ /* misc cleanup */
+ combiner->command_complete_count = 0;
+ combiner->description_count = 0;
+
+ /*
+ * Force query is re-bound with new parameters
+ */
+ node->bound = false;
+}
+
+
+void
+ExecEndRemoteSubplan(RemoteSubplanState *node)
+{
+ ResponseCombiner *combiner = (ResponseCombiner *)node;
+ RemoteSubplan *plan = (RemoteSubplan *) combiner->ss.ps.plan;
+ int i;
+
+ if (outerPlanState(node))
+ ExecEndNode(outerPlanState(node));
+ if (node->locator)
+ freeLocator(node->locator);
+
+ /*
+ * Consume any possible pending input
+ */
+ if (node->bound)
+ pgxc_connections_cleanup(combiner);
+
+ /*
+ * Update coordinator statistics
+ */
+ if (IS_PGXC_COORDINATOR)
+ {
+ EState *estate = combiner->ss.ps.state;
+
+ if (estate->es_num_result_relations > 0 && estate->es_processed > 0)
+ {
+ switch (estate->es_plannedstmt->commandType)
+ {
+ case CMD_INSERT:
+ /* One statement can insert into only one relation */
+ pgstat_count_remote_insert(
+ estate->es_result_relations[0].ri_RelationDesc,
+ estate->es_processed);
+ break;
+ case CMD_UPDATE:
+ case CMD_DELETE:
+ {
+ /*
+ * We cannot determine here how many rows were updated
+ * or deleted in each table, so assume the same number of
+ * affected rows in each table.
+ * If the resulting number of rows is 0 because of rounding,
+ * increment each counter by at least 1.
+ */
+ int i;
+ int n;
+ bool update;
+
+ update = (estate->es_plannedstmt->commandType == CMD_UPDATE);
+ n = estate->es_processed / estate->es_num_result_relations;
+ if (n == 0)
+ n = 1;
+ for (i = 0; i < estate->es_num_result_relations; i++)
+ {
+ Relation r;
+ r = estate->es_result_relations[i].ri_RelationDesc;
+ if (update)
+ pgstat_count_remote_update(r, n);
+ else
+ pgstat_count_remote_delete(r, n);
+ }
+ }
+ break;
+ default:
+ /* nothing to count */
+ break;
+ }
+ }
+ }
+
+ /*
+ * Close portals. While cursor_connections exist there are open portals
+ */
+ if (combiner->cursor)
+ {
+ /* Restore connections where there are active statements */
+ combiner->conn_count = combiner->cursor_count;
+ memcpy(combiner->connections, combiner->cursor_connections,
+ combiner->cursor_count * sizeof(PGXCNodeHandle *));
+ for (i = 0; i < combiner->cursor_count; i++)
+ {
+ PGXCNodeHandle *conn;
+
+ conn = combiner->cursor_connections[i];
+
+ CHECK_OWNERSHIP(conn, combiner);
+
+ if (pgxc_node_send_close(conn, false, combiner->cursor) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to close data node cursor")));
+ }
+ /* The cursor stuff is not needed */
+ combiner->cursor = NULL;
+ combiner->cursor_count = 0;
+ pfree(combiner->cursor_connections);
+ combiner->cursor_connections = NULL;
+ }
+
+ /* Close statements, even if they never were bound */
+ for (i = 0; i < combiner->conn_count; i++)
+ {
+ PGXCNodeHandle *conn;
+ char cursor[NAMEDATALEN];
+
+ if (plan->cursor)
+ {
+ if (plan->unique)
+ snprintf(cursor, NAMEDATALEN, "%s_%d", plan->cursor, plan->unique);
+ else
+ strncpy(cursor, plan->cursor, NAMEDATALEN);
+ }
+ else
+ cursor[0] = '\0';
+
+ conn = combiner->connections[i];
+
+ CHECK_OWNERSHIP(conn, combiner);
+
+ if (pgxc_node_send_close(conn, true, cursor) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to close data node statement")));
+ /* Send SYNC and wait for ReadyForQuery */
+ if (pgxc_node_send_sync(conn) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to synchronize data node")));
+ /*
+ * Formally connection is not in QUERY state, we set the state to read
+ * CloseDone and ReadyForQuery responses. Upon receiving ReadyForQuery
+ * state will be changed back to IDLE and conn->coordinator will be
+ * cleared.
+ */
+ conn->state = DN_CONNECTION_STATE_CLOSE;
+ }
+
+ while (combiner->conn_count > 0)
+ {
+ if (pgxc_node_receive(combiner->conn_count,
+ combiner->connections, NULL))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to close remote subplan")));
+ i = 0;
+ while (i < combiner->conn_count)
+ {
+ int res = handle_response(combiner->connections[i], combiner);
+ if (res == RESPONSE_EOF)
+ {
+ i++;
+ }
+ else if (res == RESPONSE_READY)
+ {
+ /* Done, connection is ready for query */
+ if (--combiner->conn_count > i)
+ combiner->connections[i] =
+ combiner->connections[combiner->conn_count];
+ }
+ else if (res == RESPONSE_DATAROW)
+ {
+ /*
+ * If we are finishing slowly running remote subplan while it
+ * is still working (because of Limit, for example) it may
+ * produce one or more tuples between connection cleanup and
+ * handling Close command. One tuple does not cause any problem,
+ * but if it will not be read the next tuple will trigger
+ * assertion failure. So if we got a tuple, just read and
+ * discard it here.
+ */
+ pfree(combiner->currentRow);
+ combiner->currentRow = NULL;
+ }
+ /* Ignore other possible responses */
+ }
+ }
+
+ ValidateAndCloseCombiner(combiner);
+ pfree(node);
+}
+#endif
+
+
/*
* pgxc_node_report_error
* Throw error from Datanode if any.
*/
+#ifdef XCP
+static void
+pgxc_node_report_error(ResponseCombiner *combiner)
+#else
static void
pgxc_node_report_error(RemoteQueryState *combiner)
+#endif
{
/* If no combiner, nothing to do */
if (!combiner)
@@ -4885,231 +9152,3 @@ void AtEOXact_DBCleanup(bool isCommit)
dbcleanup_info.fparams = NULL;
}
}
-
-static TupleTableSlot *
-getrow_for_tapesort(RemoteQueryState *combiner, TupleTableSlot *scanslot)
-{
- int tapenum = combiner->rqs_tapenum;
- PGXCNodeHandle *conn = combiner->connections[tapenum];
- /*
- * If connection is active (potentially has data to read) we can get node
- * number from the connection. If connection is not active (we have read all
- * available data rows) and if we have buffered data from that connection
- * the node number is stored in combiner->tapenodes[tapenum].
- * If connection is inactive and no buffered data we have EOF condition
- */
- int nid;
- ListCell *lc;
- ListCell *prev = NULL;
-
- /* May it ever happen ?! */
- if (!conn && !combiner->tapenodes)
- ereport(ERROR,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg("Failed to fetch from data node cursor")));
-
- nid = conn ? PGXCNodeGetNodeId(conn->nodeoid, PGXC_NODE_DATANODE) : combiner->tapenodes[tapenum];
-
- if (nid < 0)
- ereport(ERROR,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg("Node id %d is incorrect", nid)));
-
- /*
- * If there are buffered rows iterate over them and get first from
- * the requested tape
- */
- foreach (lc, combiner->rowBuffer)
- {
- RemoteDataRow dataRow = (RemoteDataRow) lfirst(lc);
- if (dataRow->msgnode == nid)
- {
- combiner->currentRow = *dataRow;
- combiner->rowBuffer = list_delete_cell(combiner->rowBuffer, lc, prev);
- CopyDataRowTupleToSlot(combiner, scanslot);
- return scanslot;
- }
- prev = lc;
- }
-
- /* Nothing is found in the buffer, check for EOF */
- if (conn == NULL)
- {
- ExecClearTuple(scanslot);
- return scanslot;
- }
-
- /* The connection is executing a query but not for this RemoteQueryState.
- * Before sending the query, it must have buffered the rows for the query of
- * this RemoteQueryState, which we have consumed already. So nothing do
- * here. Just return a NULL tuple and mark the connection as done
- */
- if (conn->state == DN_CONNECTION_STATE_QUERY && conn->combiner != combiner)
- {
- combiner->connections[tapenum] = NULL;
- ExecClearTuple(scanslot);
- return scanslot;
- }
-
- /* Read data from the connection until get a row or EOF */
- for (;;)
- {
- switch (handle_response(conn, combiner))
- {
- case RESPONSE_SUSPENDED:
- /* Send Execute to request next row */
- Assert(combiner->cursor);
- if (pgxc_node_send_execute(conn, combiner->cursor, 1) != 0)
- ereport(ERROR,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg("Failed to fetch from data node cursor")));
- if (pgxc_node_send_sync(conn) != 0)
- ereport(ERROR,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg("Failed to fetch from data node cursor")));
- conn->state = DN_CONNECTION_STATE_QUERY;
- conn->combiner = combiner;
- /* fallthru */
- case RESPONSE_EOF:
- /* receive more data */
- if (pgxc_node_receive(1, &conn, NULL))
- ereport(ERROR,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg("%s", conn->error)));
- break;
-
- case RESPONSE_COMPLETE:
- combiner->connections[tapenum] = NULL;
- ExecClearTuple(scanslot);
- return scanslot;
- break;
-
- case RESPONSE_DATAROW:
- CopyDataRowTupleToSlot(combiner, scanslot);
- return scanslot;
- break;
-
- default:
- ereport(ERROR,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg("Unexpected response from the data nodes")));
- }
- }
-
- /*
- * Didn't get any row and also didn't get a RESPONSE_COMPLETE (otherwise we
- * would have returned from there with this tape nullified). This should
- * never happen. Throw an error.
- */
- ereport(ERROR,
- (errcode(ERRCODE_INTERNAL_ERROR),
- errmsg("Did not get response complete message for the connection.")));
-}
-
-
-/* --------------------------------
- * SetDataRowForIntParams: Form a BIND data row for internal parameters.
- * This function is called when the data for the parameters of remote
- * statement resides in some plan slot of an internally generated remote
- * statement rather than from some extern params supplied by the caller of the
- * query. Currently DML is the only case where we generate a query with
- * internal parameters.
- * The parameter data is constructed from the slot data, and stored in
- * RemoteQueryState.paramval_data.
- * At the same time, remote parameter types are inferred from the slot
- * tuple descriptor, and stored in RemoteQueryState.rqs_param_types.
- * On subsequent calls, these param types are re-used.
- * The slot itself is undisturbed.
- * --------------------------------
- */
-static void
-SetDataRowForIntParams(TupleTableSlot *slot, RemoteQueryState *rq_state)
-{
- TupleDesc tdesc = slot->tts_tupleDescriptor;
- int att_index;
-
- Assert(tdesc != NULL);
-
- /*
- * Infer param types from the tuple desc. But we have to do it only the
- * first time: the interal parameters remain the same while processing all
- * the source data rows because the data slot tupdesc never changes.
- * Even though we can determine the internal param types during planning, we
- * want to do it here: we don't want to set the param types and param data
- * at two different places. Doing them together here helps us to make sure
- * that the order of param types are in line with the order of the param
- * data.
- */
- if (rq_state->rqs_num_params == 0)
- {
- rq_state->rqs_num_params = tdesc->natts;
- rq_state->rqs_param_types =
- (Oid *) palloc(sizeof(Oid) * rq_state->rqs_num_params);
- for (att_index = 0; att_index < rq_state->rqs_num_params; att_index++)
- rq_state->rqs_param_types[att_index] = tdesc->attrs[att_index]->atttypid;
- }
-
- /* if we already have datarow make a copy */
- if (slot->tts_dataRow)
- {
- rq_state->paramval_data = (char *)palloc(slot->tts_dataLen);
- memcpy(rq_state->paramval_data, slot->tts_dataRow, slot->tts_dataLen);
- rq_state->paramval_len = slot->tts_dataLen;
- }
- else
- {
- StringInfoData buf;
- uint16 n16;
-
- initStringInfo(&buf);
- /* Number of parameter values */
- n16 = htons(tdesc->natts);
- appendBinaryStringInfo(&buf, (char *) &n16, 2);
-
- /* ensure we have all values */
- slot_getallattrs(slot);
- for (att_index = 0; att_index < tdesc->natts; att_index++)
- {
- uint32 n32;
-
- if (slot->tts_isnull[att_index])
- {
- n32 = htonl(-1);
- appendBinaryStringInfo(&buf, (char *) &n32, 4);
- }
- else
- {
- Form_pg_attribute attr = tdesc->attrs[att_index];
- Oid typOutput;
- bool typIsVarlena;
- Datum pval;
- char *pstring;
- int len;
-
- /* Get info needed to output the value */
- getTypeOutputInfo(attr->atttypid, &typOutput, &typIsVarlena);
- /*
- * If we have a toasted datum, forcibly detoast it here to avoid
- * memory leakage inside the type's output routine.
- */
- if (typIsVarlena)
- pval = PointerGetDatum(PG_DETOAST_DATUM(slot->tts_values[att_index]));
- else
- pval = slot->tts_values[att_index];
-
- /* Convert Datum to string */
- pstring = OidOutputFunctionCall(typOutput, pval);
-
- /* copy data to the buffer */
- len = strlen(pstring);
- n32 = htonl(len);
- appendBinaryStringInfo(&buf, (char *) &n32, 4);
- appendBinaryStringInfo(&buf, pstring, len);
- }
- }
-
- /* Assign the newly allocated data row to paramval */
- rq_state->paramval_data = buf.data;
- rq_state->paramval_len = buf.len;
- }
-}
diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c
index 2dd460efd6..0431a2257b 100644
--- a/src/backend/pgxc/pool/pgxcnode.c
+++ b/src/backend/pgxc/pool/pgxcnode.c
@@ -6,6 +6,11 @@
* Datanodes and Coordinators
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
@@ -50,6 +55,12 @@
#include "utils/lsyscache.h"
#include "utils/formatting.h"
#include "../interfaces/libpq/libpq-fe.h"
+#ifdef XCP
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "pgxc/pause.h"
+#include "utils/snapmgr.h"
+#endif
#define CMD_ID_MSG_LEN 8
@@ -75,6 +86,30 @@ static PGXCNodeHandle *co_handles = NULL;
int NumDataNodes;
int NumCoords;
+
+#ifdef XCP
+volatile bool HandlesInvalidatePending = false;
+
+/*
+ * Session and transaction parameters need to be set on newly connected
+ * remote nodes.
+ */
+static HTAB *session_param_htab = NULL;
+static HTAB *local_param_htab = NULL;
+static StringInfo session_params;
+static StringInfo local_params;
+
+typedef struct
+{
+ NameData name;
+ NameData value;
+} ParamEntry;
+
+
+static bool DoInvalidateRemoteHandles(void);
+#endif
+
+
static void pgxc_node_init(PGXCNodeHandle *handle, int sock);
static void pgxc_node_free(PGXCNodeHandle *handle);
static void pgxc_node_all_free(void);
@@ -100,6 +135,7 @@ init_pgxc_handle(PGXCNodeHandle *pgxc_handle)
pgxc_handle->outSize = 16 * 1024;
pgxc_handle->outBuffer = (char *) palloc(pgxc_handle->outSize);
pgxc_handle->inSize = 16 * 1024;
+
pgxc_handle->inBuffer = (char *) palloc(pgxc_handle->inSize);
pgxc_handle->combiner = NULL;
pgxc_handle->inStart = 0;
@@ -124,6 +160,10 @@ InitMultinodeExecutor(bool is_force)
{
int count;
Oid *coOids, *dnOids;
+#ifdef XCP
+ MemoryContext oldcontext;
+#endif
+
/* Free all the existing information first */
if (is_force)
@@ -140,6 +180,14 @@ InitMultinodeExecutor(bool is_force)
/* Get classified list of node Oids */
PgxcNodeGetOids(&coOids, &dnOids, &NumCoords, &NumDataNodes, true);
+#ifdef XCP
+ /*
+ * Coordinator and datanode handles should be available during all the
+ * session lifetime
+ */
+ oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+#endif
+
/* Do proper initialization of handles */
if (NumDataNodes > 0)
dn_handles = (PGXCNodeHandle *)
@@ -170,6 +218,28 @@ InitMultinodeExecutor(bool is_force)
coord_count = 0;
PGXCNodeId = 0;
+#ifdef XCP
+ MemoryContextSwitchTo(oldcontext);
+
+ if (IS_PGXC_COORDINATOR)
+ {
+ for (count = 0; count < NumCoords; count++)
+ {
+ if (pg_strcasecmp(PGXCNodeName,
+ get_pgxc_nodename(co_handles[count].nodeoid)) == 0)
+ PGXCNodeId = count + 1;
+ }
+ }
+ else /* DataNode */
+ {
+ for (count = 0; count < NumDataNodes; count++)
+ {
+ if (pg_strcasecmp(PGXCNodeName,
+ get_pgxc_nodename(dn_handles[count].nodeoid)) == 0)
+ PGXCNodeId = count + 1;
+ }
+ }
+#else
/* Finally determine which is the node-self */
for (count = 0; count < NumCoords; count++)
{
@@ -186,6 +256,7 @@ InitMultinodeExecutor(bool is_force)
ereport(ERROR,
(errcode(ERRCODE_DATA_EXCEPTION),
errmsg("Coordinator cannot identify itself")));
+#endif
}
@@ -193,8 +264,13 @@ InitMultinodeExecutor(bool is_force)
* Builds up a connection string
*/
char *
+#ifdef XCP
+PGXCNodeConnStr(char *host, int port, char *dbname,
+ char *user, char *remote_type, char *parent_node)
+#else
PGXCNodeConnStr(char *host, int port, char *dbname,
char *user, char *pgoptions, char *remote_type)
+#endif
{
char *out,
connstr[256];
@@ -204,9 +280,15 @@ PGXCNodeConnStr(char *host, int port, char *dbname,
* Build up connection string
* remote type can be Coordinator, Datanode or application.
*/
+#ifdef XCP
+ num = snprintf(connstr, sizeof(connstr),
+ "host=%s port=%d dbname=%s user=%s application_name=pgxc sslmode=disable options='-c remotetype=%s -c parentnode=%s'",
+ host, port, dbname, user, remote_type, parent_node);
+#else
num = snprintf(connstr, sizeof(connstr),
"host=%s port=%d dbname=%s user=%s application_name=pgxc options='-c remotetype=%s %s'",
host, port, dbname, user, remote_type, pgoptions);
+#endif
/* Check for overflow */
if (num > 0 && num < sizeof(connstr))
@@ -246,6 +328,8 @@ PGXCNodeClose(NODE_CONNECTION *conn)
PQfinish((PGconn *) conn);
}
+
+#ifndef XCP
/*
* Send SET query to given connection.
* Query is sent asynchronously and results are consumed
@@ -267,6 +351,7 @@ PGXCNodeSendSetQuery(NODE_CONNECTION *conn, const char *sql_command)
return 0;
}
+#endif
/*
@@ -338,6 +423,9 @@ pgxc_node_all_free(void)
co_handles = NULL;
dn_handles = NULL;
+#ifdef XCP
+ HandlesInvalidatePending = false;
+#endif
}
/*
@@ -348,9 +436,17 @@ pgxc_node_all_free(void)
static void
pgxc_node_init(PGXCNodeHandle *handle, int sock)
{
+#ifdef XCP
+ char *init_str;
+#endif
+
handle->sock = sock;
handle->transaction_status = 'I';
handle->state = DN_CONNECTION_STATE_IDLE;
+#ifdef XCP
+ handle->read_only = true;
+ handle->ck_resp_rollback = false;
+#endif
handle->combiner = NULL;
#ifdef DN_CONNECTION_DEBUG
handle->have_row_desc = false;
@@ -360,6 +456,17 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock)
handle->inStart = 0;
handle->inEnd = 0;
handle->inCursor = 0;
+#ifdef XCP
+ /*
+ * We got a new connection, set on the remote node the session parameters
+ * if defined. The transaction parameter should be sent after BEGIN
+ */
+ init_str = PGXCNodeGetSessionParamStr();
+ if (init_str)
+ {
+ pgxc_node_set_query(handle, init_str);
+ }
+#endif
}
@@ -422,6 +529,9 @@ pgxc_node_receive(const int conn_count,
}
retry:
+#ifdef XCP
+ CHECK_FOR_INTERRUPTS();
+#endif
res_select = select(nfds + 1, &readfds, NULL, NULL, timeout);
if (res_select < 0)
{
@@ -442,8 +552,12 @@ retry:
if (res_select == 0)
{
/* Handle timeout */
- elog(WARNING, "timeout while waiting for response");
- return ERROR_OCCURED;
+ elog(DEBUG1, "timeout while waiting for response");
+#ifdef XCP
+ for (i = 0; i < conn_count; i++)
+ connections[i]->state = DN_CONNECTION_STATE_ERROR_FATAL;
+#endif
+ return NO_ERROR_OCCURED;
}
/* read data */
@@ -553,8 +667,11 @@ retry:
if (nread < 0)
{
+#ifndef XCP
+ /* too noisy */
if (close_if_error)
elog(DEBUG1, "dnrd errno = %d", errno);
+#endif
if (errno == EINTR)
goto retry;
/* Some systems return EAGAIN/EWOULDBLOCK for no data */
@@ -739,8 +856,25 @@ get_message(PGXCNodeHandle *conn, int *len, char **msg)
void
release_handles(void)
{
+#ifdef XCP
+ bool destroy = false;
+#endif
int i;
+#ifdef XCP
+ if (HandlesInvalidatePending)
+ {
+ DoInvalidateRemoteHandles();
+ return;
+ }
+
+ /* don't free connection if holding a cluster lock */
+ if (cluster_ex_lock_held)
+ {
+ return;
+ }
+#endif
+
if (datanode_count == 0 && coord_count == 0)
return;
@@ -755,13 +889,32 @@ release_handles(void)
if (handle->sock != NO_SOCKET)
{
+#ifdef XCP
+ /*
+ * Connections at this point should be completely inactive,
+ * otherwise abandon them. We cannot allow a connection that has
+ * not been cleaned up to be returned to the pool.
+ */
+ if (handle->state != DN_CONNECTION_STATE_IDLE ||
+ handle->transaction_status != 'I')
+ {
+ destroy = true;
+ elog(DEBUG1, "Connection to Datanode %d has unexpected state %d and will be dropped",
+ handle->nodeoid, handle->state);
+ }
+#else
if (handle->state != DN_CONNECTION_STATE_IDLE)
elog(DEBUG1, "Connection to Datanode %d has unexpected state %d and will be dropped",
handle->nodeoid, handle->state);
+#endif
pgxc_node_free(handle);
}
}
+#ifdef XCP
+ if (IS_PGXC_COORDINATOR)
+ {
+#endif
/* Collect Coordinator handles */
for (i = 0; i < NumCoords; i++)
{
@@ -769,20 +922,43 @@ release_handles(void)
if (handle->sock != NO_SOCKET)
{
+#ifdef XCP
+ /*
+ * Connections at this point should be completely inactive,
+ * otherwise abandon them. We cannot allow a connection that has
+ * not been cleaned up to be returned to the pool.
+ */
+ if (handle->state != DN_CONNECTION_STATE_IDLE ||
+ handle->transaction_status != 'I')
+ {
+ destroy = true;
+ elog(DEBUG1, "Connection to Coordinator %d has unexpected state %d and will be dropped",
+ handle->nodeoid, handle->state);
+ }
+#else
if (handle->state != DN_CONNECTION_STATE_IDLE)
elog(DEBUG1, "Connection to Coordinator %d has unexpected state %d and will be dropped",
handle->nodeoid, handle->state);
+#endif
pgxc_node_free(handle);
}
}
+#ifdef XCP
+ }
+#endif
/* And finally release all the connections on pooler */
+#ifdef XCP
+ PoolManagerReleaseConnections(destroy);
+#else
PoolManagerReleaseConnections();
+#endif
datanode_count = 0;
coord_count = 0;
}
+#ifndef XCP
/*
* cancel a running query due to error while processing rows
*/
@@ -790,7 +966,7 @@ void
cancel_query(void)
{
int i;
- int dn_cancel[NumDataNodes];
+ int dn_cancel[NumDataNodes];
int co_cancel[NumCoords];
int dn_count = 0;
int co_count = 0;
@@ -912,6 +1088,7 @@ clear_all_data(void)
handle->error = NULL;
}
}
+#endif
/*
* Ensure specified amount of data can fit to the incoming buffer and
@@ -1202,6 +1379,87 @@ pgxc_node_send_parse(PGXCNodeHandle * handle, const char* statement,
}
+#ifdef XCP
+/*
+ * Send PLAN message down to the Data node
+ *
+ * Builds a 'p' protocol message in the handle's output buffer carrying the
+ * statement name, the source query text, the serialized plan string and the
+ * names (not OIDs) of the num_params parameter types.  Returns 0 on success,
+ * or EOF if the connection is not idle or the buffer cannot be enlarged.
+ */
+int
+pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement,
+ const char *query, const char *planstr,
+ short num_params, Oid *param_types)
+{
+ int stmtLen;
+ int queryLen;
+ int planLen;
+ int paramTypeLen;
+ int msgLen;
+ char **paramTypes = (char **)palloc(sizeof(char *) * num_params);
+ int i;
+
+ /* Invalid connection state, return error */
+ if (handle->state != DN_CONNECTION_STATE_IDLE)
+ return EOF;
+
+ /* statement name size (do not allow NULL) */
+ stmtLen = strlen(statement) + 1;
+ /* source query size (do not allow NULL) */
+ queryLen = strlen(query) + 1;
+ /* query plan size (do not allow NULL) */
+ planLen = strlen(planstr) + 1;
+ /* 2 bytes for number of parameters, preceding the type names */
+ paramTypeLen = 2;
+ /* find names of the types of parameters */
+ for (i = 0; i < num_params; i++)
+ {
+ paramTypes[i] = format_type_be(param_types[i]);
+ paramTypeLen += strlen(paramTypes[i]) + 1;
+ }
+ /* size + pnameLen + queryLen + parameters */
+ msgLen = 4 + queryLen + stmtLen + planLen + paramTypeLen;
+
+ /* msgType + msgLen */
+ if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0)
+ {
+ add_error_message(handle, "out of memory");
+ return EOF;
+ }
+
+ handle->outBuffer[handle->outEnd++] = 'p';
+ /* size */
+ msgLen = htonl(msgLen);
+ memcpy(handle->outBuffer + handle->outEnd, &msgLen, 4);
+ handle->outEnd += 4;
+ /* statement name */
+ memcpy(handle->outBuffer + handle->outEnd, statement, stmtLen);
+ handle->outEnd += stmtLen;
+ /* source query */
+ memcpy(handle->outBuffer + handle->outEnd, query, queryLen);
+ handle->outEnd += queryLen;
+ /* query plan */
+ memcpy(handle->outBuffer + handle->outEnd, planstr, planLen);
+ handle->outEnd += planLen;
+ /* parameter types */
+ /* NOTE(review): potentially unaligned 2-byte store into the byte buffer —
+ * assumed acceptable on target platforms; confirm */
+ *((short *)(handle->outBuffer + handle->outEnd)) = htons(num_params);
+ handle->outEnd += sizeof(num_params);
+ /*
+ * instead of parameter ids we should send parameter names (qualified by
+ * schema name if required). The OIDs of types can be different on
+ * datanodes.
+ */
+ for (i = 0; i < num_params; i++)
+ {
+ int plen = strlen(paramTypes[i]) + 1;
+ memcpy(handle->outBuffer + handle->outEnd, paramTypes[i], plen);
+ handle->outEnd += plen;
+ pfree(paramTypes[i]);
+ }
+ pfree(paramTypes);
+
+ return 0;
+}
+#endif
+
+
/*
* Send BIND message down to the Datanode
*/
@@ -1366,8 +1624,6 @@ pgxc_node_send_close(PGXCNodeHandle * handle, bool is_statement,
else
handle->outBuffer[handle->outEnd++] = '\0';
- handle->state = DN_CONNECTION_STATE_QUERY;
-
return 0;
}
@@ -1468,7 +1724,7 @@ pgxc_node_send_sync(PGXCNodeHandle * handle)
/*
- * Send the GXID down to the Datanode
+ * Send series of Extended Query protocol messages to the data node
*/
int
pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query,
@@ -1489,12 +1745,18 @@ pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *query,
if (fetch_size >= 0)
if (pgxc_node_send_execute(handle, portal, fetch_size))
return EOF;
+#ifdef XCP
+ if (pgxc_node_send_flush(handle))
+ return EOF;
+#else
if (pgxc_node_send_sync(handle))
return EOF;
+#endif
return 0;
}
+
/*
* This method won't return until connection buffer is empty or error occurs
* To ensure all data are on the wire before waiting for response
@@ -1526,6 +1788,13 @@ pgxc_node_flush_read(PGXCNodeHandle *handle)
if (handle == NULL)
return;
+#ifdef XCP
+ /*
+ * Before reading input send Sync to make sure
+ * we will eventually receive ReadyForQuery
+ */
+ pgxc_node_send_sync(handle);
+#endif
while(true)
{
read_result = pgxc_node_read_data(handle, false);
@@ -1752,6 +2021,9 @@ pgxc_node_send_timestamp(PGXCNodeHandle *handle, TimestampTz timestamp)
void
add_error_message(PGXCNodeHandle *handle, const char *message)
{
+#ifdef XCP
+ elog(LOG, "Connection error %s", message);
+#endif
handle->transaction_status = 'E';
if (handle->error)
{
@@ -1761,6 +2033,102 @@ add_error_message(PGXCNodeHandle *handle, const char *message)
handle->error = pstrdup(message);
}
+
+#ifdef XCP
+/* Rotating start index for the scans below, giving round-robin balancing */
+static int load_balancer = 0;
+/*
+ * Get one of the specified nodes to query replicated data source.
+ * If the session already owns one or more of the requested connections,
+ * the function returns an existing one to avoid contacting the pooler.
+ * Performs basic load balancing.
+ */
+PGXCNodeHandle *
+get_any_handle(List *datanodelist)
+{
+ ListCell *lc1;
+ int i, node;
+
+ /* sanity check */
+ Assert(list_length(datanodelist) > 0);
+
+ if (HandlesInvalidatePending)
+ if (DoInvalidateRemoteHandles())
+ ereport(ERROR,
+ (errcode(ERRCODE_QUERY_CANCELED),
+ errmsg("canceling transaction due to cluster configuration reset by administrator command")));
+
+ /* loop through local datanode handles */
+ for (i = 0, node = load_balancer; i < NumDataNodes; i++, node++)
+ {
+ /* At the moment node is an index in the array, and we may need to wrap it */
+ if (node >= NumDataNodes)
+ node -= NumDataNodes;
+ /* See if handle is already used */
+ if (dn_handles[node].sock != NO_SOCKET)
+ {
+ foreach(lc1, datanodelist)
+ {
+ if (lfirst_int(lc1) == node)
+ {
+ /*
+ * The node is in the list of requested nodes,
+ * set load_balancer for next time and return the handle
+ */
+ load_balancer = node + 1;
+ return &dn_handles[node];
+ }
+ }
+ }
+ }
+
+ /*
+ * None of requested nodes is in use, need to get one from the pool.
+ * Choose one.
+ */
+ for (i = 0, node = load_balancer; i < NumDataNodes; i++, node++)
+ {
+ /* At the moment node is an index in the array, and we may need to wrap it */
+ if (node >= NumDataNodes)
+ node -= NumDataNodes;
+ /* Look only at empty slots, we have already checked existing handles */
+ if (dn_handles[node].sock == NO_SOCKET)
+ {
+ foreach(lc1, datanodelist)
+ {
+ if (lfirst_int(lc1) == node)
+ {
+ /* The node is requested */
+ List *allocate = list_make1_int(node);
+ int *fds = PoolManagerGetConnections(allocate, NIL);
+
+ if (!fds)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("Failed to get pooled connections")));
+ }
+
+ pgxc_node_init(&dn_handles[node], fds[0]);
+ datanode_count++;
+
+ /*
+ * set load_balancer for next time and return the handle
+ */
+ load_balancer = node + 1;
+ return &dn_handles[node];
+ }
+ }
+ }
+ }
+
+ /* We should not get here, one of the cases should be met */
+ Assert(false);
+ /* Keep compiler quiet */
+ return NULL;
+}
+#endif
+
+
/*
* for specified list return array of PGXCNodeHandles
* acquire from pool if needed.
@@ -1782,6 +2150,14 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query)
/* index of the result array */
int i = 0;
+#ifdef XCP
+ if (HandlesInvalidatePending)
+ if (DoInvalidateRemoteHandles())
+ ereport(ERROR,
+ (errcode(ERRCODE_QUERY_CANCELED),
+ errmsg("canceling transaction due to cluster configuration reset by administrator command")));
+#endif
+
result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles));
if (!result)
{
@@ -2010,6 +2386,64 @@ get_handles(List *datanodelist, List *coordlist, bool is_coord_only_query)
return result;
}
+
+#ifdef XCP
+/*
+ * Collect all currently connected Datanode and Coordinator handles
+ * (sock != NO_SOCKET) into a freshly palloc'd PGXCNodeAllHandles structure.
+ * primary_handle is always NULL in the result.
+ */
+PGXCNodeAllHandles *
+get_current_handles(void)
+{
+ PGXCNodeAllHandles *result;
+ PGXCNodeHandle *node_handle;
+ int i;
+
+ result = (PGXCNodeAllHandles *) palloc(sizeof(PGXCNodeAllHandles));
+ if (!result)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ result->primary_handle = NULL;
+ result->co_conn_count = 0;
+ result->dn_conn_count = 0;
+
+ result->datanode_handles = (PGXCNodeHandle **)
+ palloc(NumDataNodes * sizeof(PGXCNodeHandle *));
+ if (!result->datanode_handles)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ for (i = 0; i < NumDataNodes; i++)
+ {
+ node_handle = &dn_handles[i];
+ if (node_handle->sock != NO_SOCKET)
+ result->datanode_handles[result->dn_conn_count++] = node_handle;
+ }
+
+ result->coord_handles = (PGXCNodeHandle **)
+ palloc(NumCoords * sizeof(PGXCNodeHandle *));
+ if (!result->coord_handles)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ for (i = 0; i < NumCoords; i++)
+ {
+ node_handle = &co_handles[i];
+ if (node_handle->sock != NO_SOCKET)
+ result->coord_handles[result->co_conn_count++] = node_handle;
+ }
+
+ return result;
+}
+#endif
+
+
/* Free PGXCNodeAllHandles structure */
void
pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles)
@@ -2027,6 +2461,52 @@ pfree_pgxc_all_handles(PGXCNodeAllHandles *pgxc_handles)
pfree(pgxc_handles);
}
+#ifdef XCP
+/*
+ * PGXCNode_getNodeId
+ * Look at the data cached for handles and return node position
+ * If node type is PGXC_NODE_COORDINATOR look only in coordinator list,
+ * if node type is PGXC_NODE_DATANODE look only in datanode list,
+ * if other (assume PGXC_NODE_NODE) search both, in last case return actual
+ * node type.
+ * Returns the index into the matching handle array; on failure returns -1
+ * and sets *node_type (when given) to PGXC_NODE_NONE.
+ */
+int
+PGXCNodeGetNodeId(Oid nodeoid, char *node_type)
+{
+ int i;
+
+ /* First check datanodes, they referenced more often */
+ if (node_type == NULL || *node_type != PGXC_NODE_COORDINATOR)
+ {
+ for (i = 0; i < NumDataNodes; i++)
+ {
+ if (dn_handles[i].nodeoid == nodeoid)
+ {
+ if (node_type)
+ *node_type = PGXC_NODE_DATANODE;
+ return i;
+ }
+ }
+ }
+ /* Then check coordinators */
+ if (node_type == NULL || *node_type != PGXC_NODE_DATANODE)
+ {
+ for (i = 0; i < NumCoords; i++)
+ {
+ if (co_handles[i].nodeoid == nodeoid)
+ {
+ if (node_type)
+ *node_type = PGXC_NODE_COORDINATOR;
+ return i;
+ }
+ }
+ }
+ /* Not found, have caller handling it */
+ if (node_type)
+ *node_type = PGXC_NODE_NONE;
+ return -1;
+}
+#else
/*
* PGXCNode_getNodeId
* Look at the data cached for handles and return node position
@@ -2065,6 +2545,7 @@ PGXCNodeGetNodeId(Oid nodeoid, char node_type)
}
return res;
}
+#endif
/*
* PGXCNode_getNodeOid
@@ -2108,20 +2589,354 @@ pgxc_node_str(PG_FUNCTION_ARGS)
* Return node position in handles array
*/
+/*
+ * NOTE(review): in the XCP variant node_type is in/out — it narrows the
+ * search on entry and receives the actual type (or PGXC_NODE_NONE) on exit.
+ */
int
+#ifdef XCP
+PGXCNodeGetNodeIdFromName(char *node_name, char *node_type)
+#else
PGXCNodeGetNodeIdFromName(char *node_name, char node_type)
+#endif
{
char *nm;
Oid nodeoid;
if (node_name == NULL)
+#ifdef XCP
+ {
+ if (node_type)
+ *node_type = PGXC_NODE_NONE;
+ return -1;
+ }
+#else
return -1;
+#endif
nm = str_tolower(node_name, strlen(node_name), DEFAULT_COLLATION_OID);
nodeoid = get_pgxc_nodeoid(nm);
pfree(nm);
if (!OidIsValid(nodeoid))
+#ifdef XCP
+ {
+ if (node_type)
+ *node_type = PGXC_NODE_NONE;
return -1;
+ }
+#else
+ return -1;
+#endif
return PGXCNodeGetNodeId(nodeoid, node_type);
}
+
+
+#ifdef XCP
+/*
+ * Remember new value of a session or transaction parameter, and set same
+ * values on newly connected remote nodes.
+ * A NULL value removes the entry (and the table itself when it empties).
+ */
+void
+PGXCNodeSetParam(bool local, const char *name, const char *value)
+{
+ HTAB *table;
+
+ /* Get the target hash table and invalidate command string */
+ if (local)
+ {
+ table = local_param_htab;
+ if (local_params)
+ resetStringInfo(local_params);
+ }
+ else
+ {
+ table = session_param_htab;
+ if (session_params)
+ resetStringInfo(session_params);
+ }
+
+ /* Initialize table if empty */
+ if (table == NULL)
+ {
+ HASHCTL hinfo;
+ int hflags;
+
+ /* do not bother creating hash table if we are about to reset a
+ * non-existing parameter */
+ if (value == NULL)
+ return;
+
+ /* Init parameter hashtable */
+ MemSet(&hinfo, 0, sizeof(hinfo));
+ hflags = 0;
+
+ hinfo.keysize = NAMEDATALEN;
+ hinfo.entrysize = sizeof(ParamEntry);
+ hflags |= HASH_ELEM;
+
+ if (local)
+ {
+ /* Local parameters are not valid beyond transaction boundaries */
+ hinfo.hcxt = TopTransactionContext;
+ hflags |= HASH_CONTEXT;
+ table = hash_create("Remote local params", 16, &hinfo, hflags);
+ local_param_htab = table;
+ }
+ else
+ {
+ /*
+ * Session parameters need to be in TopMemoryContext, hash table
+ * is created in TopMemoryContext by default.
+ */
+ table = hash_create("Remote session params", 16, &hinfo, hflags);
+ session_param_htab = table;
+ }
+ }
+
+ if (value)
+ {
+ ParamEntry *entry;
+ /* create entry or replace value for the parameter */
+ entry = (ParamEntry *) hash_search(table, name, HASH_ENTER, NULL);
+ strlcpy((char *) (&entry->value), value, NAMEDATALEN);
+ }
+ else
+ {
+ /* remove entry */
+ hash_search(table, name, HASH_REMOVE, NULL);
+ /* remove table if it becomes empty */
+ if (hash_get_num_entries(table) == 0)
+ {
+ hash_destroy(table);
+ if (local)
+ local_param_htab = NULL;
+ else
+ session_param_htab = NULL;
+ }
+ }
+}
+
+
+/*
+ * Forget all parameter values set either for transaction or both transaction
+ * and session.
+ */
+void
+PGXCNodeResetParams(bool only_local)
+{
+ if (!only_local && session_param_htab)
+ {
+ /* need to explicitly pfree session stuff, it is in TopMemoryContext */
+ hash_destroy(session_param_htab);
+ session_param_htab = NULL;
+ if (session_params)
+ {
+ pfree(session_params->data);
+ pfree(session_params);
+ session_params = NULL;
+ }
+ }
+ /*
+ * no need to explicitly destroy the local_param_htab and local_params,
+ * they will be gone with the transaction memory context.
+ */
+ local_param_htab = NULL;
+ local_params = NULL;
+}
+
+
+/*
+ * Quote an identifier given as a C string and return the result as a
+ * palloc'd C string (convenience wrapper around quote_ident()).
+ */
+static char *
+quote_ident_cstr(char *rawstr)
+{
+ text *rawstr_text;
+ text *result_text;
+ char *result;
+
+ rawstr_text = cstring_to_text(rawstr);
+ result_text = DatumGetTextP(DirectFunctionCall1(quote_ident,
+ PointerGetDatum(rawstr_text)));
+ result = text_to_cstring(result_text);
+
+ return result;
+}
+
+/*
+ * Append to 'command' a "SET [LOCAL] <name> TO <value>;" clause for every
+ * entry of the given parameter hash table; no-op when the table is NULL.
+ */
+static void
+get_set_command(HTAB *table, StringInfo command, bool local)
+{
+ HASH_SEQ_STATUS hseq_status;
+ ParamEntry *entry;
+
+ if (table == NULL)
+ return;
+
+ hash_seq_init(&hseq_status, table);
+ while ((entry = (ParamEntry *) hash_seq_search(&hseq_status)))
+ {
+ char *value = NameStr(entry->value);
+
+ /* represent an empty value as an empty-string literal */
+ if (strlen(value) == 0)
+ value = "''";
+
+ appendStringInfo(command, "SET %s %s TO %s;", local ? "LOCAL" : "",
+ NameStr(entry->name), value);
+ }
+}
+
+
+/*
+ * Returns SET commands needed to initialize remote session.
+ * The command may already be built and valid, return it right away if the case.
+ * Otherwise build it up.
+ * To support Distributed Session machinery coordinator should generate and
+ * send a distributed session identifier to remote nodes. Generate it here.
+ */
+char *
+PGXCNodeGetSessionParamStr(void)
+{
+ /*
+ * If no session parameters are set and that is a coordinator we need to set
+ * global_session anyway, even if there were no other parameters.
+ * We do not want this string to disappear, so create it in the
+ * TopMemoryContext. However if we add first session parameter we will need
+ * to free the buffer and recreate it in the same context as the hash table
+ * to avoid memory leakage.
+ */
+ if (session_params == NULL)
+ {
+ MemoryContext oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+ session_params = makeStringInfo();
+ MemoryContextSwitchTo(oldcontext);
+ }
+
+ /* If the paramstr invalid build it up */
+ if (session_params->len == 0)
+ {
+ if (IS_PGXC_COORDINATOR)
+ appendStringInfo(session_params, "SET global_session TO %s_%d;",
+ PGXCNodeName, MyProcPid);
+ get_set_command(session_param_htab, session_params, false);
+ }
+ return session_params->len == 0 ? NULL : session_params->data;
+}
+
+
+/*
+ * Returns SET commands needed to initialize transaction on a remote session.
+ * The command may already be built and valid, return it right away if the case.
+ * Otherwise build it up.
+ */
+char *
+PGXCNodeGetTransactionParamStr(void)
+{
+ /* If no local parameters defined there is nothing to return */
+ if (local_param_htab == NULL)
+ return NULL;
+
+ /*
+ * If the paramstr invalid build it up.
+ */
+ if (local_params == NULL)
+ {
+ MemoryContext oldcontext = MemoryContextSwitchTo(TopTransactionContext);
+ local_params = makeStringInfo();
+ MemoryContextSwitchTo(oldcontext);
+ }
+ /*
+ * If parameter string exists it is valid, it is truncated when parameters
+ * are modified.
+ */
+ if (local_params->len == 0)
+ {
+ get_set_command(local_param_htab, local_params, true);
+ }
+ return local_params->len == 0 ? NULL : local_params->data;
+}
+
+
+/*
+ * Send down specified query, read and discard all responses until ReadyForQuery
+ */
+void
+pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query)
+{
+ pgxc_node_send_query(handle, set_query);
+ /*
+ * Now read responses until ReadyForQuery.
+ * XXX We may need to handle possible errors here.
+ */
+ for (;;)
+ {
+ char msgtype;
+ int msglen;
+ char *msg;
+ /*
+ * If we are in the process of shutting down, we
+ * may be rolling back, and the buffer may contain other messages.
+ * We want to avoid a procarray exception
+ * as well as an error stack overflow.
+ */
+ if (proc_exit_inprogress)
+ handle->state = DN_CONNECTION_STATE_ERROR_FATAL;
+
+ /* don't read from the connection if there is a fatal error */
+ if (handle->state == DN_CONNECTION_STATE_ERROR_FATAL)
+ break;
+
+ /* No data available, read more */
+ if (!HAS_MESSAGE_BUFFERED(handle))
+ {
+ pgxc_node_receive(1, &handle, NULL);
+ continue;
+ }
+ msgtype = get_message(handle, &msglen, &msg);
+ /*
+ * Ignore any response except ReadyForQuery, it allows to go on.
+ */
+ if (msgtype == 'Z') /* ReadyForQuery */
+ {
+ handle->transaction_status = msg[0];
+ handle->state = DN_CONNECTION_STATE_IDLE;
+ handle->combiner = NULL;
+ break;
+ }
+ }
+}
+
+
+/*
+ * Mark that remote handles must be invalidated; actual invalidation is
+ * deferred to DoInvalidateRemoteHandles() at a safe point.
+ */
+void
+RequestInvalidateRemoteHandles(void)
+{
+ HandlesInvalidatePending = true;
+}
+
+
+/*
+ * For all handles, mark as they are not in use and discard pending input/output
+ * Returns true if at least one handle had an open socket (i.e. some pending
+ * state was actually discarded), false otherwise.
+ */
+static bool
+DoInvalidateRemoteHandles(void)
+{
+ int i;
+ PGXCNodeHandle *handle;
+ bool result = false;
+
+ HandlesInvalidatePending = false;
+
+ for (i = 0; i < NumCoords; i++)
+ {
+ handle = &co_handles[i];
+ if (handle->sock != NO_SOCKET)
+ result = true;
+ handle->sock = NO_SOCKET;
+ handle->inStart = handle->inEnd = handle->inCursor = 0;
+ handle->outEnd = 0;
+ }
+ for (i = 0; i < NumDataNodes; i++)
+ {
+ handle = &dn_handles[i];
+ if (handle->sock != NO_SOCKET)
+ result = true;
+ handle->sock = NO_SOCKET;
+ handle->inStart = handle->inEnd = handle->inCursor = 0;
+ handle->outEnd = 0;
+ }
+ return result;
+}
+#endif
diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c
index 8f038c6abc..be2c387fad 100644
--- a/src/backend/pgxc/pool/poolmgr.c
+++ b/src/backend/pgxc/pool/poolmgr.c
@@ -24,6 +24,11 @@
* allocated to a session, at most one per Datanode.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
@@ -61,9 +66,18 @@
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
+#ifdef XCP
+#include "pgxc/pause.h"
+#include "storage/procarray.h"
+#endif
/* Configuration options */
+#ifdef XCP
+int PoolConnKeepAlive = 600;
+int PoolMaintenanceTimeout = 30;
+#else
int MinPoolSize = 1;
+#endif
int MaxPoolSize = 100;
int PoolerPort = 6667;
@@ -80,6 +94,15 @@ typedef struct
int port;
} PGXCNodeConnectionInfo;
+#ifdef XCP
+/* Handle to the pool manager (Session's side) */
+typedef struct
+{
+ /* communication channel */
+ PoolPort port;
+} PoolHandle;
+#endif
+
/* The root memory context */
static MemoryContext PoolerMemoryContext = NULL;
/*
@@ -105,11 +128,17 @@ static int is_pool_locked = false;
static int server_fd = -1;
static int node_info_check(PoolAgent *agent);
+#ifdef XCP
+static void agent_init(PoolAgent *agent, const char *database,
+ const char *user_name);
+#else
static void agent_init(PoolAgent *agent, const char *database, const char *user_name,
const char *pgoptions);
+#endif
static void agent_destroy(PoolAgent *agent);
static void agent_create(void);
static void agent_handle_input(PoolAgent *agent, StringInfo s);
+#ifndef XCP
static int agent_session_command(PoolAgent *agent,
const char *set_command,
PoolCommandType command_type);
@@ -117,18 +146,33 @@ static int agent_set_command(PoolAgent *agent,
const char *set_command,
PoolCommandType command_type);
static int agent_temp_command(PoolAgent *agent);
+#endif
+#ifdef XCP
+static DatabasePool *create_database_pool(const char *database,
+ const char *user_name);
+#else
static DatabasePool *create_database_pool(const char *database, const char *user_name, const char *pgoptions);
+#endif
static void insert_database_pool(DatabasePool *pool);
static int destroy_database_pool(const char *database, const char *user_name);
static void reload_database_pools(PoolAgent *agent);
+#ifdef XCP
+static DatabasePool *find_database_pool(const char *database,
+ const char *user_name);
+#else
static DatabasePool *find_database_pool(const char *database, const char *user_name, const char *pgoptions);
+#endif
static DatabasePool *remove_database_pool(const char *database, const char *user_name);
static int *agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist);
+#ifndef XCP
static int send_local_commands(PoolAgent *agent, List *datanodelist, List *coordlist);
+#endif
static int cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist);
static PGXCNodePoolSlot *acquire_connection(DatabasePool *dbPool, Oid node);
static void agent_release_connections(PoolAgent *agent, bool force_destroy);
+#ifndef XCP
static void agent_reset_session(PoolAgent *agent);
+#endif
static void release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot,
Oid node, bool force_destroy);
static void destroy_slot(PGXCNodePoolSlot *slot);
@@ -143,14 +187,21 @@ static int *abort_pids(int *count,
const char *database,
const char *user_name);
static char *build_node_conn_str(Oid node, DatabasePool *dbPool);
-
/* Signal handlers */
static void pooler_die(SIGNAL_ARGS);
static void pooler_quickdie(SIGNAL_ARGS);
-
+#ifdef XCP
+static void PoolManagerConnect(const char *database, const char *user_name);
+static void pooler_sighup(SIGNAL_ARGS);
+static bool shrink_pool(DatabasePool *pool);
+static void pools_maintenance(void);
+#endif
/*
* Flags set by interrupt handlers for later service in the main loop.
*/
+#ifdef XCP
+static volatile sig_atomic_t got_SIGHUP = false;
+#endif
static volatile sig_atomic_t shutdown_requested = false;
void
@@ -208,7 +259,11 @@ PoolManagerInit()
pqsignal(SIGINT, pooler_die);
pqsignal(SIGTERM, pooler_die);
pqsignal(SIGQUIT, pooler_quickdie);
+#ifdef XCP
+ pqsignal(SIGHUP, pooler_sighup);
+#else
pqsignal(SIGHUP, SIG_IGN);
+#endif
/* TODO other signal handlers */
/* We allow SIGQUIT (quickdie) at all times */
@@ -331,17 +386,30 @@ PoolManagerDestroy(void)
}
+#ifdef XCP
+/*
+ * Connect to the pooler process
+ */
+static void
+#else
/*
* Get handle to pool manager
* Invoked from Postmaster's main loop just before forking off new session
* Returned PoolHandle structure will be inherited by session process
*/
PoolHandle *
+#endif
GetPoolManagerHandle(void)
{
PoolHandle *handle;
int fdsock;
+#ifdef XCP
+ if (poolHandle)
+ /* already connected */
+ return;
+#endif
+
/* Connect to the pooler */
fdsock = pool_connect(PoolerPort, UnixSocketDir);
if (fdsock < 0)
@@ -352,7 +420,9 @@ GetPoolManagerHandle(void)
(errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("failed to connect to pool manager: %m")));
errno = saved_errno;
+#ifndef XCP
return NULL;
+#endif
}
/* Allocate handle */
@@ -369,7 +439,9 @@ GetPoolManagerHandle(void)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
+#ifndef XCP
return NULL;
+#endif
}
handle->port.fdsock = fdsock;
@@ -377,12 +449,17 @@ GetPoolManagerHandle(void)
handle->port.RecvPointer = 0;
handle->port.SendPointer = 0;
+#ifdef XCP
+ poolHandle = handle;
+#else
return handle;
+#endif
}
+#ifndef XCP
/*
- * Close handle
+ * XXX May create on_proc_exit callback instead
*/
void
PoolManagerCloseHandle(PoolHandle *handle)
@@ -391,7 +468,7 @@ PoolManagerCloseHandle(PoolHandle *handle)
free(handle);
handle = NULL;
}
-
+#endif
/*
* Create agent
@@ -444,9 +521,11 @@ agent_create(void)
agent->coord_conn_oids = NULL;
agent->dn_connections = NULL;
agent->coord_connections = NULL;
+#ifndef XCP
agent->session_params = NULL;
agent->local_params = NULL;
agent->is_temp = false;
+#endif
agent->pid = 0;
/* Append new agent to the list */
@@ -455,6 +534,8 @@ agent_create(void)
MemoryContextSwitchTo(oldcontext);
}
+
+#ifndef XCP
/*
* session_options
* Returns the pgoptions string generated using a particular
@@ -508,11 +589,86 @@ char *session_options(void)
return options.data;
}
+#endif
+
/*
* Associate session with specified database and respective connection pool
* Invoked from Session process
*/
+#ifdef XCP
+/*
+ * Connect this session to the pooler process (if not yet connected) and send
+ * a 'c' message associating it with the given database and user name.
+ */
+static void
+PoolManagerConnect(const char *database, const char *user_name)
+{
+ int n32;
+ char msgtype = 'c';
+ int unamelen = strlen(user_name);
+ int dbnamelen = strlen(database);
+ char atchar = ' ';
+
+ /* Connect to the pooler process if not yet connected */
+ GetPoolManagerHandle();
+ if (poolHandle == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("failed to connect to the pooler process")));
+
+ /*
+ * Special handling for db_user_namespace=on
+ * We need to handle per-db users and global users. The per-db users will
+ * arrive with @dbname and global users just as username. Handle both of
+ * them appropriately
+ */
+ if (strcmp(GetConfigOption("db_user_namespace", false, false), "on") == 0)
+ {
+ if (strchr(user_name, '@') != NULL)
+ {
+ Assert(unamelen > dbnamelen + 1);
+ unamelen -= (dbnamelen + 1);
+ }
+ else
+ {
+ atchar = '@';
+ unamelen++;
+ }
+ }
+
+ /* Message type */
+ pool_putbytes(&poolHandle->port, &msgtype, 1);
+
+ /* Message length: 4 (this field) + 4 (PID) + 4 + dbnamelen+1 + 4 + unamelen+1 */
+ n32 = htonl(dbnamelen + unamelen + 18);
+ pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+ /* PID number */
+ n32 = htonl(MyProcPid);
+ pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+ /* Length of Database string */
+ n32 = htonl(dbnamelen + 1);
+ pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+ /* Send database name followed by \0 terminator */
+ pool_putbytes(&poolHandle->port, database, dbnamelen);
+ pool_putbytes(&poolHandle->port, "\0", 1);
+
+ /* Length of user name string */
+ n32 = htonl(unamelen + 1);
+ pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+ /* Send user name followed by \0 terminator */
+ /* Send the '@' char if needed. Already accounted for in len */
+ if (atchar == '@')
+ {
+ pool_putbytes(&poolHandle->port, user_name, unamelen - 1);
+ pool_putbytes(&poolHandle->port, "@", 1);
+ }
+ else
+ pool_putbytes(&poolHandle->port, user_name, unamelen);
+ pool_putbytes(&poolHandle->port, "\0", 1);
+ pool_flush(&poolHandle->port);
+}
+#else
void
PoolManagerConnect(PoolHandle *handle,
const char *database, const char *user_name,
@@ -564,6 +720,7 @@ PoolManagerConnect(PoolHandle *handle,
pool_flush(&handle->port);
}
+#endif
/*
* Reconnect to pool manager
@@ -572,6 +729,13 @@ PoolManagerConnect(PoolHandle *handle,
void
PoolManagerReconnect(void)
{
+#ifdef XCP
+ /* Connected, disconnect */
+ if (poolHandle)
+ PoolManagerDisconnect();
+
+ PoolManagerConnect(get_database_name(MyDatabaseId), GetClusterUserName());
+#else
PoolHandle *handle;
Assert(poolHandle);
@@ -582,8 +746,11 @@ PoolManagerReconnect(void)
get_database_name(MyDatabaseId),
GetUserNameFromId(GetUserId()),
session_options());
+#endif
}
+
+#ifndef XCP
int
PoolManagerSetCommand(PoolCommandType command_type, const char *set_command)
{
@@ -694,6 +861,7 @@ PoolManagerSendLocalCommand(int dn_count, int* dn_list, int co_count, int* co_li
/* Get result */
return pool_recvres(&poolHandle->port);
}
+#endif
/*
* Lock/unlock pool manager
@@ -706,7 +874,13 @@ PoolManagerLock(bool is_lock)
char msgtype = 'o';
int n32;
int msglen = 8;
+#ifdef XCP
+ if (poolHandle == NULL)
+ PoolManagerConnect(get_database_name(MyDatabaseId),
+ GetClusterUserName());
+#else
Assert(poolHandle);
+#endif
/* Message type */
pool_putbytes(&poolHandle->port, &msgtype, 1);
@@ -724,9 +898,15 @@ PoolManagerLock(bool is_lock)
/*
* Init PoolAgent
*/
+#ifdef XCP
+static void
+agent_init(PoolAgent *agent, const char *database,
+ const char *user_name)
+#else
static void
agent_init(PoolAgent *agent, const char *database, const char *user_name,
const char *pgoptions)
+#endif
{
MemoryContext oldcontext;
@@ -748,12 +928,21 @@ agent_init(PoolAgent *agent, const char *database, const char *user_name,
palloc0(agent->num_coord_connections * sizeof(PGXCNodePoolSlot *));
agent->dn_connections = (PGXCNodePoolSlot **)
palloc0(agent->num_dn_connections * sizeof(PGXCNodePoolSlot *));
+#ifdef XCP
+ /* find database */
+ agent->pool = find_database_pool(database, user_name);
+
+ /* create if not found */
+ if (agent->pool == NULL)
+ agent->pool = create_database_pool(database, user_name);
+#else
/* find database */
agent->pool = find_database_pool(database, user_name, pgoptions);
/* create if not found */
if (agent->pool == NULL)
agent->pool = create_database_pool(database, user_name, pgoptions);
+#endif
MemoryContextSwitchTo(oldcontext);
@@ -775,6 +964,13 @@ agent_destroy(PoolAgent *agent)
/* Discard connections if any remaining */
if (agent->pool)
{
+#ifdef XCP
+ /*
+ * If session is disconnecting while there are active connections
+ * we cannot know whether they are clean or not, so force-destroy them
+ */
+ agent_release_connections(agent, true);
+#else
/*
* Agent is being destroyed, so reset session parameters
* before putting back connections to pool.
@@ -786,6 +982,7 @@ agent_destroy(PoolAgent *agent)
* Force disconnection if there are temporary objects on agent.
*/
agent_release_connections(agent, agent->is_temp);
+#endif
}
/* find agent in the list */
@@ -813,12 +1010,20 @@ agent_destroy(PoolAgent *agent)
void
PoolManagerDisconnect(void)
{
+#ifdef XCP
+ if (!poolHandle)
+ return; /* not even connected */
+#else
Assert(poolHandle);
+#endif
pool_putmessage(&poolHandle->port, 'd', NULL, 0);
pool_flush(&poolHandle->port);
close(Socket(poolHandle->port));
+#ifdef XCP
+ free(poolHandle);
+#endif
poolHandle = NULL;
}
@@ -835,7 +1040,13 @@ PoolManagerGetConnections(List *datanodelist, List *coordlist)
int totlen = list_length(datanodelist) + list_length(coordlist);
int nodes[totlen + 2];
+#ifdef XCP
+ if (poolHandle == NULL)
+ PoolManagerConnect(get_database_name(MyDatabaseId),
+ GetClusterUserName());
+#else
Assert(poolHandle);
+#endif
/*
* Prepare end send message to pool manager.
@@ -895,7 +1106,17 @@ PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids)
int dblen = dbname ? strlen(dbname) + 1 : 0;
int userlen = username ? strlen(username) + 1 : 0;
+#ifdef XCP
+ /*
+ * New connection may be established to clean connections to
+ * specified nodes and databases.
+ */
+ if (poolHandle == NULL)
+ PoolManagerConnect(get_database_name(MyDatabaseId),
+ GetClusterUserName());
+#else
Assert(poolHandle);
+#endif
/* Message type */
pool_putbytes(&poolHandle->port, &msgtype, 1);
@@ -944,6 +1165,16 @@ PoolManagerCleanConnection(List *datanodelist, List *coordlist, char *dbname, ch
int userlen = username ? strlen(username) + 1 : 0;
int dblen = dbname ? strlen(dbname) + 1 : 0;
+#ifdef XCP
+ /*
+ * New connection may be established to clean connections to
+ * specified nodes and databases.
+ */
+ if (poolHandle == NULL)
+ PoolManagerConnect(get_database_name(MyDatabaseId),
+ GetClusterUserName());
+#endif
+
nodes[0] = htonl(list_length(datanodelist));
i = 1;
if (list_length(datanodelist) != 0)
@@ -1008,7 +1239,17 @@ PoolManagerCheckConnectionInfo(void)
{
int res;
+#ifdef XCP
+ /*
+ * New connection may be established to clean connections to
+ * specified nodes and databases.
+ */
+ if (poolHandle == NULL)
+ PoolManagerConnect(get_database_name(MyDatabaseId),
+ GetClusterUserName());
+#else
Assert(poolHandle);
+#endif
PgxcNodeListAndCount();
pool_putmessage(&poolHandle->port, 'q', NULL, 0);
pool_flush(&poolHandle->port);
@@ -1051,9 +1292,10 @@ agent_handle_input(PoolAgent * agent, StringInfo s)
{
const char *database = NULL;
const char *user_name = NULL;
+#ifndef XCP
const char *pgoptions = NULL;
- const char *set_command = NULL;
PoolCommandType command_type;
+#endif
int datanodecount;
int coordcount;
List *nodelist = NIL;
@@ -1073,6 +1315,8 @@ agent_handle_input(PoolAgent * agent, StringInfo s)
if (is_pool_locked && (qtype == 'a' || qtype == 'c' || qtype == 'g'))
elog(WARNING,"Pool operation cannot run during pool lock");
+ elog(DEBUG1, "Pooler is handling command %c from %d", (char) qtype, agent->pid);
+
switch (qtype)
{
case 'a': /* ABORT */
@@ -1093,6 +1337,7 @@ agent_handle_input(PoolAgent * agent, StringInfo s)
if (pids)
pfree(pids);
break;
+#ifndef XCP
case 'b': /* Fire transaction-block commands on given nodes */
/*
* Length of message is caused by:
@@ -1119,6 +1364,7 @@ agent_handle_input(PoolAgent * agent, StringInfo s)
list_free(datanodelist);
list_free(coordlist);
break;
+#endif
case 'c': /* CONNECT */
pool_getmessage(&agent->port, s, 0);
agent->pid = pq_getmsgint(s, 4);
@@ -1126,13 +1372,19 @@ agent_handle_input(PoolAgent * agent, StringInfo s)
database = pq_getmsgbytes(s, len);
len = pq_getmsgint(s, 4);
user_name = pq_getmsgbytes(s, len);
+#ifndef XCP
len = pq_getmsgint(s, 4);
pgoptions = pq_getmsgbytes(s, len);
+#endif
/*
* Coordinator pool is not initialized.
* With that it would be impossible to create a Database by default.
*/
+#ifdef XCP
+ agent_init(agent, database, user_name);
+#else
agent_init(agent, database, user_name, pgoptions);
+#endif
pq_getmsgend(s);
break;
case 'd': /* DISCONNECT */
@@ -1277,10 +1529,22 @@ agent_handle_input(PoolAgent * agent, StringInfo s)
pool_sendres(&agent->port, res);
break;
case 'r': /* RELEASE CONNECTIONS */
+#ifdef XCP
+ {
+ bool destroy;
+
+ pool_getmessage(&agent->port, s, 8);
+ destroy = (bool) pq_getmsgint(s, 4);
+ pq_getmsgend(s);
+ agent_release_connections(agent, destroy);
+ }
+#else
pool_getmessage(&agent->port, s, 4);
pq_getmsgend(s);
agent_release_connections(agent, false);
+#endif
break;
+#ifndef XCP
case 's': /* Session-related COMMAND */
pool_getmessage(&agent->port, s, 0);
/* Determine if command is local or session */
@@ -1298,6 +1562,7 @@ agent_handle_input(PoolAgent * agent, StringInfo s)
/* Send success result */
pool_sendres(&agent->port, res);
break;
+#endif
default: /* EOF or protocol violation */
agent_destroy(agent);
return;
@@ -1308,6 +1573,7 @@ agent_handle_input(PoolAgent * agent, StringInfo s)
}
}
+#ifndef XCP
/*
* Manage a session command for pooler
*/
@@ -1419,6 +1685,7 @@ agent_set_command(PoolAgent *agent, const char *set_command, PoolCommandType com
return res;
}
+#endif
/*
* acquire connection
@@ -1460,6 +1727,7 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist)
*/
oldcontext = MemoryContextSwitchTo(agent->pool->mcxt);
+
/* Initialize result */
i = 0;
/* Save in array fds of Datanodes first */
@@ -1489,8 +1757,10 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist)
* Local parameters are fired only once BEGIN has been launched on
* remote nodes.
*/
+#ifndef XCP
if (agent->session_params)
PGXCNodeSendSetQuery(slot->conn, agent->session_params);
+#endif
}
result[i++] = PQsocket((PGconn *) agent->dn_connections[node]->conn);
@@ -1522,8 +1792,10 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist)
* Local parameters are fired only once BEGIN has been launched on
* remote nodes.
*/
+#ifndef XCP
if (agent->session_params)
PGXCNodeSendSetQuery(slot->conn, agent->session_params);
+#endif
}
result[i++] = PQsocket((PGconn *) agent->coord_connections[node]->conn);
@@ -1534,6 +1806,7 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist)
return result;
}
+#ifndef XCP
/*
* send transaction local commands if any, set the begin sent status in any case
*/
@@ -1605,6 +1878,7 @@ send_local_commands(PoolAgent *agent, List *datanodelist, List *coordlist)
return -res;
return 0;
}
+#endif
/*
* Cancel query
@@ -1664,6 +1938,31 @@ cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlis
/*
* Return connections back to the pool
*/
+#ifdef XCP
+void
+PoolManagerReleaseConnections(bool force)
+{
+ char msgtype = 'r';
+ int n32;
+ int msglen = 8;
+
+ /* If disconnected from pooler all the connections already released */
+ if (!poolHandle)
+ return;
+
+ /* Message type */
+ pool_putbytes(&poolHandle->port, &msgtype, 1);
+
+ /* Message length */
+ n32 = htonl(msglen);
+ pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+
+ /* Lock information */
+ n32 = htonl((int) force);
+ pool_putbytes(&poolHandle->port, (char *) &n32, 4);
+ pool_flush(&poolHandle->port);
+}
+#else
void
PoolManagerReleaseConnections(void)
{
@@ -1671,6 +1970,8 @@ PoolManagerReleaseConnections(void)
pool_putmessage(&poolHandle->port, 'r', NULL, 0);
pool_flush(&poolHandle->port);
}
+#endif
+
/*
* Cancel Query
@@ -1736,7 +2037,15 @@ agent_release_connections(PoolAgent *agent, bool force_destroy)
if (!agent->dn_connections && !agent->coord_connections)
return;
+#ifdef XCP
+ if (!force_destroy && cluster_ex_lock_held)
+ {
+ elog(LOG, "Not releasing connection with cluster lock");
+ return;
+ }
+#endif
+#ifndef XCP
/*
* If there are some session parameters or temporary objects,
* do not put back connections to pool.
@@ -1751,6 +2060,7 @@ agent_release_connections(PoolAgent *agent, bool force_destroy)
}
if ((agent->session_params || agent->is_temp) && !force_destroy)
return;
+#endif
/*
* There are possible memory allocations in the core pooler, we want
@@ -1788,9 +2098,21 @@ agent_release_connections(PoolAgent *agent, bool force_destroy)
agent->coord_connections[i] = NULL;
}
+#ifdef XCP
+ /*
+ * Released connections are now in the pool and we may want to close
+ * them eventually. Update the oldest_idle value to reflect the latest
+ * last access time if not already updated.
+ */
+ if (!force_destroy && agent->pool->oldest_idle == (time_t) 0)
+ agent->pool->oldest_idle = time(NULL);
+#endif
+
MemoryContextSwitchTo(oldcontext);
}
+
+#ifndef XCP
/*
* Reset session parameters for given connections in the agent.
* This is done before putting back to pool connections that have been
@@ -1814,7 +2136,7 @@ agent_reset_session(PoolAgent *agent)
/* Reset given slot with parameters */
if (slot)
- PGXCNodeSendSetQuery(slot->conn, "SET SESSION AUTHORIZATION DEFAULT;RESET ALL;");
+ PGXCNodeSendSetQuery(slot->conn, "SET SESSION AUTHORIZATION DEFAULT;RESET ALL;SET GLOBAL_SESSION TO NONE;");
}
}
@@ -1827,7 +2149,7 @@ agent_reset_session(PoolAgent *agent)
/* Reset given slot with parameters */
if (slot)
- PGXCNodeSendSetQuery(slot->conn, "SET SESSION AUTHORIZATION DEFAULT;RESET ALL;");
+ PGXCNodeSendSetQuery(slot->conn, "SET SESSION AUTHORIZATION DEFAULT;RESET ALL;SET GLOBAL_SESSION TO NONE;");
}
}
@@ -1843,6 +2165,7 @@ agent_reset_session(PoolAgent *agent)
agent->local_params = NULL;
}
}
+#endif
/*
@@ -1853,8 +2176,13 @@ agent_reset_session(PoolAgent *agent)
* Returns POOL_OK if operation succeed POOL_FAIL in case of OutOfMemory
* error and POOL_WEXIST if poll for this database already exist.
*/
+#ifdef XCP
+static DatabasePool *create_database_pool(const char *database,
+ const char *user_name)
+#else
static DatabasePool *
create_database_pool(const char *database, const char *user_name, const char *pgoptions)
+#endif
{
MemoryContext oldcontext;
MemoryContext dbcontext;
@@ -1884,8 +2212,13 @@ create_database_pool(const char *database, const char *user_name, const char *pg
databasePool->database = pstrdup(database);
/* Copy the user name */
databasePool->user_name = pstrdup(user_name);
+#ifdef XCP
+ /* Reset the oldest_idle value */
+ databasePool->oldest_idle = (time_t) 0;
+#else
/* Copy the pgoptions */
databasePool->pgoptions = pstrdup(pgoptions);
+#endif
if (!databasePool->database)
{
@@ -2031,8 +2364,14 @@ reload_database_pools(PoolAgent *agent)
/*
* Find pool for specified database and username in the list
*/
+#ifdef XCP
+static DatabasePool *
+find_database_pool(const char *database,
+ const char *user_name)
+#else
static DatabasePool *
find_database_pool(const char *database, const char *user_name, const char *pgoptions)
+#endif
{
DatabasePool *databasePool;
@@ -2040,11 +2379,16 @@ find_database_pool(const char *database, const char *user_name, const char *pgop
databasePool = databasePools;
while (databasePool)
{
+#ifdef XCP
+ if (strcmp(database, databasePool->database) == 0 &&
+ strcmp(user_name, databasePool->user_name) == 0)
+ break;
+#else
if (strcmp(database, databasePool->database) == 0 &&
strcmp(user_name, databasePool->user_name) == 0 &&
strcmp(pgoptions, databasePool->pgoptions) == 0)
break;
-
+#endif
databasePool = databasePool->next;
}
return databasePool;
@@ -2185,6 +2529,9 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot,
{
/* Insert the slot into the array and increase pool size */
nodePool->slot[(nodePool->freeSize)++] = slot;
+#ifdef XCP
+ slot->released = time(NULL);
+#endif
}
else
{
@@ -2204,6 +2551,10 @@ release_connection(DatabasePool *dbPool, PGXCNodePoolSlot *slot,
static PGXCNodePool *
grow_pool(DatabasePool *dbPool, Oid node)
{
+#ifdef XCP
+ /* if error try to release idle connections and try again */
+ bool tryagain = true;
+#endif
PGXCNodePool *nodePool;
bool found;
@@ -2211,7 +2562,6 @@ grow_pool(DatabasePool *dbPool, Oid node)
nodePool = (PGXCNodePool *) hash_search(dbPool->nodePools, &node,
HASH_ENTER, &found);
-
if (!found)
{
nodePool->connstr = build_node_conn_str(node, dbPool);
@@ -2233,7 +2583,11 @@ grow_pool(DatabasePool *dbPool, Oid node)
nodePool->size = 0;
}
+#ifdef XCP
+ while (nodePool->freeSize == 0 && nodePool->size < MaxPoolSize)
+#else
while (nodePool->size < MinPoolSize || (nodePool->freeSize == 0 && nodePool->size < MaxPoolSize))
+#endif
{
PGXCNodePoolSlot *slot;
@@ -2257,10 +2611,33 @@ grow_pool(DatabasePool *dbPool, Oid node)
ereport(LOG,
(errcode(ERRCODE_CONNECTION_FAILURE),
errmsg("failed to connect to Datanode")));
+#ifdef XCP
+ /*
+ * If we failed to connect, the number of connections on the
+ * target node has probably reached max_connections. Try to
+ * release idle connections and try again.
+ * We do not want to enter an endless loop here, so run the
+ * maintenance procedure only once.
+ * It is not safe to run the maintenance procedure if no connections
+ * from that pool are currently in use - the node pool may be
+ * destroyed in that case.
+ */
+ if (tryagain && nodePool->size > nodePool->freeSize)
+ {
+ pools_maintenance();
+ tryagain = false;
+ continue;
+ }
+#endif
break;
}
slot->xc_cancelConn = (NODE_CANCEL *) PQgetCancel((PGconn *)slot->conn);
+#ifdef XCP
+ slot->released = time(NULL);
+ if (dbPool->oldest_idle == (time_t) 0)
+ dbPool->oldest_idle = slot->released;
+#endif
/* Insert at the end of the pool */
nodePool->slot[(nodePool->freeSize)++] = slot;
@@ -2326,7 +2703,10 @@ destroy_node_pool(PGXCNodePool *node_pool)
static void
PoolerLoop(void)
{
- StringInfoData input_message;
+ StringInfoData input_message;
+#ifdef XCP
+ time_t last_maintenance = (time_t) 0;
+#endif
server_fd = pool_listen(PoolerPort, UnixSocketDir);
if (server_fd == -1)
@@ -2335,6 +2715,7 @@ PoolerLoop(void)
return;
}
initStringInfo(&input_message);
+
for (;;)
{
int nfds;
@@ -2365,8 +2746,53 @@ PoolerLoop(void)
nfds = Max(nfds, sockfd);
}
- /* wait for event */
+#ifdef XCP
+ if (PoolMaintenanceTimeout > 0)
+ {
+ struct timeval maintenance_timeout;
+ int timeout_val;
+ double timediff;
+
+ /*
+ * Decide the timeout value based on when the last
+ * maintenance activity was carried out. If the last
+ * maintenance was done quite a while ago schedule the select
+ * with no timeout. It will serve any incoming activity
+ * and if there's none it will cause the maintenance
+ * to be scheduled as soon as possible
+ */
+ timediff = difftime(time(NULL), last_maintenance);
+
+ if (timediff > PoolMaintenanceTimeout)
+ timeout_val = 0;
+ else
+ timeout_val = PoolMaintenanceTimeout - rint(timediff);
+
+ maintenance_timeout.tv_sec = timeout_val;
+ maintenance_timeout.tv_usec = 0;
+ /* wait for event */
+ retval = select(nfds + 1, &rfds, NULL, NULL, &maintenance_timeout);
+ }
+ else
+#endif
retval = select(nfds + 1, &rfds, NULL, NULL, NULL);
+#ifdef XCP
+ /*
+ * Emergency bailout if postmaster has died. This is to avoid the
+ * necessity for manual cleanup of all postmaster children.
+ */
+ if (!PostmasterIsAlive())
+ exit(1);
+
+ /*
+ * Process any requests or signals received recently.
+ */
+ if (got_SIGHUP)
+ {
+ got_SIGHUP = false;
+ ProcessConfigFile(PGC_SIGHUP);
+ }
+#endif
if (shutdown_requested)
{
for (i = agentCount - 1; i >= 0; i--)
@@ -2400,6 +2826,14 @@ PoolerLoop(void)
if (FD_ISSET(server_fd, &rfds))
agent_create();
}
+#ifdef XCP
+ else if (retval == 0)
+ {
+ /* maintenance timeout */
+ pools_maintenance();
+ last_maintenance = time(NULL);
+ }
+#endif
}
}
@@ -2530,6 +2964,17 @@ pooler_quickdie(SIGNAL_ARGS)
exit(2);
}
+
+#ifdef XCP
+static void
+pooler_sighup(SIGNAL_ARGS)
+{
+ got_SIGHUP = true;
+}
+#endif
+
+
+#ifndef XCP
bool
IsPoolHandle(void)
{
@@ -2537,7 +2982,7 @@ IsPoolHandle(void)
return false;
return true;
}
-
+#endif
/*
* Given node identifier, dbname and user name build connection string.
@@ -2556,13 +3001,145 @@ build_node_conn_str(Oid node, DatabasePool *dbPool)
return NULL;
}
+#ifdef XCP
+ connstr = PGXCNodeConnStr(NameStr(nodeDef->nodehost),
+ nodeDef->nodeport,
+ dbPool->database,
+ dbPool->user_name,
+ IS_PGXC_COORDINATOR ? "coordinator" : "datanode",
+ PGXCNodeName);
+#else
connstr = PGXCNodeConnStr(NameStr(nodeDef->nodehost),
nodeDef->nodeport,
dbPool->database,
dbPool->user_name,
dbPool->pgoptions,
IS_PGXC_COORDINATOR ? "coordinator" : "datanode");
+#endif
pfree(nodeDef);
return connstr;
}
+
+
+#ifdef XCP
+/*
+ * Check all pooled connections, and close those which have been released
+ * more than PoolConnKeepAlive seconds ago.
+ * Return true if the shrink operation closed all the connections and the
+ * pool can be destroyed, false if there are still connections or the pool is in use.
+ */
+static bool
+shrink_pool(DatabasePool *pool)
+{
+ time_t now = time(NULL);
+ HASH_SEQ_STATUS hseq_status;
+ PGXCNodePool *nodePool;
+ int i;
+ bool empty = true;
+
+ /* Negative PooledConnKeepAlive disables automatic connection cleanup */
+ if (PoolConnKeepAlive < 0)
+ return false;
+
+ pool->oldest_idle = (time_t) 0;
+ hash_seq_init(&hseq_status, pool->nodePools);
+ while ((nodePool = (PGXCNodePool *) hash_seq_search(&hseq_status)))
+ {
+ /* Go thru the free slots and destroy those that are free too long */
+ for (i = 0; i < nodePool->freeSize; )
+ {
+ PGXCNodePoolSlot *slot = nodePool->slot[i];
+
+ if (difftime(now, slot->released) > PoolConnKeepAlive)
+ {
+ /* connection is idle for long, close it */
+ destroy_slot(slot);
+ /* reduce pool size and total number of connections */
+ (nodePool->freeSize)--;
+ (nodePool->size)--;
+ /* move last connection in place, if not at last already */
+ if (i < nodePool->freeSize)
+ nodePool->slot[i] = nodePool->slot[nodePool->freeSize];
+ }
+ else
+ {
+ if (pool->oldest_idle == (time_t) 0 ||
+ difftime(pool->oldest_idle, slot->released) > 0)
+ pool->oldest_idle = slot->released;
+
+ i++;
+ }
+ }
+ if (nodePool->size > 0)
+ empty = false;
+ else
+ {
+ destroy_node_pool(nodePool);
+ hash_search(pool->nodePools, &nodePool->nodeoid, HASH_REMOVE, NULL);
+ }
+ }
+
+ /*
+ * Last check: if any active agent is referencing the pool, do not allow it
+ * to be destroyed, because there would be a problem if a session wakes up
+ * and tries to get a connection from a non-existing pool.
+ * If all such sessions eventually disconnect, the pool will be destroyed
+ * during the next maintenance procedure.
+ */
+ if (empty)
+ {
+ for (i = 0; i < agentCount; i++)
+ {
+ if (poolAgents[i]->pool == pool)
+ return false;
+ }
+ }
+
+ return empty;
+}
+
+
+/*
+ * Scan connection pools and release connections which are idle for long.
+ * If pool gets empty after releasing connections it is destroyed.
+ */
+static void
+pools_maintenance(void)
+{
+ DatabasePool *prev = NULL;
+ DatabasePool *curr = databasePools;
+ time_t now = time(NULL);
+ int count = 0;
+
+ /* Iterate over the pools */
+ while (curr)
+ {
+ /*
+ * If the current pool has connections to close and it is emptied after
+ * the shrink, remove the pool and free its memory.
+ * Otherwise, move on to the next pool.
+ */
+ if (curr->oldest_idle != (time_t) 0 &&
+ difftime(now, curr->oldest_idle) > PoolConnKeepAlive &&
+ shrink_pool(curr))
+ {
+ MemoryContext mem = curr->mcxt;
+ curr = curr->next;
+ if (prev)
+ prev->next = curr;
+ else
+ databasePools = curr;
+ MemoryContextDelete(mem);
+ count++;
+ }
+ else
+ {
+ prev = curr;
+ curr = curr->next;
+ }
+ }
+ elog(DEBUG1, "Pool maintenance, done in %f seconds, removed %d pools",
+ difftime(time(NULL), now), count);
+}
+#endif
diff --git a/src/backend/pgxc/pool/poolutils.c b/src/backend/pgxc/pool/poolutils.c
index 594aa71af4..e383845101 100644
--- a/src/backend/pgxc/pool/poolutils.c
+++ b/src/backend/pgxc/pool/poolutils.c
@@ -4,6 +4,11 @@
*
* Utilities for Postgres-XC pooler
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
@@ -26,8 +31,12 @@
#include "pgxc/pgxcnode.h"
#include "access/gtm.h"
#include "access/xact.h"
+#include "catalog/pgxc_node.h"
#include "commands/dbcommands.h"
#include "commands/prepare.h"
+#ifdef XCP
+#include "storage/ipc.h"
+#endif
#include "storage/procarray.h"
#include "utils/acl.h"
#include "utils/builtins.h"
@@ -88,18 +97,38 @@ pgxc_pool_check(PG_FUNCTION_ARGS)
Datum
pgxc_pool_reload(PG_FUNCTION_ARGS)
{
+#ifndef XCP
MemoryContext old_context;
if (!superuser())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
(errmsg("must be superuser to manage pooler"))));
+#endif
if (IsTransactionBlock())
ereport(ERROR,
(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
errmsg("pgxc_pool_reload cannot run inside a transaction block")));
+#ifdef XCP
+ /* Session is being reloaded, drop prepared and temporary objects */
+ DropAllPreparedStatements();
+
+ /* Reinitialize session, it updates the shared memory table */
+ InitMultinodeExecutor(true);
+
+ /* Be sure it is done consistently */
+ while (!PoolManagerCheckConnectionInfo())
+ {
+ /* Reload connection information in pooler */
+ PoolManagerReloadConnectionInfo();
+ }
+
+ /* Signal other sessions to reconnect to pooler if have privileges */
+ if (superuser())
+ ReloadConnInfoOnBackends();
+#else
/* A Datanode has no pooler active, so do not bother about that */
if (IS_PGXC_DATANODE)
PG_RETURN_BOOL(true);
@@ -145,6 +174,7 @@ pgxc_pool_reload(PG_FUNCTION_ARGS)
PoolManagerReconnect();
MemoryContextSwitchTo(old_context);
+#endif
PG_RETURN_BOOL(true);
}
@@ -289,6 +319,17 @@ CleanConnection(CleanConnStmt *stmt)
foreach(nodelist_item, stmt->nodes)
{
char *node_name = strVal(lfirst(nodelist_item));
+#ifdef XCP
+ char node_type = PGXC_NODE_NONE;
+ stmt_nodes = lappend_int(stmt_nodes,
+ PGXCNodeGetNodeIdFromName(node_name,
+ &node_type));
+ if (node_type == PGXC_NODE_NONE)
+ ereport(ERROR,
+ (errcode(ERRCODE_SYNTAX_ERROR),
+ errmsg("PGXC Node %s: object not defined",
+ node_name)));
+#else
Oid nodeoid = get_pgxc_nodeoid(node_name);
if (!OidIsValid(nodeoid))
@@ -299,6 +340,7 @@ CleanConnection(CleanConnStmt *stmt)
stmt_nodes = lappend_int(stmt_nodes,
PGXCNodeGetNodeId(nodeoid, get_pgxc_nodetype(nodeoid)));
+#endif
}
/* Build lists to be sent to Pooler Manager */
@@ -369,6 +411,20 @@ DropDBCleanConnection(char *dbname)
void
HandlePoolerReload(void)
{
+#ifdef XCP
+ if (proc_exit_inprogress)
+ return;
+
+ /* Request query cancel, when convenient */
+ InterruptPending = true;
+ QueryCancelPending = true;
+
+ /* Disconnect from the pooler to get new connection infos next time */
+ PoolManagerDisconnect();
+
+ /* Prevent using of cached connections to remote nodes */
+ RequestInvalidateRemoteHandles();
+#else
MemoryContext old_context;
/* A Datanode has no pooler active, so do not bother about that */
@@ -407,4 +463,5 @@ HandlePoolerReload(void)
CurrentResourceOwner = NULL;
MemoryContextSwitchTo(old_context);
+#endif
}
diff --git a/src/backend/pgxc/pool/postgresql_fdw.c b/src/backend/pgxc/pool/postgresql_fdw.c
new file mode 100644
index 0000000000..e6e80805a9
--- /dev/null
+++ b/src/backend/pgxc/pool/postgresql_fdw.c
@@ -0,0 +1,132 @@
+/*-------------------------------------------------------------------------
+ *
+ * postgresql_fdw.c
+ * foreign-data wrapper for PostgreSQL
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "pgxc/postgresql_fdw.h"
+#include "catalog/pg_operator.h"
+#include "catalog/pg_proc.h"
+#include "catalog/pg_type.h"
+#include "funcapi.h"
+#include "mb/pg_wchar.h"
+#include "miscadmin.h"
+#include "nodes/nodeFuncs.h"
+#include "nodes/makefuncs.h"
+#include "optimizer/clauses.h"
+#include "optimizer/planmain.h"
+#include "parser/scansup.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+#include "utils/syscache.h"
+
+#define DEBUG_FDW
+
+/*
+ * Check whether the function is IMMUTABLE.
+ */
+bool
+is_immutable_func(Oid funcid)
+{
+ HeapTuple tp;
+ bool isnull;
+ Datum datum;
+
+ tp = SearchSysCache(PROCOID, ObjectIdGetDatum(funcid), 0, 0, 0);
+ if (!HeapTupleIsValid(tp))
+ elog(ERROR, "cache lookup failed for function %u", funcid);
+
+#ifdef DEBUG_FDW
+ /* print function name and its immutability */
+ {
+ char *proname;
+ datum = SysCacheGetAttr(PROCOID, tp, Anum_pg_proc_proname, &isnull);
+ proname = pstrdup(DatumGetName(datum)->data);
+ elog(DEBUG1, "func %s(%u) is%s immutable", proname, funcid,
+ (DatumGetChar(datum) == PROVOLATILE_IMMUTABLE) ? "" : " not");
+ pfree(proname);
+ }
+#endif
+
+ datum = SysCacheGetAttr(PROCOID, tp, Anum_pg_proc_provolatile, &isnull);
+ ReleaseSysCache(tp);
+
+ return (DatumGetChar(datum) == PROVOLATILE_IMMUTABLE);
+}
+
+/*
+ * Check whether the ExprState node should be evaluated in foreign server.
+ *
+ * An expression which consists of expressions below will be evaluated in
+ * the foreign server.
+ * - constant value
+ * - variable (foreign table column)
+ * - external parameter (parameter of prepared statement)
+ * - array
+ * - bool expression (AND/OR/NOT)
+ * - NULL test (IS [NOT] NULL)
+ * - operator
+ * - IMMUTABLE only
+ * - It is required that the meaning of the operator be the same as the
+ * local server in the foreign server.
+ * - function
+ * - IMMUTABLE only
+ * - It is required that the meaning of the operator be the same as the
+ * local server in the foreign server.
+ * - scalar array operator (ANY/ALL)
+ */
+bool
+pgxc_is_expr_shippable(Expr *node, bool *has_aggs)
+{
+#ifdef XCP
+ return false;
+#else
+ Shippability_context sc_context;
+
+ /* Create the FQS context */
+ memset(&sc_context, 0, sizeof(sc_context));
+ sc_context.sc_query = NULL;
+ sc_context.sc_query_level = 0;
+ sc_context.sc_for_expr = true;
+
+ /* Walk the expression to check its shippability */
+ pgxc_shippability_walker((Node *)node, &sc_context);
+
+ /*
+ * If the caller is interested in knowing whether the expression has aggregates,
+ * let the caller know about it. The caller is capable of handling such
+ * expressions. Otherwise assume such an expression as unshippable.
+ */
+ if (has_aggs)
+ *has_aggs = pgxc_test_shippability_reason(&sc_context, SS_HAS_AGG_EXPR);
+ else if (pgxc_test_shippability_reason(&sc_context, SS_HAS_AGG_EXPR))
+ return false;
+
+ /*
+ * If the expression unshippable or unsupported by expression shipping
+ * algorithm, return false. We don't have information about the number of
+ * nodes involved in expression evaluation, hence even if the expression can
+ * be evaluated only on single node, return false.
+ */
+ if (pgxc_test_shippability_reason(&sc_context, SS_UNSUPPORTED_EXPR) ||
+ pgxc_test_shippability_reason(&sc_context, SS_UNSHIPPABLE_EXPR) ||
+ pgxc_test_shippability_reason(&sc_context, SS_NEED_SINGLENODE))
+ return false;
+
+ /* If nothing wrong found, the expression is shippable */
+ return true;
+#endif
+}
diff --git a/src/backend/pgxc/squeue/Makefile b/src/backend/pgxc/squeue/Makefile
new file mode 100644
index 0000000000..77d568813b
--- /dev/null
+++ b/src/backend/pgxc/squeue/Makefile
@@ -0,0 +1,19 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for distributed executor's shared memory queue
+#
+# Portions Copyright (c) 2011 StormDB
+#
+# IDENTIFICATION
+# $PostgreSQL$
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/pgxc/squeue
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = squeue.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/squeue/squeue.c b/src/backend/pgxc/squeue/squeue.c
new file mode 100644
index 0000000000..0418779199
--- /dev/null
+++ b/src/backend/pgxc/squeue/squeue.c
@@ -0,0 +1,1509 @@
+/*-------------------------------------------------------------------------
+ *
+ * squeue.c
+ *
+ * Shared queue is for data exchange in shared memory between sessions,
+ * one of which is a producer, providing data rows. Others are consumer agents -
+ * sessions initiated from other datanodes; their main purpose is to read
+ * rows from the shared queue and send them to the parent data node.
+ * The producer is usually a consumer at the same time; it sends back tuples
+ * to the parent node without putting them into the queue.
+ *
+ * Copyright (c) 2012-2014, TransLattice, Inc.
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <sys/time.h>
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "access/gtm.h"
+#include "catalog/pgxc_node.h"
+#include "commands/prepare.h"
+#include "executor/executor.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/pgxc.h"
+#include "pgxc/pgxcnode.h"
+#include "pgxc/squeue.h"
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "utils/hsearch.h"
+#include "utils/resowner.h"
+
+
+/*
+ * Defaults for the queue pool dimensions.
+ * NOTE(review): the units of SQueueSize are defined where the corresponding
+ * GUCs are declared — not visible in this file; confirm before relying on it.
+ */
+int NSQueues = 64;
+int SQueueSize = 64;
+
+/*
+ * Marker used by the long-tuple protocol (sq_push_long_tuple /
+ * sq_pull_long_tuple) for tuples that do not fit into a queue in one piece.
+ */
+#define LONG_TUPLE -42
+
+/* Per-consumer synchronization objects */
+typedef struct ConsumerSync
+{
+	LWLockId	cs_lwlock;		/* Synchronize access to the consumer queue */
+	Latch		cs_latch;		/* The latch consumer is waiting on */
+} ConsumerSync;
+
+
+/*
+ * Shared memory structure to store synchronization info to access shared queues
+ */
+typedef struct SQueueSync
+{
+	void	   *queue;			/* NULL if not assigned to any queue */
+	Latch		sqs_producer_latch;	/* the latch producer is waiting on */
+	ConsumerSync sqs_consumer_sync[0];	/* actual length is MaxDataNodes-1,
+										 * which is not known at compile time */
+} SQueueSync;
+
+/* Consumer status values (ConsState.cs_status) */
+/* Both producer and consumer are working */
+#define CONSUMER_ACTIVE 0
+/* Producer has finished work successfully and waits for consumer */
+#define CONSUMER_EOF 1
+/* Producer encountered error and waits for consumer to disconnect */
+#define CONSUMER_ERROR 2
+/* Consumer is finished with the query, OK to unbind */
+#define CONSUMER_DONE 3
+
+
+/* State of a single consumer */
+typedef struct
+{
+	int			cs_pid;			/* Process id of the consumer session */
+	int			cs_node;		/* Node id of the consumer parent */
+	/*
+	 * Queue state. The queue is a cyclic queue where tuples are stored in the
+	 * DataRow format; first goes the length of the tuple in host format,
+	 * because it is never sent over the network, followed by the tuple bytes.
+	 */
+	int			cs_ntuples;		/* Number of tuples in the queue */
+	int			cs_status;		/* See CONSUMER_* defines above */
+	char	   *cs_qstart;		/* Where consumer queue begins */
+	int			cs_qlength;		/* The size of the consumer queue */
+	int			cs_qreadpos;	/* The read position in the consumer queue */
+	int			cs_qwritepos;	/* The write position in the consumer queue */
+#ifdef SQUEUE_STAT
+	long		stat_writes;
+	long		stat_reads;
+	long		stat_buff_writes;
+	long		stat_buff_reads;
+	long		stat_buff_returns;
+#endif
+} ConsState;
+
+/* Shared queue header */
+typedef struct SQueueHeader
+{
+	char		sq_key[SQUEUE_KEYSIZE]; /* Hash entry key should be at the
+										 * beginning of the hash entry */
+	int			sq_pid;			/* Process id of the producer session */
+	int			sq_nodeid;		/* Node id of the producer parent */
+	SQueueSync *sq_sync;		/* Associated synchronization objects */
+#ifdef SQUEUE_STAT
+	bool		stat_finish;
+	long		stat_paused;
+#endif
+	int			sq_nconsumers;	/* Number of consumers */
+	ConsState	sq_consumers[0];/* variable length array */
+} SQueueHeader;
+
+
+/*
+ * Hash table where all shared queues are stored. Key is the queue name, value
+ * is SharedQueue
+ */
+static HTAB *SharedQueues = NULL;
+
+
+/*
+ * Pool of synchronization items
+ */
+static void *SQueueSyncs;
+
+/* Size of one SQueueSync entry, including its per-consumer trailing array */
+#define SQUEUE_SYNC_SIZE \
+	(sizeof(SQueueSync) + (MaxDataNodes-1) * sizeof(ConsumerSync))
+
+/* Address of the idx'th SQueueSync in the pool */
+#define GET_SQUEUE_SYNC(idx) \
+	((SQueueSync *) (((char *) SQueueSyncs) + (idx) * SQUEUE_SYNC_SIZE))
+
+/* Bytes occupied by the queue header for the given number of consumers */
+#define SQUEUE_HDR_SIZE(nconsumers) \
+	(sizeof(SQueueHeader) + (nconsumers) * sizeof(ConsState))
+
+/* Free bytes remaining in a consumer's cyclic queue */
+#define QUEUE_FREE_SPACE(cstate) \
+	((cstate)->cs_ntuples > 0 ? \
+		((cstate)->cs_qreadpos >= (cstate)->cs_qwritepos ? \
+			(cstate)->cs_qreadpos - (cstate)->cs_qwritepos : \
+			(cstate)->cs_qlength + (cstate)->cs_qreadpos \
+								 - (cstate)->cs_qwritepos) \
+		: (cstate)->cs_qlength)
+
+/* Copy len bytes from buf into the cyclic queue, wrapping at cs_qlength */
+#define QUEUE_WRITE(cstate, len, buf) \
+	do \
+	{ \
+		if ((cstate)->cs_qwritepos + (len) <= (cstate)->cs_qlength) \
+		{ \
+			memcpy((cstate)->cs_qstart + (cstate)->cs_qwritepos, buf, len); \
+			(cstate)->cs_qwritepos += (len); \
+			if ((cstate)->cs_qwritepos == (cstate)->cs_qlength) \
+				(cstate)->cs_qwritepos = 0; \
+		} \
+		else \
+		{ \
+			int part = (cstate)->cs_qlength - (cstate)->cs_qwritepos; \
+			memcpy((cstate)->cs_qstart + (cstate)->cs_qwritepos, buf, part); \
+			(cstate)->cs_qwritepos = (len) - part; \
+			memcpy((cstate)->cs_qstart, (buf) + part, (cstate)->cs_qwritepos); \
+		} \
+	} while(0)
+
+
+/* Copy len bytes out of the cyclic queue into buf, wrapping at cs_qlength */
+#define QUEUE_READ(cstate, len, buf) \
+	do \
+	{ \
+		if ((cstate)->cs_qreadpos + (len) <= (cstate)->cs_qlength) \
+		{ \
+			memcpy(buf, (cstate)->cs_qstart + (cstate)->cs_qreadpos, len); \
+			(cstate)->cs_qreadpos += (len); \
+			if ((cstate)->cs_qreadpos == (cstate)->cs_qlength) \
+				(cstate)->cs_qreadpos = 0; \
+		} \
+		else \
+		{ \
+			int part = (cstate)->cs_qlength - (cstate)->cs_qreadpos; \
+			memcpy(buf, (cstate)->cs_qstart + (cstate)->cs_qreadpos, part); \
+			(cstate)->cs_qreadpos = (len) - part; \
+			memcpy((buf) + part, (cstate)->cs_qstart, (cstate)->cs_qreadpos); \
+		} \
+	} while(0)
+
+
+static bool sq_push_long_tuple(ConsState *cstate, RemoteDataRow datarow);
+static void sq_pull_long_tuple(ConsState *cstate, RemoteDataRow datarow,
+				   ConsumerSync *sync);
+
+/*
+ * SharedQueuesInit
+ *	Initialize the reference on the shared memory hash table where all shared
+ *	queues are stored. Invoked during postmaster initialization.
+ */
+void
+SharedQueuesInit(void)
+{
+	HASHCTL		info;
+	int			hash_flags;
+	bool		found;
+
+	/* Entries are keyed by the queue name; whole entry is SQUEUE_SIZE bytes */
+	info.keysize = SQUEUE_KEYSIZE;
+	info.entrysize = SQUEUE_SIZE;
+	hash_flags = HASH_ELEM;
+
+	SharedQueues = ShmemInitHash("Shared Queues", NUM_SQUEUES,
+								 NUM_SQUEUES, &info, hash_flags);
+
+	/*
+	 * Synchronization stuff is in separate structure because we need to
+	 * initialize all items now while in the postmaster.
+	 * The structure is actually an array, each array entry is assigned to
+	 * each instance of SharedQueue in use.
+	 */
+	SQueueSyncs = ShmemInitStruct("Shared Queues Sync",
+								  SQUEUE_SYNC_SIZE * NUM_SQUEUES,
+								  &found);
+	if (!found)
+	{
+		int			i;
+
+		/* First process to create the struct: initialize every sync entry */
+		for (i = 0; i < NUM_SQUEUES; i++)
+		{
+			SQueueSync *sqs = GET_SQUEUE_SYNC(i);
+			int			j;
+
+			sqs->queue = NULL;
+			InitSharedLatch(&sqs->sqs_producer_latch);
+			/* one latch/lock pair per potential consumer */
+			for (j = 0; j < MaxDataNodes-1; j++)
+			{
+				InitSharedLatch(&sqs->sqs_consumer_sync[j].cs_latch);
+				sqs->sqs_consumer_sync[j].cs_lwlock = LWLockAssign();
+			}
+		}
+	}
+}
+
+
+/*
+ * SharedQueueShmemSize
+ *	Estimate the amount of shared memory needed to hold all shared queues
+ *	plus their synchronization structures.
+ */
+Size
+SharedQueueShmemSize(void)
+{
+	Size		total;
+
+	/* Space for the queue entries themselves ... */
+	total = mul_size(NUM_SQUEUES, SQUEUE_SIZE);
+	/* ... plus one sync structure per queue */
+	total = add_size(total, mul_size(NUM_SQUEUES, SQUEUE_SYNC_SIZE));
+
+	return total;
+}
+
+/*
+ * SharedQueueAcquire
+ *	Reserve a named shared queue for future data exchange between processes
+ * supplying tuples to remote Datanodes. Invoked when a remote query plan is
+ * registered on the Datanode. The number of consumers is known at this point,
+ * so shared queue may be formatted during reservation. The first process that
+ * is acquiring the shared queue on the Datanode does the formatting.
+ */
+void
+SharedQueueAcquire(const char *sqname, int ncons)
+{
+	bool		found;
+	SharedQueue sq;
+
+	Assert(IsConnFromDatanode());
+	Assert(ncons > 0);
+
+tryagain:
+	LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
+
+	sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_ENTER, &found);
+	/* First process acquiring queue should format it */
+	if (!found)
+	{
+		int			qsize;		/* Size of one queue */
+		int			i;
+		char	   *heapPtr;
+
+		elog(LOG, "Format squeue %s for %d consumers", sqname, ncons);
+
+		/* Initialize the shared queue */
+		sq->sq_pid = 0;
+		sq->sq_nodeid = -1;
+#ifdef SQUEUE_STAT
+		sq->stat_finish = false;
+		sq->stat_paused = 0;
+#endif
+		/*
+		 * Assign sync object (latches to wait on)
+		 * XXX We may want to optimize this and do smart search instead of
+		 * iterating the array.
+		 */
+		for (i = 0; i < NUM_SQUEUES; i++)
+		{
+			SQueueSync *sqs = GET_SQUEUE_SYNC(i);
+			if (sqs->queue == NULL)
+			{
+				sqs->queue = (void *) sq;
+				sq->sq_sync = sqs;
+				break;
+			}
+		}
+
+		sq->sq_nconsumers = ncons;
+		/* Determine queue size for a single consumer */
+		qsize = (SQUEUE_SIZE - SQUEUE_HDR_SIZE(sq->sq_nconsumers)) / sq->sq_nconsumers;
+
+		heapPtr = (char *) sq;
+		/* Skip header */
+		heapPtr += SQUEUE_HDR_SIZE(sq->sq_nconsumers);
+		/* Set up consumer queues: each gets an equal slice after the header */
+		for (i = 0; i < ncons; i++)
+		{
+			ConsState  *cstate = &(sq->sq_consumers[i]);
+
+			cstate->cs_pid = 0;
+			cstate->cs_node = -1;
+			cstate->cs_ntuples = 0;
+			cstate->cs_status = CONSUMER_ACTIVE;
+			cstate->cs_qstart = heapPtr;
+			cstate->cs_qlength = qsize;
+			cstate->cs_qreadpos = 0;
+			cstate->cs_qwritepos = 0;
+			heapPtr += qsize;
+		}
+		Assert(heapPtr <= ((char *) sq) + SQUEUE_SIZE);
+	}
+	else
+	{
+		/*
+		 * A race condition is possible here. The previous operation might use
+		 * the same Shared Queue name if that was different execution of the
+		 * same Portal. So here we should try to determine if that Shared Queue
+		 * belongs to this execution or that is not-yet-released Shared Queue
+		 * of previous operation.
+		 * Though at the moment I am not sure, but I believe the BIND stage is
+		 * only happening after completion of ACQUIRE stage, so it is enough
+		 * to verify the producer (the very first node that binds) is not bound
+		 * yet. If it is bound, sleep for a moment and try again. No reason to
+		 * sleep longer, the producer needs just a quantum of CPU time to UNBIND
+		 * itself.
+		 */
+		if (sq->sq_pid != 0)
+		{
+			int			selfid; /* Node Id of the parent data node */
+			int			i;
+			char		ntype = PGXC_NODE_DATANODE;
+			bool		old_squeue = true;
+
+			selfid = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE, &ntype);
+			for (i = 0; i < sq->sq_nconsumers; i++)
+			{
+				ConsState  *cstate = &(sq->sq_consumers[i]);
+				if (cstate->cs_node == selfid)
+				{
+					SQueueSync *sqsync = sq->sq_sync;
+
+					LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
+								  LW_EXCLUSIVE);
+					/* verify status */
+					if (cstate->cs_status != CONSUMER_DONE)
+						old_squeue = false;
+
+					LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+					break;
+				}
+			}
+			/* Still the previous execution's queue: back off and retry */
+			if (old_squeue)
+			{
+				LWLockRelease(SQueuesLock);
+				pg_usleep(1L);
+				goto tryagain;
+			}
+
+		}
+	}
+	LWLockRelease(SQueuesLock);
+}
+
+
+/*
+ * SharedQueueBind
+ *	Bind to the shared queue specified by sqname either as a consumer or as a
+ * producer. The first process that binds to the shared queue becomes a producer
+ * and receives the consumer map, others become consumers and receive queue
+ * indexes to read tuples from.
+ * The consNodes int list identifies the nodes involved in the current step.
+ * The distNodes int list describes result distribution of the current step.
+ * The consNodes should be a subset of distNodes.
+ * The myindex and consMap parameters are binding results. If caller process
+ * is bound to the query as a producer myindex is set to -1 and index of the
+ * each consumer (order number in the consNodes) is stored to the consMap array
+ * at the position of the node in the distNodes. For the producer node
+ * SQ_CONS_SELF is stored, nodes from distNodes list which are not members of
+ * consNodes or if it was reported they won't read results, they are represented
+ * as SQ_CONS_NONE.
+ */
+SharedQueue
+SharedQueueBind(const char *sqname, List *consNodes,
+				List *distNodes, int *myindex, int *consMap)
+{
+	bool		found;
+	SharedQueue sq;
+	int			selfid;			/* Node Id of the parent data node */
+	char		ntype = PGXC_NODE_DATANODE;
+
+	LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
+
+	selfid = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE, &ntype);
+
+	/* The queue must have been created by SharedQueueAcquire already */
+	sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_FIND, &found);
+	if (!found)
+		elog(PANIC, "Shared queue %s not found", sqname);
+	if (sq->sq_pid == 0)
+	{
+		/* Producer: first process to bind claims the producer role */
+		int			i;
+		ListCell   *lc;
+
+		Assert(consMap);
+
+		elog(LOG, "Bind node %s to squeue of step %s as a producer",
+			 PGXC_PARENT_NODE, sqname);
+
+		/* Initialize the shared queue */
+		sq->sq_pid = MyProcPid;
+		sq->sq_nodeid = selfid;
+		OwnLatch(&sq->sq_sync->sqs_producer_latch);
+
+		i = 0;
+		foreach(lc, distNodes)
+		{
+			int			nodeid = lfirst_int(lc);
+
+			/*
+			 * Producer won't go to shared queue to hand off tuple to itself,
+			 * so we do not need to create queue for that entry.
+			 */
+			if (nodeid == selfid)
+			{
+				/* Producer must be in the consNodes list */
+				Assert(list_member_int(consNodes, nodeid));
+				consMap[i++] = SQ_CONS_SELF;
+			}
+			/*
+			 * This node may connect as a consumer, store consumer id to the map
+			 * and initialize consumer queue
+			 */
+			else if (list_member_int(consNodes, nodeid))
+			{
+				ConsState  *cstate;
+				int			j;
+
+				for (j = 0; j < sq->sq_nconsumers; j++)
+				{
+					cstate = &(sq->sq_consumers[j]);
+					if (cstate->cs_node == nodeid)
+					{
+						/* The process already reported that queue won't read */
+						elog(LOG, "Node %d of step %s is released already",
+							 nodeid, sqname);
+						consMap[i++] = SQ_CONS_NONE;
+						break;
+					}
+					else if (cstate->cs_node == -1)
+					{
+						/* found unused slot, assign the consumer to it */
+						consMap[i++] = j;
+						cstate->cs_node = nodeid;
+						break;
+					}
+				}
+			}
+			/*
+			 * Consumer from this node won't ever connect as upper level step
+			 * is not executed on the node. Discard results that may go to that
+			 * node, if any.
+			 */
+			else
+			{
+				consMap[i++] = SQ_CONS_NONE;
+			}
+		}
+
+		/* -1 marks the caller as the producer */
+		if (myindex)
+			*myindex = -1;
+	}
+	else
+	{
+		int			nconsumers;
+		ListCell   *lc;
+
+		/* Producer should be different process */
+		Assert(sq->sq_pid != MyProcPid);
+
+		elog(LOG, "Bind node %s to squeue of step %s as a consumer of process %d", PGXC_PARENT_NODE, sqname, sq->sq_pid);
+
+		/* Sanity checks */
+		Assert(myindex);
+		*myindex = -1;
+		/* Ensure the passed in consumer list matches the queue */
+		nconsumers = 0;
+		foreach (lc, consNodes)
+		{
+			int			nodeid = lfirst_int(lc);
+			int			i;
+
+			if (nodeid == sq->sq_nodeid)
+			{
+				/*
+				 * This node is a producer it should be in the consumer list,
+				 * but no consumer queue for it
+				 */
+				continue;
+			}
+
+			/* find consumer queue for the node */
+			for (i = 0; i < sq->sq_nconsumers; i++)
+			{
+				ConsState  *cstate = &(sq->sq_consumers[i]);
+				if (cstate->cs_node == nodeid)
+				{
+					nconsumers++;
+					if (nodeid == selfid)
+					{
+						/*
+						 * Current consumer queue is that from which current
+						 * session will be sending out data rows.
+						 * Initialize the queue to let producer know we are
+						 * here and running.
+						 */
+						SQueueSync *sqsync = sq->sq_sync;
+
+						LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
+									  LW_EXCLUSIVE);
+						/* Make sure no consumer bound to the queue already */
+						Assert(cstate->cs_pid == 0);
+						/* make sure the queue is ready to read */
+						Assert(cstate->cs_qlength > 0);
+						/* verify status */
+						if (cstate->cs_status == CONSUMER_ERROR ||
+							cstate->cs_status == CONSUMER_DONE)
+						{
+							/*
+							 * Producer failed by the time the consumer connect.
+							 * Change status to "Done" to allow producer unbind
+							 * and report problem to the parent.
+							 */
+							cstate->cs_status = CONSUMER_DONE;
+							/* Producer may be waiting for status change */
+							SetLatch(&sqsync->sqs_producer_latch);
+							LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+							LWLockRelease(SQueuesLock);
+							ereport(ERROR,
+									(errcode(ERRCODE_PRODUCER_ERROR),
+									 errmsg("producer error")));
+						}
+						/*
+						 * Any other status is acceptable. Normally it would be
+						 * ACTIVE. If producer has had only few rows to emit
+						 * and it is already done the status would be EOF.
+						 */
+						/* Set up the consumer */
+						cstate->cs_pid = MyProcPid;
+						/* return found index */
+						*myindex = i;
+						OwnLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+						LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+					}
+					break;
+				}
+			}
+			/* Check if entry was found and therefore loop was broken */
+			Assert(i < sq->sq_nconsumers);
+		}
+		/* Check the consumer is found */
+		Assert(*myindex != -1);
+		Assert(sq->sq_nconsumers == nconsumers);
+	}
+	LWLockRelease(SQueuesLock);
+	return sq;
+}
+
+
+/*
+ * Push data from the local tuplestore to the queue for specified consumer.
+ * Return true if succeeded and the tuplestore is now empty. Return false
+ * if specified queue has not enough room for the next tuple.
+ * Caller must hold the consumer's cs_lwlock.
+ */
+static bool
+SharedQueueDump(SharedQueue squeue, int consumerIdx,
+						   TupleTableSlot *tmpslot, Tuplestorestate *tuplestore)
+{
+	ConsState  *cstate = &(squeue->sq_consumers[consumerIdx]);
+
+	/* discard stored data if consumer is not active */
+	if (cstate->cs_status != CONSUMER_ACTIVE)
+	{
+		tuplestore_clear(tuplestore);
+		return true;
+	}
+
+	/*
+	 * Tuplestore does not clear eof flag on the active read pointer, causing
+	 * the store is always in EOF state once reached when there is a single
+	 * read pointer. We do not want behavior like this and workaround by using
+	 * secondary read pointer. Primary read pointer (0) is active when we are
+	 * writing to the tuple store, also it is used to bookmark current position
+	 * when reading to be able to roll back and return just read tuple back to
+	 * the store if we failed to write it out to the queue.
+	 * Secondary read pointer is for reading, and its eof flag is cleared if a
+	 * tuple is written to the store.
+	 */
+	tuplestore_select_read_pointer(tuplestore, 1);
+
+	/* If we have something in the tuplestore try to push this to the queue */
+	while (!tuplestore_ateof(tuplestore))
+	{
+		/* save position so we can roll back if the queue is full */
+		tuplestore_copy_read_pointer(tuplestore, 1, 0);
+
+		/* Try to get next tuple to the temporary slot */
+		if (!tuplestore_gettupleslot(tuplestore, true, false, tmpslot))
+		{
+			/* false means the tuplestore in EOF state */
+			break;
+		}
+#ifdef SQUEUE_STAT
+		cstate->stat_buff_reads++;
+#endif
+
+		/* The slot should contain a data row */
+		Assert(tmpslot->tts_datarow);
+
+		/* check if queue has enough room for the data (length word + body) */
+		if (QUEUE_FREE_SPACE(cstate) < sizeof(int) + tmpslot->tts_datarow->msglen)
+		{
+			/*
+			 * If stored tuple does not fit empty queue we are entering special
+			 * procedure of pushing it through.
+			 */
+			if (cstate->cs_ntuples <= 0)
+			{
+				/*
+				 * If pushing through is completed wake up and proceed to next
+				 * tuple, there could be enough space in the consumer queue to
+				 * fit more.
+				 */
+				bool	done = sq_push_long_tuple(cstate, tmpslot->tts_datarow);
+
+				/*
+				 * sq_push_long_tuple writes some data anyway, so wake up
+				 * the consumer.
+				 */
+				SetLatch(&squeue->sq_sync->sqs_consumer_sync[consumerIdx].cs_latch);
+
+				if (done)
+					continue;
+			}
+
+			/* Restore read position to get same tuple next time */
+			tuplestore_copy_read_pointer(tuplestore, 0, 1);
+#ifdef SQUEUE_STAT
+			cstate->stat_buff_returns++;
+#endif
+
+			/* We might advance the mark, try to truncate */
+			tuplestore_trim(tuplestore);
+
+			/* Prepare for writing, set proper read pointer */
+			tuplestore_select_read_pointer(tuplestore, 0);
+
+			/* ... and exit */
+			return false;
+		}
+		else
+		{
+			/* Enqueue data: length word first, then the message body */
+			QUEUE_WRITE(cstate, sizeof(int), (char *) &tmpslot->tts_datarow->msglen);
+			QUEUE_WRITE(cstate, tmpslot->tts_datarow->msglen, tmpslot->tts_datarow->msg);
+
+			/* Increment tuple counter. If it was 0 consumer may be waiting for
+			 * data so try to wake it up */
+			if ((cstate->cs_ntuples)++ == 0)
+				SetLatch(&squeue->sq_sync->sqs_consumer_sync[consumerIdx].cs_latch);
+		}
+	}
+
+	/* Remove rows we have just read */
+	tuplestore_trim(tuplestore);
+
+	/* prepare for writes, set read pointer 0 as active */
+	tuplestore_select_read_pointer(tuplestore, 0);
+
+	return true;
+}
+
+
+/*
+ * SharedQueueWrite
+ *	Write data from the specified slot to the specified queue. If the
+ *	tuplestore passed in has tuples try and write them first.
+ *	If specified queue is full the tuple is put into the tuplestore which is
+ *	created if necessary
+ */
+void
+SharedQueueWrite(SharedQueue squeue, int consumerIdx,
+				 TupleTableSlot *slot, Tuplestorestate **tuplestore,
+				 MemoryContext tmpcxt)
+{
+	ConsState  *cstate = &(squeue->sq_consumers[consumerIdx]);
+	SQueueSync *sqsync = squeue->sq_sync;
+	LWLockId	clwlock = sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock;
+	RemoteDataRow datarow;
+	bool		free_datarow;
+
+	Assert(cstate->cs_qlength > 0);
+
+	LWLockAcquire(clwlock, LW_EXCLUSIVE);
+
+#ifdef SQUEUE_STAT
+	cstate->stat_writes++;
+#endif
+
+	/*
+	 * If we have anything in the local storage try to dump this first,
+	 * but do not try to dump often to avoid overhead of creating temporary
+	 * tuple slot. It should be OK to dump if queue is half empty.
+	 */
+	if (*tuplestore)
+	{
+		bool	dumped = false;
+
+		if (QUEUE_FREE_SPACE(cstate) > cstate->cs_qlength / 2)
+		{
+			TupleTableSlot *tmpslot;
+
+			tmpslot = MakeSingleTupleTableSlot(slot->tts_tupleDescriptor);
+			dumped = SharedQueueDump(squeue, consumerIdx, tmpslot, *tuplestore);
+			ExecDropSingleTupleTableSlot(tmpslot);
+		}
+		if (!dumped)
+		{
+			/* No room to even dump local store, append the tuple to the store
+			 * and exit */
+#ifdef SQUEUE_STAT
+			cstate->stat_buff_writes++;
+#endif
+			LWLockRelease(clwlock);
+			tuplestore_puttupleslot(*tuplestore, slot);
+			return;
+		}
+	}
+
+	/* Get datarow from the tuple slot */
+	if (slot->tts_datarow)
+	{
+		/*
+		 * The function ExecCopySlotDatarow always make a copy, but here we
+		 * can optimize and avoid copying the data, so we just get the reference
+		 */
+		datarow = slot->tts_datarow;
+		free_datarow = false;
+	}
+	else
+	{
+		datarow = ExecCopySlotDatarow(slot, tmpcxt);
+		free_datarow = true;
+	}
+	if (QUEUE_FREE_SPACE(cstate) < sizeof(int) + datarow->msglen)
+	{
+		/* Not enough room, store tuple locally */
+		LWLockRelease(clwlock);
+
+		/* clean up */
+		if (free_datarow)
+			pfree(datarow);
+
+		/* Create tuplestore if does not exist */
+		if (*tuplestore == NULL)
+		{
+			int			ptrno;
+			char		storename[64];
+
+#ifdef SQUEUE_STAT
+			elog(LOG, "Start buffering %s node %d, %d tuples in queue, %ld writes and %ld reads so far",
+				 squeue->sq_key, cstate->cs_node, cstate->cs_ntuples, cstate->stat_writes, cstate->stat_reads);
+#endif
+			*tuplestore = tuplestore_begin_datarow(false, work_mem, tmpcxt);
+			/* We need it to be able to remember/restore the read position */
+			snprintf(storename, 64, "%s node %d", squeue->sq_key, cstate->cs_node);
+			tuplestore_collect_stat(*tuplestore, storename);
+			/*
+			 * Allocate a second read pointer to read from the store. We know
+			 * it must have index 1, so needn't store that.
+			 */
+			ptrno = tuplestore_alloc_read_pointer(*tuplestore, 0);
+			Assert(ptrno == 1);
+		}
+
+#ifdef SQUEUE_STAT
+		cstate->stat_buff_writes++;
+#endif
+		/* Append the slot to the store... */
+		tuplestore_puttupleslot(*tuplestore, slot);
+
+		/* ... and exit */
+		return;
+	}
+	else
+	{
+		/* do not supply data to closed consumer */
+		if (cstate->cs_status == CONSUMER_ACTIVE)
+		{
+			/* write out the data: length word first, then the message body */
+			QUEUE_WRITE(cstate, sizeof(int), (char *) &datarow->msglen);
+			QUEUE_WRITE(cstate, datarow->msglen, datarow->msg);
+			/* Increment tuple counter. If it was 0 consumer may be waiting for
+			 * data so try to wake it up */
+			if ((cstate->cs_ntuples)++ == 0)
+				SetLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
+		}
+
+		/* clean up */
+		if (free_datarow)
+			pfree(datarow);
+	}
+	LWLockRelease(clwlock);
+}
+
+
+/*
+ * SharedQueueRead
+ *	Read one data row from the specified queue into the provided tupleslot.
+ * Returns true if EOF is reached on the specified consumer queue.
+ * If the queue is empty, behavior is controlled by the canwait parameter.
+ * If canwait is true it is waiting while row is available or EOF or error is
+ * reported, if it is false, the slot is emptied and false is returned.
+ */
+bool
+SharedQueueRead(SharedQueue squeue, int consumerIdx,
+				TupleTableSlot *slot, bool canwait)
+{
+	ConsState  *cstate = &(squeue->sq_consumers[consumerIdx]);
+	SQueueSync *sqsync = squeue->sq_sync;
+	RemoteDataRow datarow;
+	int			datalen;
+
+	Assert(cstate->cs_qlength > 0);
+
+	LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, LW_EXCLUSIVE);
+
+	Assert(cstate->cs_status != CONSUMER_DONE);
+	while (cstate->cs_ntuples <= 0)
+	{
+		/* Queue is empty: decide between EOF, error, waiting and returning */
+		if (cstate->cs_status == CONSUMER_EOF)
+		{
+			/* Inform producer the consumer have done the job */
+			cstate->cs_status = CONSUMER_DONE;
+			/* no need to receive notifications */
+			DisownLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
+			/* producer done the job and no more rows expected, clean up */
+			LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+			ExecClearTuple(slot);
+			/*
+			 * notify the producer, it may be waiting while consumers
+			 * are finishing
+			 */
+			SetLatch(&sqsync->sqs_producer_latch);
+			elog(LOG, "EOF reached while reading from squeue, exiting");
+			return true;
+		}
+		else if (cstate->cs_status == CONSUMER_ERROR)
+		{
+			/*
+			 * There was a producer error while waiting.
+			 * Release all the locks and report problem to the caller.
+			 */
+			LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+			/*
+			 * Reporting error will cause transaction rollback and clean up of
+			 * all portals. We can not mark the portal so it does not access
+			 * the queue so we should hold it for now. We should prevent queue
+			 * unbound in between.
+			 */
+			ereport(ERROR,
+					(errcode(ERRCODE_PRODUCER_ERROR),
+					 errmsg("producer error")));
+		}
+		if (canwait)
+		{
+			/* Prepare waiting on empty buffer */
+			ResetLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
+			LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+			/* Wait for notification about available info */
+			WaitLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1);
+			/* got the notification, restore lock and try again */
+			LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock, LW_EXCLUSIVE);
+		}
+		else
+		{
+			/* Can not wait: return empty-handed */
+			LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+			ExecClearTuple(slot);
+			return false;
+		}
+	}
+	/* have at least one row, read it in and store to slot */
+	QUEUE_READ(cstate, sizeof(int), (char *) (&datalen));
+	datarow = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + datalen);
+	datarow->msgnode = InvalidOid;
+	datarow->msglen = datalen;
+	/* tuples longer than the queue use the piecewise long-tuple protocol */
+	if (datalen > cstate->cs_qlength - sizeof(int))
+		sq_pull_long_tuple(cstate, datarow,
+						   &sqsync->sqs_consumer_sync[consumerIdx]);
+	else
+		QUEUE_READ(cstate, datalen, datarow->msg);
+	ExecStoreDataRowTuple(datarow, slot, true);
+	(cstate->cs_ntuples)--;
+#ifdef SQUEUE_STAT
+	cstate->stat_reads++;
+#endif
+	/* sanity check */
+	Assert((cstate->cs_ntuples == 0) == (cstate->cs_qreadpos == cstate->cs_qwritepos));
+	LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+	return false;
+}
+
+
+/*
+ * Mark specified consumer as closed discarding all input which may already be
+ * in the queue.
+ * If consumerIdx is -1 the producer is cleaned up. Producer need to wait for
+ * consumers before releasing the queue, so if there are yet active consumers,
+ * they are notified about the problem and they should disconnect from the
+ * queue as soon as possible.
+ */
+void
+SharedQueueReset(SharedQueue squeue, int consumerIdx)
+{
+	SQueueSync *sqsync = squeue->sq_sync;
+
+	if (consumerIdx == -1)
+	{
+		/* Producer reset: cancel every consumer that is not yet finished */
+		int			i;
+
+		/* check queue states */
+		for (i = 0; i < squeue->sq_nconsumers; i++)
+		{
+			ConsState  *cstate = &squeue->sq_consumers[i];
+			LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+
+			/*
+			 * If the producer is being reset before it reached the end of the
+			 * result set, that means the consumer probably would not get all
+			 * the rows and it should report error if the consumer's parent
+			 * ever tries to read. No need to raise error if consumer is just
+			 * closed. If consumer is done already we do not need to change
+			 * the status.
+			 */
+			if (cstate->cs_status != CONSUMER_EOF &&
+					cstate->cs_status != CONSUMER_DONE)
+			{
+				elog(LOG, "Consumer %d of producer %s is cancelled", i, squeue->sq_key);
+				cstate->cs_status = CONSUMER_ERROR;
+				/* discard tuples which may already be in the queue */
+				cstate->cs_ntuples = 0;
+				/* keep consistent with cs_ntuples */
+				cstate->cs_qreadpos = cstate->cs_qwritepos = 0;
+
+				/* wake up consumer if it is sleeping */
+				SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+			}
+			LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+		}
+		elog(LOG, "Reset producer %s", squeue->sq_key);
+	}
+	else
+	{
+		/* Consumer reset: mark this one consumer as done */
+		ConsState  *cstate = &(squeue->sq_consumers[consumerIdx]);
+		LWLockAcquire(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock,
+					  LW_EXCLUSIVE);
+
+		if (cstate->cs_status != CONSUMER_DONE)
+		{
+			/* Inform producer the consumer have done the job */
+			cstate->cs_status = CONSUMER_DONE;
+			/*
+			 * No longer need to receive notifications. If consumer has not
+			 * connected the latch is not owned
+			 */
+			if (cstate->cs_pid > 0)
+				DisownLatch(&sqsync->sqs_consumer_sync[consumerIdx].cs_latch);
+			/*
+			 * notify the producer, it may be waiting while consumers
+			 * are finishing
+			 */
+			SetLatch(&sqsync->sqs_producer_latch);
+			elog(LOG, "Reset consumer %d of %s", consumerIdx, squeue->sq_key);
+		}
+
+		LWLockRelease(sqsync->sqs_consumer_sync[consumerIdx].cs_lwlock);
+	}
+}
+
+
+/*
+ * SharedQueueResetNotConnected
+ *	Assume that not yet connected consumers won't connect and reset them.
+ *	That should allow to Finish/UnBind the queue gracefully and prevent
+ *	producer hanging.
+ *
+ * Returns the number of consumers that were cancelled.
+ */
+int
+SharedQueueResetNotConnected(SharedQueue squeue)
+{
+	SQueueSync *sqsync = squeue->sq_sync;
+	int			result = 0;
+	int			i;
+
+	/* check queue states */
+	for (i = 0; i < squeue->sq_nconsumers; i++)
+	{
+		ConsState  *cstate = &squeue->sq_consumers[i];
+		LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+
+		/* cs_pid == 0 means the consumer never bound to the queue */
+		if (cstate->cs_pid == 0 &&
+				cstate->cs_status != CONSUMER_EOF &&
+				cstate->cs_status != CONSUMER_DONE)
+		{
+			result++;
+			elog(LOG, "Consumer %d of producer %s is cancelled", i, squeue->sq_key);
+			cstate->cs_status = CONSUMER_ERROR;
+			/* discard tuples which may already be in the queue */
+			cstate->cs_ntuples = 0;
+			/* keep consistent with cs_ntuples */
+			cstate->cs_qreadpos = cstate->cs_qwritepos = 0;
+
+			/* wake up consumer if it is sleeping */
+			SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+		}
+		LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+	}
+	elog(LOG, "Reset producer %s", squeue->sq_key);
+
+	/*
+	 * BUG FIX: the function is declared to return int but previously fell off
+	 * the end without a return statement, yielding an undefined value (and a
+	 * compiler warning). Return the count of cancelled consumers.
+	 */
+	return result;
+}
+
+
+/*
+ * SharedQueueCanPause
+ *	Determine if producer can safely pause work.
+ *	The producer can pause if all consumers have enough data to read while
+ *	producer is sleeping.
+ *	Obvious case when the producer can not pause: at least one queue is empty.
+ */
+bool
+SharedQueueCanPause(SharedQueue squeue)
+{
+	SQueueSync *sqsync = squeue->sq_sync;
+	bool		result = true;
+	int			usedspace;
+	int			ncons;
+	int			i;
+
+	usedspace = 0;
+	ncons = 0;
+	for (i = 0; result && (i < squeue->sq_nconsumers); i++)
+	{
+		ConsState  *cstate = &(squeue->sq_consumers[i]);
+		LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_SHARED);
+		/*
+		 * Count only consumers that may be blocked.
+		 * If producer has finished scanning and pushing local buffers some
+		 * consumers may be finished already.
+		 */
+		if (cstate->cs_status == CONSUMER_ACTIVE)
+		{
+			/* can not pause if some queue is empty */
+			result = (cstate->cs_ntuples > 0);
+			usedspace += (cstate->cs_qwritepos > cstate->cs_qreadpos ?
+							  cstate->cs_qwritepos - cstate->cs_qreadpos :
+							  cstate->cs_qlength + cstate->cs_qwritepos
+											    - cstate->cs_qreadpos);
+			ncons++;
+		}
+		LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+	}
+	/*
+	 * Pause only if the average consumer queue is more than half full.
+	 * BUG FIX: guard against division by zero when no consumer is ACTIVE
+	 * any more (ncons == 0); in that case there is nobody left to starve,
+	 * so pausing is harmless and result is left as-is.
+	 */
+	if (result && ncons > 0)
+		result = (usedspace / ncons > squeue->sq_consumers[0].cs_qlength / 2);
+#ifdef SQUEUE_STAT
+	if (result)
+		squeue->stat_paused++;
+#endif
+	return result;
+}
+
+
+/*
+ * SharedQueueFinish
+ *	Producer-side finalization: flush any locally buffered tuples to the
+ *	consumer queues and mark drained consumers EOF.
+ *	Returns the number of tuplestores that still hold data (i.e. consumers
+ *	that could not be fully flushed yet).
+ */
+int
+SharedQueueFinish(SharedQueue squeue, TupleDesc tupDesc,
+				  Tuplestorestate **tuplestore)
+{
+	SQueueSync *sqsync = squeue->sq_sync;
+	TupleTableSlot *tmpslot = NULL;
+	int			i;
+	int			nstores = 0;
+
+	for (i = 0; i < squeue->sq_nconsumers; i++)
+	{
+		ConsState  *cstate = &squeue->sq_consumers[i];
+		LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+#ifdef SQUEUE_STAT
+		if (!squeue->stat_finish)
+			elog(LOG, "Finishing %s node %d, %ld writes and %ld reads so far, %ld buffer writes, %ld buffer reads, %ld tuples returned to buffer",
+				 squeue->sq_key, cstate->cs_node, cstate->stat_writes, cstate->stat_reads, cstate->stat_buff_writes, cstate->stat_buff_reads, cstate->stat_buff_returns);
+#endif
+		/*
+		 * if the tuplestore has data and consumer queue has space for some
+		 * try to push rows to the queue. We do not want to do that often
+		 * to avoid overhead of temp tuple slot allocation.
+		 */
+		if (tuplestore[i])
+		{
+			/* If the consumer is not reading just destroy the tuplestore */
+			if (cstate->cs_status != CONSUMER_ACTIVE)
+			{
+				tuplestore_end(tuplestore[i]);
+				tuplestore[i] = NULL;
+			}
+			else
+			{
+				nstores++;
+				/*
+				 * Attempt to dump tuples from the store requires tuple slot
+				 * allocation, that is not a cheap operation, so proceed if
+				 * target queue has enough space.
+				 */
+				if (QUEUE_FREE_SPACE(cstate) > cstate->cs_qlength / 2)
+				{
+					if (tmpslot == NULL)
+						tmpslot = MakeSingleTupleTableSlot(tupDesc);
+					if (SharedQueueDump(squeue, i, tmpslot, tuplestore[i]))
+					{
+						/* store fully drained: this consumer is at EOF now */
+						tuplestore_end(tuplestore[i]);
+						tuplestore[i] = NULL;
+						cstate->cs_status = CONSUMER_EOF;
+						nstores--;
+					}
+					/* Consumer may be sleeping, wake it up */
+					SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+				}
+			}
+		}
+		else
+		{
+			/* set EOF if not yet set */
+			if (cstate->cs_status == CONSUMER_ACTIVE)
+			{
+				cstate->cs_status = CONSUMER_EOF;
+				SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+			}
+		}
+		LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+	}
+	if (tmpslot)
+		ExecDropSingleTupleTableSlot(tmpslot);
+
+#ifdef SQUEUE_STAT
+	squeue->stat_finish = true;
+#endif
+
+	return nstores;
+}
+
+
+/*
+ * SharedQueueUnBind
+ *	Cancel binding of the current process (the producer) to the shared
+ *	queue.  Any consumer still in CONSUMER_ACTIVE state is switched to
+ *	CONSUMER_ERROR and woken up; the producer then waits on its latch
+ *	until every consumer reports CONSUMER_DONE, or until the 10 second
+ *	WaitLatch timeout fires.  When the wait ends the hash table entry for
+ *	the queue is removed; if the wait ended by timeout the backend exits
+ *	with FATAL after the entry has been cleaned up.
+ *	NOTE(review): an earlier version of this comment described a tuplestore
+ *	array parameter; the current signature takes only the queue.
+ */
+void
+SharedQueueUnBind(SharedQueue squeue)
+{
+	SQueueSync *sqsync = squeue->sq_sync;
+	int			wait_result = 0;
+
+	/* loop while there are active consumers */
+	for (;;)
+	{
+		int	i;
+		int	c_count = 0;
+
+		/* check queue states */
+		for (i = 0; i < squeue->sq_nconsumers; i++)
+		{
+			ConsState *cstate = &squeue->sq_consumers[i];
+			LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock, LW_EXCLUSIVE);
+			/* is consumer working yet ? */
+			if (cstate->cs_status == CONSUMER_ACTIVE)
+				cstate->cs_status = CONSUMER_ERROR;
+			if (cstate->cs_status != CONSUMER_DONE)
+			{
+				c_count++;
+				/* Wake up consumer if it is sleeping */
+				SetLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+				/* producer will continue waiting */
+				ResetLatch(&sqsync->sqs_producer_latch);
+			}
+#ifdef SQUEUE_STAT
+			else
+				elog(LOG, "Done %s node %d, %ld writes and %ld reads, %ld buffer writes, %ld buffer reads, %ld tuples returned to buffer",
+					 squeue->sq_key, cstate->cs_node, cstate->stat_writes, cstate->stat_reads, cstate->stat_buff_writes, cstate->stat_buff_reads, cstate->stat_buff_returns);
+#endif
+
+			LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+		}
+		if (c_count == 0)
+			break;
+		elog(LOG, "Wait while %d squeue readers finishing", c_count);
+		/* wait for a notification; bail out on timeout (FATAL below) */
+		wait_result = WaitLatch(&sqsync->sqs_producer_latch,
+								WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_TIMEOUT,
+								10000L);
+		if (wait_result & WL_TIMEOUT)
+			break;
+		/* got notification, continue loop */
+	}
+#ifdef SQUEUE_STAT
+	elog(LOG, "Producer %s is done, there were %ld pauses", squeue->sq_key, squeue->stat_paused);
+#endif
+
+	LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
+	/* All is done, clean up */
+	DisownLatch(&sqsync->sqs_producer_latch);
+
+	/* Now it is OK to remove hash table entry */
+	squeue->sq_sync = NULL;
+	sqsync->queue = NULL;
+	if (hash_search(SharedQueues, squeue->sq_key, HASH_REMOVE, NULL) != squeue)
+		elog(PANIC, "Shared queue data corruption");
+
+	LWLockRelease(SQueuesLock);
+	elog(LOG, "Finalized squeue");
+	/* report the timeout only after the shared state has been cleaned up */
+	if (wait_result & WL_TIMEOUT)
+		elog(FATAL, "Timeout while waiting for Consumers finishing");
+}
+
+
+/*
+ * SharedQueueRelease
+ *	If a queue with the specified name still exists, mark the consumer slot
+ *	belonging to this node as "Done".  Due to an executor optimization a
+ *	consumer may never connect the queue, and this call lets the producer
+ *	finish up when it is known the consumer will never connect.
+ *	Three cases are handled:
+ *	 - queue was never bound (sq_nodeid == -1): just remove the hash entry;
+ *	 - this node is a consumer: mark its slot DONE and wake the producer;
+ *	 - this node's slot was never bound: mark the first free slot DONE so
+ *	   the producer knows this node will never consume.
+ *	Nothing is done when this node is the producer; UnBind handles that.
+ */
+void
+SharedQueueRelease(const char *sqname)
+{
+	bool		found;
+	volatile SharedQueue sq;
+
+	elog(LOG, "Shared Queue release: %s", sqname);
+
+	LWLockAcquire(SQueuesLock, LW_EXCLUSIVE);
+
+	sq = (SharedQueue) hash_search(SharedQueues, sqname, HASH_FIND, &found);
+	if (found)
+	{
+		volatile SQueueSync *sqsync = sq->sq_sync;
+		int			myid;	/* Node Id of the parent data node */
+		int			i;
+		char		ntype = PGXC_NODE_DATANODE;
+
+		Assert(sqsync && sqsync->queue == sq);
+
+		/*
+		 * Case if the shared queue was never bound.
+		 * Just remove it from the hash table.
+		 */
+		if (sq->sq_nodeid == -1)
+		{
+			sq->sq_sync = NULL;
+			sqsync->queue = NULL;
+			if (hash_search(SharedQueues, sqname, HASH_REMOVE, NULL) != sq)
+				elog(PANIC, "Shared queue data corruption");
+			elog(LOG, "Finalized squeue %s", sqname);
+			LWLockRelease(SQueuesLock);
+			return;
+		}
+
+		/* resolve this node's id to compare against producer/consumer slots */
+		myid = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE, &ntype);
+		/*
+		 * Do not bother releasing producer, all necessary work will be
+		 * done upon UnBind.
+		 */
+		if (sq->sq_nodeid != myid)
+		{
+			elog(LOG, "Looking for consumer %d in %s", myid, sqname);
+			/* find specified node in the consumer lists */
+			for (i = 0; i < sq->sq_nconsumers; i++)
+			{
+				ConsState *cstate = &(sq->sq_consumers[i]);
+				if (cstate->cs_node == myid)
+				{
+					LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
+								  LW_EXCLUSIVE);
+					if (cstate->cs_status != CONSUMER_DONE)
+					{
+						/* Inform producer the consumer have done the job */
+						cstate->cs_status = CONSUMER_DONE;
+						/* no need to receive notifications */
+						if (cstate->cs_pid > 0)
+						{
+							DisownLatch(&sqsync->sqs_consumer_sync[i].cs_latch);
+							cstate->cs_pid = 0;
+						}
+						/*
+						 * notify the producer, it may be waiting while
+						 * consumers are finishing
+						 */
+						SetLatch(&sqsync->sqs_producer_latch);
+						elog(LOG, "Release consumer %d of %s", i, sqname);
+					}
+					LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+					/* exit */
+					LWLockRelease(SQueuesLock);
+					return;
+				}
+			}
+			/*
+			 * The consumer was never bound. Find empty consumer slot and
+			 * register node here to let producer know that the node will never
+			 * be consuming.
+			 */
+			for (i = 0; i < sq->sq_nconsumers; i++)
+			{
+				ConsState *cstate = &(sq->sq_consumers[i]);
+				if (cstate->cs_node == -1)
+				{
+					LWLockAcquire(sqsync->sqs_consumer_sync[i].cs_lwlock,
+								  LW_EXCLUSIVE);
+					/* Inform producer the consumer have done the job */
+					cstate->cs_status = CONSUMER_DONE;
+					SetLatch(&sqsync->sqs_producer_latch);
+					elog(LOG, "Release not bound consumer %d of %s", i, sqname);
+					LWLockRelease(sqsync->sqs_consumer_sync[i].cs_lwlock);
+				}
+			}
+		}
+	}
+	LWLockRelease(SQueuesLock);
+}
+
+
+/*
+ * SharedQueuesCleanup
+ *	Backend-exit callback (code/arg follow the on_shmem_exit convention).
+ *	Drops all registered prepared statements - releasing any shared queue
+ *	associated with them - under a temporary resource owner, then runs all
+ *	three resource-release phases and clears CurrentResourceOwner.
+ */
+void
+SharedQueuesCleanup(int code, Datum arg)
+{
+	/* Need to be able to look into catalogs */
+	CurrentResourceOwner = ResourceOwnerCreate(NULL, "SharedQueuesCleanup");
+
+	/*
+	 * Release all registered prepared statements.
+	 * If a shared queue name is associated with the statement this queue will
+	 * be released.
+	 */
+	DropAllPreparedStatements();
+
+	/* Release everything acquired under the temporary owner, in phase order */
+	ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, true, true);
+	ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_LOCKS, true, true);
+	ResourceOwnerRelease(CurrentResourceOwner, RESOURCE_RELEASE_AFTER_LOCKS, true, true);
+	CurrentResourceOwner = NULL;
+}
+
+
+/*
+ * sq_push_long_tuple
+ *	Routine to push through the consumer state a tuple longer than the
+ *	consumer queue.  A long tuple is written by the producer partially, and
+ *	only when the consumer queue is empty.
+ *	The consumer can determine that the tuple being read is long if the
+ *	length of the tuple which is read before the data exceeds the queue
+ *	length.  The consumer then switches to long tuple mode and reads in the
+ *	portion of data which is already in the queue.  After reading in each
+ *	portion of data the consumer sets cs_ntuples to LONG_TUPLE to indicate
+ *	it is in long tuple mode, and writes out the number of already read
+ *	bytes to the beginning of the queue.
+ *	While the consumer is reading in tuple data the producer may work on
+ *	other tasks: execute the query and send tuples to other consumers.  If
+ *	the producer sees the LONG_TUPLE indicator it may write out the next
+ *	portion.  The tuple remains current in the tuplestore, and the producer
+ *	just needs to read the offset from the buffer to know what part of the
+ *	data to write next.
+ *	After the tuple is completely written the producer advances to the next
+ *	tuple and continues operation in normal mode.
+ *	Returns true when the whole tuple has been written, false when more
+ *	rounds are needed.
+ */
+static bool
+sq_push_long_tuple(ConsState *cstate, RemoteDataRow datarow)
+{
+	if (cstate->cs_ntuples == 0)
+	{
+		/* the tuple is too big to fit the queue, start pushing it through */
+		int		len;
+		/*
+		 * Output actual message size, to prepare consumer:
+		 * allocate memory and set up transmission.
+		 */
+		QUEUE_WRITE(cstate, sizeof(int), (char *) &datarow->msglen);
+		/* Output as much as possible */
+		len = cstate->cs_qlength - sizeof(int);
+		Assert(datarow->msglen > len);
+		QUEUE_WRITE(cstate, len, datarow->msg);
+		cstate->cs_ntuples = 1;
+		return false;
+	}
+	else
+	{
+		int		offset;
+		int		len;
+
+		/* Continue pushing through long tuple */
+		Assert(cstate->cs_ntuples == LONG_TUPLE);
+		/*
+		 * Consumer outputs number of bytes already read at the beginning of
+		 * the queue.
+		 */
+		memcpy(&offset, cstate->cs_qstart, sizeof(int));
+
+		Assert(offset > 0 && offset < datarow->msglen);
+
+		/* remaining data */
+		len = datarow->msglen - offset;
+		/*
+		 * We are sending the remaining length just for a sanity check at the
+		 * consumer side
+		 */
+		QUEUE_WRITE(cstate, sizeof(int), (char *) &len);
+		if (len > cstate->cs_qlength - sizeof(int))
+		{
+			/* does not fit yet */
+			len = cstate->cs_qlength - sizeof(int);
+			QUEUE_WRITE(cstate, len, datarow->msg + offset);
+			cstate->cs_ntuples = 1;
+			return false;
+		}
+		else
+		{
+			/* now we are done */
+			QUEUE_WRITE(cstate, len, datarow->msg + offset);
+			cstate->cs_ntuples = 1;
+			return true;
+		}
+	}
+}
+
+
+/*
+ * sq_pull_long_tuple
+ *	Read in from the queue the data of a long tuple which does not fit into
+ *	the queue.  Repeatedly drains the queue into datarow->msg, publishing
+ *	the already-read offset at the queue start and sleeping on the consumer
+ *	latch until the producer supplies the next portion.
+ *	Expects sync->cs_lwlock to be held on entry; it is released while
+ *	waiting and re-acquired before returning.
+ *	See sq_push_long_tuple for more details of the protocol.
+ */
+static void
+sq_pull_long_tuple(ConsState *cstate, RemoteDataRow datarow,
+				   ConsumerSync *sync)
+{
+	int		offset = 0;
+	int		len = datarow->msglen;
+
+	for (;;)
+	{
+		/* determine how many bytes to read */
+		if (len > cstate->cs_qlength - sizeof(int))
+			len = cstate->cs_qlength - sizeof(int);
+
+		/* read data */
+		QUEUE_READ(cstate, len, datarow->msg + offset);
+
+		/* remember how many we read already */
+		offset += len;
+
+		/* check if we are done */
+		if (offset == datarow->msglen)
+			return;
+
+		/* need more, set up queue to accept data from the producer */
+		Assert(cstate->cs_ntuples == 1); /* allow exactly one incomplete tuple */
+		cstate->cs_ntuples = LONG_TUPLE; /* long tuple mode marker */
+		/* Inform producer how many bytes we have already */
+		memcpy(cstate->cs_qstart, &offset, sizeof(int));
+		/* Release locks and wait until producer supplies more data */
+		while (cstate->cs_ntuples == LONG_TUPLE)
+		{
+			/* prepare wait */
+			ResetLatch(&sync->cs_latch);
+			LWLockRelease(sync->cs_lwlock);
+			/* Wait for notification about available info */
+			WaitLatch(&sync->cs_latch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1);
+			/* got the notification, restore lock and try again */
+			LWLockAcquire(sync->cs_lwlock, LW_EXCLUSIVE);
+		}
+		/* Read length of remaining data */
+		QUEUE_READ(cstate, sizeof(int), (char *) &len);
+
+		/* Make sure we are doing the same tuple */
+		Assert(offset + len == datarow->msglen);
+
+		/* next iteration */
+	}
+}
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index 1cfac9e80b..6086692f81 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -50,6 +50,11 @@
* there is a window (caused by pgstat delay) on which a worker may choose a
* table that was already vacuumed; this is a bug in the current design.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -79,6 +84,10 @@
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
+#ifdef XCP
+#include "pgxc/pgxc.h"
+#include "pgxc/pgxcnode.h"
+#endif
#include "postmaster/autovacuum.h"
#include "postmaster/fork_process.h"
#include "postmaster/postmaster.h"
@@ -2150,6 +2159,16 @@ do_autovacuum(void)
heap_endscan(relScan);
heap_close(classRel, AccessShareLock);
+#ifdef XCP
+ /*
+ * Coordinator needs to access Datanodes to process distributed table.
+ */
+ if (IS_PGXC_COORDINATOR)
+ {
+ InitMultinodeExecutor(false);
+ }
+#endif
+
/*
* Create a buffer access strategy object for VACUUM to use. We want to
* use the same one across all the vacuum operations we perform, since the
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 73d5b2e39c..a085f27d9f 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -11,6 +11,11 @@
* - Add a pgstat config column to pg_database, so this
* entire thing can be enabled/disabled on a per db basis.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Copyright (c) 2001-2012, PostgreSQL Global Development Group
*
* src/backend/postmaster/pgstat.c
@@ -1804,6 +1809,72 @@ pgstat_update_heap_dead_tuples(Relation rel, int delta)
}
+#ifdef XCP
+/*
+ * pgstat_count_remote_insert - count insertion of n tuples on remote Datanodes
+ */
+void
+pgstat_count_remote_insert(Relation rel, int n)
+{
+ /* Should be only applied to distributed table */
+ Assert(rel->rd_locator_info);
+
+ /* For now use the same counters as for heap insert */
+ pgstat_count_heap_insert(rel, n);
+}
+
+
+/*
+ * pgstat_count_remote_update - count update of n tuples on remote Datanodes
+ */
+void
+pgstat_count_remote_update(Relation rel, int n)
+{
+ PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+ /* Should be only applied to distributed table */
+ Assert(rel->rd_locator_info);
+
+ if (pgstat_info != NULL)
+ {
+ /* We have to log the effect at the proper transactional level */
+ int nest_level = GetCurrentTransactionNestLevel();
+
+ if (pgstat_info->trans == NULL ||
+ pgstat_info->trans->nest_level != nest_level)
+ add_tabstat_xact_level(pgstat_info, nest_level);
+
+ pgstat_info->trans->tuples_updated += n;
+ }
+}
+
+
+/*
+ * pgstat_count_remote_delete - count delete of n tuples on remote Datanodes
+ */
+void
+pgstat_count_remote_delete(Relation rel, int n)
+{
+ PgStat_TableStatus *pgstat_info = rel->pgstat_info;
+
+ /* Should be only applied to distributed table */
+ Assert(rel->rd_locator_info);
+
+ if (pgstat_info != NULL)
+ {
+ /* We have to log the effect at the proper transactional level */
+ int nest_level = GetCurrentTransactionNestLevel();
+
+ if (pgstat_info->trans == NULL ||
+ pgstat_info->trans->nest_level != nest_level)
+ add_tabstat_xact_level(pgstat_info, nest_level);
+
+ pgstat_info->trans->tuples_deleted += n;
+ }
+}
+#endif
+
+
/* ----------
* AtEOXact_PgStat
*
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 9069a59fce..4d6972ebaf 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -32,6 +32,11 @@
* clients.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -340,7 +345,11 @@ static DNSServiceRef bonjour_sdref = NULL;
#ifdef PGXC
char *PGXCNodeName = NULL;
+#ifdef XCP
+int PGXCNodeId = 0;
+#else
int PGXCNodeId = -1;
+#endif
/*
* When a particular node starts up, store the node identifier in this variable
* so that we dont have to calculate it OR do a search in cache any where else
@@ -348,8 +357,10 @@ int PGXCNodeId = -1;
*/
uint32 PGXCNodeIdentifier = 0;
+#ifndef XCP
static bool isNodeRegistered = false;
#endif
+#endif
/*
* postmaster.c - function prototypes
@@ -495,11 +506,35 @@ static void ShmemBackendArrayAdd(Backend *bn);
static void ShmemBackendArrayRemove(Backend *bn);
#endif /* EXEC_BACKEND */
+#ifdef XCP
+char *parentPGXCNode = NULL;
+#endif
+
#ifdef PGXC
bool isPGXCCoordinator = false;
bool isPGXCDataNode = false;
+
+/*
+ * While adding a new node to the cluster we need to restore the schema of
+ * an existing database to the new node.
+ * If the new node is a datanode and we connect directly to it,
+ * it does not allow DDL, because it is in read only mode &
+ * If the new node is a coordinator it will send DDLs to all the other
+ * coordinators which we do not want it to do
+ * To provide ability to restore on the new node a new command line
+ * argument is provided called --restoremode
+ * It is to be provided in place of --coordinator OR --datanode.
+ * In restore mode both coordinator and datanode are internally
+ * treated as a datanode.
+ */
+bool isRestoreMode = false;
+
int remoteConnType = REMOTE_CONN_APP;
+/* key pair to be used as object id while using advisory lock for backup */
+Datum xc_lockForBackupKey1;
+Datum xc_lockForBackupKey2;
+
#define StartPoolManager() StartChildProcess(PoolerProcess)
#endif
@@ -740,6 +775,15 @@ PostmasterMain(int argc, char *argv[])
else if (strcmp(name, "datanode") == 0 &&
!value)
isPGXCDataNode = true;
+ else if (strcmp(name, "restoremode") == 0 && !value)
+ {
+ /*
+ * In restore mode both coordinator and datanode
+ * are internally treeated as datanodes
+ */
+ isRestoreMode = true;
+ isPGXCDataNode = true;
+ }
else /* default case */
{
#endif
@@ -777,7 +821,11 @@ PostmasterMain(int argc, char *argv[])
#ifdef PGXC
if (!IS_PGXC_COORDINATOR && !IS_PGXC_DATANODE)
{
- write_stderr("%s: Postgres-XC: must start as either a Coordinator (--coordinator) or Datanode (--datanode)\n",
+#ifdef XCP
+ write_stderr("%s: Postgres-XL: must start as either a Coordinator (--coordinator) or Data Node (--datanode)\n",
+#else
+ write_stderr("%s: Postgres-XC: must start as either a Coordinator (--coordinator) or Data Node (--datanode)\n",
+#endif
progname);
ExitPostmaster(1);
}
@@ -1181,6 +1229,16 @@ PostmasterMain(int argc, char *argv[])
pmState = PM_STARTUP;
#ifdef PGXC /* PGXC_COORD */
+#ifdef XCP
+ oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+
+ /*
+ * Initialize the Data Node connection pool
+ */
+ PgPoolerPID = StartPoolManager();
+
+ MemoryContextSwitchTo(oldcontext);
+#else
if (IS_PGXC_COORDINATOR)
{
oldcontext = MemoryContextSwitchTo(TopMemoryContext);
@@ -1192,7 +1250,8 @@ PostmasterMain(int argc, char *argv[])
MemoryContextSwitchTo(oldcontext);
}
-#endif
+#endif /* XCP */
+#endif /* PGXC */
status = ServerLoop();
@@ -1496,11 +1555,15 @@ ServerLoop(void)
if (PgStatPID == 0 && pmState == PM_RUN)
PgStatPID = pgstat_start();
-#ifdef PGXC /* PGXC_COORD */
+#ifdef PGXC
/* If we have lost the pooler, try to start a new one */
+#ifdef XCP
+ if (PgPoolerPID == 0 && pmState == PM_RUN)
+#else
if (IS_PGXC_COORDINATOR && PgPoolerPID == 0 && pmState == PM_RUN)
+#endif /* XCP */
PgPoolerPID = StartPoolManager();
-#endif
+#endif /* PGXC */
/* If we need to signal the autovacuum launcher, do so now */
if (avlauncher_needs_signal)
{
@@ -2147,9 +2210,13 @@ SIGHUP_handler(SIGNAL_ARGS)
if (StartupPID != 0)
signal_child(StartupPID, SIGHUP);
#ifdef PGXC /* PGXC_COORD */
+#ifdef XCP
+ if (PgPoolerPID != 0)
+#else
if (IS_PGXC_COORDINATOR && PgPoolerPID != 0)
+#endif /* XCP */
signal_child(PgPoolerPID, SIGHUP);
-#endif
+#endif /* PGXC */
if (BgWriterPID != 0)
signal_child(BgWriterPID, SIGHUP);
if (CheckpointerPID != 0)
@@ -2232,9 +2299,14 @@ pmdie(SIGNAL_ARGS)
#ifdef PGXC /* PGXC_COORD */
/* and the pool manager too */
+#ifdef XCP
+ if (PgPoolerPID != 0)
+#else
if (IS_PGXC_COORDINATOR && PgPoolerPID != 0)
+#endif
signal_child(PgPoolerPID, SIGTERM);
+#ifndef XCP
/* Unregister Node on GTM */
if (isNodeRegistered)
{
@@ -2244,6 +2316,7 @@ pmdie(SIGNAL_ARGS)
UnregisterGTM(GTM_NODE_DATANODE);
}
#endif
+#endif
/*
* If we're in recovery, we can't kill the startup process
@@ -2286,6 +2359,11 @@ pmdie(SIGNAL_ARGS)
signal_child(BgWriterPID, SIGTERM);
if (WalReceiverPID != 0)
signal_child(WalReceiverPID, SIGTERM);
+#ifdef XCP
+ /* and the pool manager too */
+ if (PgPoolerPID != 0)
+ signal_child(PgPoolerPID, SIGTERM);
+#endif /* XCP */
if (pmState == PM_RECOVERY)
{
/*
@@ -2312,7 +2390,8 @@ pmdie(SIGNAL_ARGS)
/* and the walwriter too */
if (WalWriterPID != 0)
signal_child(WalWriterPID, SIGTERM);
-#ifdef PGXC /* PGXC_COORD */
+#ifdef PGXC
+#ifndef XCP
/* and the pool manager too */
if (IS_PGXC_COORDINATOR && PgPoolerPID != 0)
signal_child(PgPoolerPID, SIGTERM);
@@ -2325,7 +2404,8 @@ pmdie(SIGNAL_ARGS)
else if (IS_PGXC_DATANODE)
UnregisterGTM(GTM_NODE_DATANODE);
}
-#endif
+#endif /* XCP */
+#endif /* PGXC */
pmState = PM_WAIT_BACKENDS;
}
@@ -2350,7 +2430,11 @@ pmdie(SIGNAL_ARGS)
if (StartupPID != 0)
signal_child(StartupPID, SIGQUIT);
#ifdef PGXC /* PGXC_COORD */
+#ifdef XCP
+ if (PgPoolerPID != 0)
+#else
if (IS_PGXC_COORDINATOR && PgPoolerPID != 0)
+#endif /* XCP */
signal_child(PgPoolerPID, SIGQUIT);
#endif
@@ -2515,10 +2599,14 @@ reaper(SIGNAL_ARGS)
PgArchPID = pgarch_start();
if (PgStatPID == 0)
PgStatPID = pgstat_start();
-#ifdef PGXC /* PGXC_COORD */
+#ifdef PGXC
+#ifdef XCP
+ if (PgPoolerPID == 0)
+#else
if (IS_PGXC_COORDINATOR && PgPoolerPID == 0)
+#endif /* XCP */
PgPoolerPID = StartPoolManager();
-#endif
+#endif /* PGXC */
/* at this point we are really open for business */
ereport(LOG,
@@ -2691,7 +2779,11 @@ reaper(SIGNAL_ARGS)
* Was it the pool manager? TODO decide how to handle
* Probably we should restart the system
*/
+#ifdef XCP
+ if (pid == PgPoolerPID)
+#else
if (IS_PGXC_COORDINATOR && pid == PgPoolerPID)
+#endif /* XCP */
{
PgPoolerPID = 0;
if (!EXIT_STATUS_0(exitstatus))
@@ -2932,8 +3024,20 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
signal_child(AutoVacPID, (SendStop ? SIGSTOP : SIGQUIT));
}
-#ifdef PGXC /* PGXC_COORD */
+#ifdef PGXC
/* Take care of the pool manager too */
+#ifdef XCP
+ if (pid == PgPoolerPID)
+ PgPoolerPID = 0;
+ else if (PgPoolerPID != 0 && !FatalError)
+ {
+ ereport(DEBUG2,
+ (errmsg_internal("sending %s to process %d",
+ (SendStop ? "SIGSTOP" : "SIGQUIT"),
+ (int) PgPoolerPID)));
+ signal_child(PgPoolerPID, (SendStop ? SIGSTOP : SIGQUIT));
+ }
+#else
if (IS_PGXC_COORDINATOR)
{
if (pid == PgPoolerPID)
@@ -2947,7 +3051,8 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
signal_child(PgPoolerPID, (SendStop ? SIGSTOP : SIGQUIT));
}
}
-#endif
+#endif /* XCP */
+#endif /* PGXC */
/*
* Force a power-cycle of the pgarch process too. (This isn't absolutely
@@ -3120,7 +3225,7 @@ PostmasterStateMachine(void)
*/
if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC) == 0 &&
StartupPID == 0 &&
-#ifdef PGXC /* PGXC_COORD */
+#ifdef PGXC
PgPoolerPID == 0 &&
#endif
WalReceiverPID == 0 &&
@@ -3218,7 +3323,7 @@ PostmasterStateMachine(void)
PgArchPID == 0 && PgStatPID == 0)
{
/* These other guys should be dead already */
-#ifdef PGXC /* PGXC_COORD */
+#ifdef PGXC
Assert(PgPoolerPID == 0);
#endif
Assert(StartupPID == 0);
@@ -4430,6 +4535,7 @@ sigusr1_handler(SIGNAL_ARGS)
}
#ifdef PGXC
+#ifndef XCP
/*
* Register node to GTM.
* A node can only be registered if it has reached a stable recovery state
@@ -4475,6 +4581,7 @@ sigusr1_handler(SIGNAL_ARGS)
}
}
#endif
+#endif
if (CheckPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER) &&
PgArchPID != 0)
diff --git a/src/backend/rewrite/rewriteHandler.c b/src/backend/rewrite/rewriteHandler.c
index 4c64d4d0b1..a3d525cdad 100644
--- a/src/backend/rewrite/rewriteHandler.c
+++ b/src/backend/rewrite/rewriteHandler.c
@@ -32,6 +32,7 @@
#include "pgxc/locator.h"
#include "pgxc/nodemgr.h"
#include "pgxc/pgxc.h"
+#include "pgxc/postgresql_fdw.h"
#include "nodes/nodes.h"
#include "optimizer/planner.h"
#include "optimizer/var.h"
@@ -1330,7 +1331,7 @@ rewriteTargetListUD(Query *parsetree, RangeTblEntry *target_rte,
*/
if (IS_PGXC_COORDINATOR &&
!IsConnFromCoord() &&
- !IsLocatorReplicated(GetLocatorType(RelationGetRelid(target_relation))))
+ !IsLocatorReplicated(GetRelationLocType(RelationGetRelid(target_relation))))
{
var = makeVar(parsetree->resultRelation,
XC_NodeIdAttributeNumber,
@@ -2692,7 +2693,7 @@ QueryRewriteCTAS(Query *parsetree)
cparsetree->utilityStmt = (Node *) create_stmt;
initStringInfo(&cquery);
- deparse_query(cparsetree, &cquery, NIL, false, false);
+ deparse_query(cparsetree, &cquery, NIL);
/* Finally, fire off the query to run the DDL */
ProcessUtility(cparsetree->utilityStmt, cquery.data, NULL, true, NULL,
@@ -2707,7 +2708,7 @@ QueryRewriteCTAS(Query *parsetree)
/* Get the SELECT query string */
initStringInfo(&cquery);
- deparse_query((Query *)stmt->query, &cquery, NIL, true, false);
+ deparse_query((Query *)stmt->query, &cquery, NIL);
selectstr = pstrdup(cquery.data);
/* Now, finally build the INSERT INTO statement */
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 78145472e1..6858f1ee80 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3,6 +3,11 @@
* bufmgr.c
* buffer manager interface routines
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -2048,8 +2053,12 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
{
int i;
+#ifdef XCP
+ if (!OidIsValid(MyCoordId) && rnode.backend != InvalidBackendId)
+#else
/* If it's a local relation, it's localbuf.c's problem. */
if (rnode.backend != InvalidBackendId)
+#endif
{
if (rnode.backend == MyBackendId)
DropRelFileNodeLocalBuffers(rnode.node, forkNum, firstDelBlock);
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 3c89dcad98..78219b8f53 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -3,6 +3,11 @@
* ipci.c
* POSTGRES inter-process communication initialization code.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -40,7 +45,11 @@
#include "storage/procsignal.h"
#include "storage/sinvaladt.h"
#include "storage/spin.h"
-
+#ifdef XCP
+#include "pgxc/pgxc.h"
+#include "pgxc/squeue.h"
+#include "pgxc/pause.h"
+#endif
shmem_startup_hook_type shmem_startup_hook = NULL;
@@ -126,6 +135,12 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
size = add_size(size, AutoVacuumShmemSize());
size = add_size(size, WalSndShmemSize());
size = add_size(size, WalRcvShmemSize());
+#ifdef XCP
+ if (IS_PGXC_DATANODE)
+ size = add_size(size, SharedQueueShmemSize());
+ if (IS_PGXC_COORDINATOR)
+ size = add_size(size, ClusterLockShmemSize());
+#endif
size = add_size(size, BTreeShmemSize());
size = add_size(size, SyncScanShmemSize());
size = add_size(size, AsyncShmemSize());
@@ -236,6 +251,16 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
WalSndShmemInit();
WalRcvShmemInit();
+#ifdef XCP
+ /*
+ * Set up distributed executor's shared queues
+ */
+ if (IS_PGXC_DATANODE)
+ SharedQueuesInit();
+ if (IS_PGXC_COORDINATOR)
+ ClusterLockShmemInit();
+#endif
+
/*
* Set up other modules that need some shared memory space
*/
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index e7f7e6b3ca..34ac658a00 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -18,23 +18,6 @@
* at need by checking for pid == 0.
*
#ifdef PGXC
- * Vanilla PostgreSQL assumes maximum TransactinIds in any snapshot is
- * arrayP->maxProcs. It does not apply to XC because XC's snapshot
- * should include XIDs running in other node, which may come at any
- * time. This means that needed size of xip varies from time to time.
- *
- * This must be handled properly in all the functions in this module.
- *
- * The member max_xcnt was added as SnapshotData member to indicate the
- * real size of xip array.
- *
- * Here, the following assumption is made for SnapshotData struct throughout
- * this module.
- *
- * 1. xip member physical size is indicated by max_xcnt member.
- * 2. If max_xcnt == 0, it means that xip members is NULL, and vise versa.
- * 3. xip (and subxip) are allocated usign malloc() or realloc() directly.
- *
* For Postgres-XC, there is some special handling for ANALYZE.
* An XID for a local ANALYZE command will never involve other nodes.
* Also, ANALYZE may run for a long time, affecting snapshot xmin values
@@ -58,6 +41,11 @@
* happen, it would tie up KnownAssignedXids indefinitely, so we protect
* ourselves by pruning the array when a valid list of running XIDs arrives.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -86,7 +74,6 @@
#include "pgxc/pgxc.h"
#include "access/gtm.h"
#include "storage/ipc.h"
-#include "pgxc/nodemgr.h"
/* PGXC_DATANODE */
#include "postmaster/autovacuum.h"
#endif
@@ -121,9 +108,6 @@ typedef struct ProcArrayStruct
* but actually it is maxProcs entries long.
*/
int pgprocnos[1]; /* VARIABLE LENGTH ARRAY */
-#ifdef PGXC
- int pgAVproxnos[1]; /* VARIABLE LENGTH ARRAY */
-#endif
} ProcArrayStruct;
static ProcArrayStruct *procArray;
@@ -196,10 +180,6 @@ void UnsetGlobalSnapshotData(void);
static bool GetPGXCSnapshotData(Snapshot snapshot);
static bool GetSnapshotDataDataNode(Snapshot snapshot);
static bool GetSnapshotDataCoordinator(Snapshot snapshot);
-static bool resizeXip(Snapshot snapshot, int newsize);
-static bool resizeSubxip(Snapshot snapshot, int newsize);
-static void cleanSnapshot(Snapshot snapshot);
-
/* Global snapshot data */
static SnapshotSource snapshot_source = SNAPSHOT_UNDEFINED;
static int gxmin = InvalidTransactionId;
@@ -253,13 +233,8 @@ ProcArrayShmemSize(void)
* standby in the current run, but we don't know that yet at the time
* shared memory is being set up.
*/
-#if 0 /* Reamins this code for the test to disable KnownAssignedXids in the slave */
-#define TOTAL_MAX_CACHED_SUBXIDS \
- (((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS) * (MaxCoords + MaxDataNodes))
-#else
#define TOTAL_MAX_CACHED_SUBXIDS \
((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)
-#endif
if (EnableHotStandby)
{
@@ -1345,8 +1320,6 @@ GetSnapshotData(Snapshot snapshot)
int subcount = 0;
bool suboverflowed = false;
- Assert(snapshot != NULL);
-
#ifdef PGXC /* PGXC_DATANODE */
/*
* Obtain a global snapshot for a Postgres-XC session
@@ -1354,18 +1327,22 @@ GetSnapshotData(Snapshot snapshot)
*/
if (GetPGXCSnapshotData(snapshot))
return snapshot;
+#ifdef XCP
/*
- * The codes below run when GetPGXCSnapshotData() couldn't get snapshot from
- * GTM. So no data in snapshot will be used.
+	 * Make the fallback behavior stricter
*/
- cleanSnapshot(snapshot);
+ if (!snapshot && !RecoveryInProgress() && IsPostmasterEnvironment &&
+ IsNormalProcessingMode() && !IsAutoVacuumLauncherProcess())
+ elog(ERROR, "Was unable to obtain a snapshot from GTM.");
+#else
+#endif
#endif
/*
* Fallback to standard routine, calculate snapshot from local proc arrey
* if no master connection
*/
-
+ Assert(snapshot != NULL);
/*
* Allocating space for maxProcs xids is usually overkill; numProcs would
@@ -1380,10 +1357,6 @@ GetSnapshotData(Snapshot snapshot)
*/
if (snapshot->xip == NULL)
{
-#ifdef PGXC
- resizeXip(snapshot, arrayP->maxProcs);
- resizeSubxip(snapshot, TOTAL_MAX_CACHED_SUBXIDS);
-#else
/*
* First call for this snapshot. Snapshot is same size whether or not
* we are in recovery, see later comments.
@@ -1401,7 +1374,6 @@ GetSnapshotData(Snapshot snapshot)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
-#endif
}
/*
@@ -1478,9 +1450,6 @@ GetSnapshotData(Snapshot snapshot)
continue;
/* Add XID to snapshot. */
-#ifdef PGXC
- resizeXip(snapshot, count + 1);
-#endif
snapshot->xip[count++] = xid;
/*
@@ -2696,12 +2665,12 @@ DisplayXidCache(void)
void
SetGlobalSnapshotData(int xmin, int xmax, int xcnt, int *xip)
{
+ if (gxip)
+ free(gxip);
snapshot_source = SNAPSHOT_COORDINATOR;
gxmin = xmin;
gxmax = xmax;
gxcnt = xcnt;
- if (gxip)
- free(gxip);
gxip = xip;
elog (DEBUG1, "global snapshot info: gxmin: %d, gxmax: %d, gxcnt: %d", gxmin, gxmax, gxcnt);
}
@@ -2712,12 +2681,12 @@ SetGlobalSnapshotData(int xmin, int xmax, int xcnt, int *xip)
void
UnsetGlobalSnapshotData(void)
{
+ if (gxip)
+ free(gxip);
snapshot_source = SNAPSHOT_UNDEFINED;
gxmin = InvalidTransactionId;
gxmax = InvalidTransactionId;
gxcnt = 0;
- if (gxip)
- free(gxip);
gxip = NULL;
elog (DEBUG1, "unset snapshot info");
}
@@ -2745,14 +2714,45 @@ GetPGXCSnapshotData(Snapshot snapshot)
* GTM not to include this transaction ID in snapshot.
* A vacuum worker starts as a normal transaction would.
*/
+#ifdef XCP
+ /* If we got the transaction id from GTM, we should get the snapshot
+ * from there, too
+ */
+ if (IS_PGXC_DATANODE || IsConnFromCoord() || IsAutoVacuumWorkerProcess() || IsXidFromGTM)
+#else
if (IS_PGXC_DATANODE || IsConnFromCoord() || IsAutoVacuumWorkerProcess())
+#endif
{
if (GetSnapshotDataDataNode(snapshot))
return true;
/* else fallthrough */
+ else
+#ifdef XCP
+ {
+ if (IsAutoVacuumLauncherProcess() || !IsNormalProcessingMode() || !IsPostmasterEnvironment)
+ {
+#endif
+ elog(LOG, "Will fall back to local snapshot for XID = %d, source = %d, gxmin = %d, autovac launch = %d, autovac = %d, normProcMode = %d, postEnv = %d",
+ GetCurrentTransactionId(), snapshot_source, gxmin,
+ IsAutoVacuumLauncherProcess(), IsAutoVacuumWorkerProcess(),
+ IsNormalProcessingMode(), IsPostmasterEnvironment);
+#ifdef XCP
+ }
+ else
+ {
+ elog(ERROR, "GTM error, no fallback, could not obtain snapshot. Current XID = %d, Autovac = %d", GetCurrentTransactionId(), IsAutoVacuumWorkerProcess());
+ }
+ }
+#endif
}
else if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && IsNormalProcessingMode())
{
+#ifdef XCP
+ /*
+ * GetSnapshotDataCoordinator will always fail if there is a GTM error.
+ * There is no need for special checking
+ */
+#endif
/* Snapshot has ever been received from remote Coordinator */
if (GetSnapshotDataCoordinator(snapshot))
return true;
@@ -2770,13 +2770,17 @@ GetPGXCSnapshotData(Snapshot snapshot)
* IsNormalProcessingMode() - checks for new connections
* IsAutoVacuumLauncherProcess - checks for autovacuum launcher process
*/
- if (IS_PGXC_DATANODE &&
+ if (IS_PGXC_DATANODE && !isRestoreMode &&
snapshot_source == SNAPSHOT_UNDEFINED &&
IsPostmasterEnvironment &&
IsNormalProcessingMode() &&
!IsAutoVacuumLauncherProcess())
{
+#ifdef XCP
+ elog(ERROR, "Do not have a GTM snapshot available");
+#else
elog(WARNING, "Do not have a GTM snapshot available");
+#endif
}
return false;
@@ -2791,7 +2795,11 @@ GetPGXCSnapshotData(Snapshot snapshot)
static bool
GetSnapshotDataDataNode(Snapshot snapshot)
{
+#ifdef XCP
+ Assert(IS_PGXC_DATANODE || IsConnFromCoord() || IsAutoVacuumWorkerProcess() || IsXidFromGTM);
+#else
Assert(IS_PGXC_DATANODE || IsConnFromCoord() || IsAutoVacuumWorkerProcess());
+#endif
/*
* Fallback to general case if Datanode is accessed directly by an application
@@ -2800,35 +2808,42 @@ GetSnapshotDataDataNode(Snapshot snapshot)
return GetSnapshotDataCoordinator(snapshot);
/* Have a look at cases where Datanode is accessed by cluster internally */
+#ifdef XCP
+ if (IsAutoVacuumWorkerProcess() || GetForceXidFromGTM() || IsAutoVacuumLauncherProcess() || IsXidFromGTM)
+#else
if (IsAutoVacuumWorkerProcess() || GetForceXidFromGTM())
+#endif
{
GTM_Snapshot gtm_snapshot;
bool canbe_grouped = (!FirstSnapshotSet) || (!IsolationUsesXactSnapshot());
elog(DEBUG1, "Getting snapshot for autovacuum. Current XID = %d", GetCurrentTransactionId());
gtm_snapshot = GetSnapshotGTM(GetCurrentTransactionId(), canbe_grouped);
-
if (!gtm_snapshot)
ereport(ERROR,
(errcode(ERRCODE_CONNECTION_FAILURE),
- errmsg("GTM error, could not obtain snapshot")));
+#ifdef XCP
+ errmsg("GTM error, could not obtain snapshot. Current XID = %d, Autovac = %d", GetCurrentTransactionId(), IsAutoVacuumWorkerProcess())));
+#else
+				 errmsg("GTM error, could not obtain snapshot.")));
+#endif
else {
+ if (gxip)
+ free(gxip);
snapshot_source = SNAPSHOT_DIRECT;
gxmin = gtm_snapshot->sn_xmin;
gxmax = gtm_snapshot->sn_xmax;
gxcnt = gtm_snapshot->sn_xcnt;
RecentGlobalXmin = gtm_snapshot->sn_recent_global_xmin;
- if (gxip)
- free(gxip);
if (gxcnt > 0)
{
- gxip = malloc(gxcnt * 4);
+ gxip = malloc(gxcnt * sizeof(int));
if (gxip == NULL)
{
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
}
- memcpy(gxip, gtm_snapshot->sn_xip, gxcnt * 4);
+ memcpy(gxip, gtm_snapshot->sn_xip, gxcnt * sizeof(int));
}
else
gxip = NULL;
@@ -2856,8 +2871,38 @@ GetSnapshotDataDataNode(Snapshot snapshot)
* maxProcs does not change at runtime, we can simply reuse the previous
* xip arrays if any. (This relies on the fact that all callers pass
* static SnapshotData structs.) */
- resizeXip(snapshot, Max(arrayP->maxProcs, gxcnt));
- resizeSubxip(snapshot, PGPROC_MAX_CACHED_SUBXIDS);
+ if (snapshot->xip == NULL)
+ {
+ ProcArrayStruct *arrayP = procArray;
+ /*
+ * First call for this snapshot
+ */
+ snapshot->xip = (TransactionId *)
+ malloc(Max(arrayP->maxProcs, gxcnt) * sizeof(TransactionId));
+ if (snapshot->xip == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ snapshot->max_xcnt = Max(arrayP->maxProcs, gxcnt);
+
+ Assert(snapshot->subxip == NULL);
+ snapshot->subxip = (TransactionId *)
+ malloc(arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
+ if (snapshot->subxip == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+ else if (snapshot->max_xcnt < gxcnt)
+ {
+ snapshot->xip = (TransactionId *)
+ realloc(snapshot->xip, gxcnt * sizeof(TransactionId));
+ if (snapshot->xip == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ snapshot->max_xcnt = gxcnt;
+ }
memcpy(snapshot->xip, gxip, gxcnt * sizeof(TransactionId));
snapshot->curcid = GetCurrentCommandId(false);
@@ -2930,7 +2975,17 @@ GetSnapshotDataDataNode(Snapshot snapshot)
continue;
if (proc != MyProc)
{
- resizeXip(snapshot, snapshot->xcnt+1);
+ if (snapshot->xcnt >= snapshot->max_xcnt)
+ {
+ snapshot->max_xcnt += arrayP->numProcs;
+
+ snapshot->xip = (TransactionId *)
+ realloc(snapshot->xip, snapshot->max_xcnt * sizeof(TransactionId));
+ if (snapshot->xip == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
snapshot->xip[snapshot->xcnt++] = xid;
elog(DEBUG1, "Adding Analyze for xid %d to snapshot", pgxact->xid);
}
@@ -2978,7 +3033,8 @@ GetSnapshotDataCoordinator(Snapshot snapshot)
if (!gtm_snapshot)
ereport(ERROR,
(errcode(ERRCODE_CONNECTION_FAILURE),
- errmsg("GTM error, could not obtain snapshot")));
+ errmsg("GTM error, could not obtain snapshot XID = %d",
+ GetCurrentTransactionId())));
else
{
snapshot->xmin = gtm_snapshot->sn_xmin;
@@ -2998,10 +3054,44 @@ GetSnapshotDataCoordinator(Snapshot snapshot)
* xip arrays if any. (This relies on the fact that all callers pass
* static SnapshotData structs.)
*/
+ if (snapshot->xip == NULL)
{
ProcArrayStruct *arrayP = procArray;
- resizeXip(snapshot, Max(arrayP->maxProcs, gtm_snapshot->sn_xcnt));
- resizeSubxip(snapshot, PGPROC_MAX_CACHED_SUBXIDS);
+ /*
+ * First call for this snapshot
+ */
+ snapshot->xip = (TransactionId *)
+ malloc(Max(arrayP->maxProcs, gtm_snapshot->sn_xcnt) * sizeof(TransactionId));
+ if (snapshot->xip == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ snapshot->max_xcnt = Max(arrayP->maxProcs, gtm_snapshot->sn_xcnt);
+
+ /*
+ * FIXME
+ *
+ * We really don't support subtransaction in PGXC right now, but
+ * when we would, we should fix the allocation below
+ */
+ Assert(snapshot->subxip == NULL);
+ snapshot->subxip = (TransactionId *)
+ malloc(arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
+
+ if (snapshot->subxip == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+ else if (snapshot->max_xcnt < gtm_snapshot->sn_xcnt)
+ {
+ snapshot->xip = (TransactionId *)
+ realloc(snapshot->xip, gtm_snapshot->sn_xcnt * sizeof(TransactionId));
+ if (snapshot->xip == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ snapshot->max_xcnt = gtm_snapshot->sn_xcnt;
}
memcpy(snapshot->xip, gtm_snapshot->sn_xip, gtm_snapshot->sn_xcnt * sizeof(TransactionId));
@@ -3031,83 +3121,6 @@ GetSnapshotDataCoordinator(Snapshot snapshot)
}
return false;
}
-
-/*
- * Handlers for xip and subxip member array size, only for XC.
- *
- * Assumes xip is NULL when max_xcnt == 0
- */
-static bool
-resizeXip(Snapshot snapshot, int newsize)
-{
-#define xipResizeUnit (64)
- newsize = ((newsize + xipResizeUnit - 1)/xipResizeUnit)*xipResizeUnit;
-
- if (snapshot->max_xcnt == 0)
- {
- snapshot->xip = malloc(newsize * sizeof(TransactionId));
- if (snapshot->xip == NULL)
- {
- ereport(ERROR,
- (errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("out of memory")));
- return false;
- }
- snapshot->max_xcnt = newsize;
- snapshot->xcnt = 0;
- return true;
- }
- else if (snapshot->max_xcnt >= newsize)
- return true;
- else
- {
- snapshot->xip = realloc(snapshot->xip, newsize * sizeof(TransactionId));
- if (snapshot->xip == NULL)
- {
- ereport(ERROR,
- (errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("out of memory")));
- return false;
- }
- snapshot->max_xcnt = newsize;
- return true;
- }
- return false;
-}
-
-/*
- * Because XC does not support subtransaction so far, this function allocates
- * subxip array with the fixes size of TOTAL_MAX_CACHED_SUBXIDS. This is
- * controlled by resizeXip() above.
- * If subxip member is not NULL, it assumes subxip array has TOTAL_MAX_CACHED_SUBXIDS
- * size, regardless what size is specified.
- * This part needs improvement when XC supports subtransaction.
- */
-static bool
-resizeSubxip(Snapshot snapshot, int newsize)
-{
- if (snapshot->subxip)
- return true;
- snapshot->subxip = (TransactionId *)
- malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
- if (snapshot->subxip == NULL)
- {
- ereport(ERROR,
- (errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("out of memory")));
- return false;
- }
- return true;
-}
-
-/* Cleanup the snapshot */
-static void
-cleanSnapshot(Snapshot snapshot)
-{
- snapshot->xcnt = 0;
- snapshot->subxcnt = 0;
- snapshot->xmin = snapshot->xmax = InvalidTransactionId;
-}
#endif /* PGXC */
/* ----------------------------------------------
@@ -3451,25 +3464,6 @@ static void
KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
bool exclusive_lock)
{
-#ifdef PGXC
- /*
- * Postgres-XC Version 1.0.x supports log shipping replication but not hot standby
- * because hot standby needs to provide consistent database views for all the
- * datanode, which is not available yet.
- *
- * On the other hand, in the slave, current KnownAssignedXids ignores latter half
- * of XLOG_XACT_ASSIGNMENT wal record and registers all the possible XIDs found
- * at the first half of the wal record. Some of them can be missing and such missing
- * Xids remain in the buffer, causing overflow and the slave stops.
- *
- * It will need various change in the code, while the hot standby does not work correctly.
- *
- * For short term solution for Version 1.0.x, it was determined to disable whole hot
- * hot staydby.
- *
- * Hot standby correction will be done in next major release.
- */
-#else
/* use volatile pointer to prevent code rearrangement */
volatile ProcArrayStruct *pArray = procArray;
TransactionId next_xid;
@@ -3574,7 +3568,6 @@ KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
pArray->headKnownAssignedXids = head;
SpinLockRelease(&pArray->known_assigned_xids_lck);
}
-#endif
}
/*
@@ -3744,25 +3737,6 @@ KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
static void
KnownAssignedXidsRemovePreceding(TransactionId removeXid)
{
-#ifdef PGXC
- /*
- * Postgres-XC Version 1.0.x supports log shipping replication but not hot standby
- * because hot standby needs to provide consistent database views for all the
- * datanode, which is not available yet.
- *
- * On the other hand, in the slave, current KnownAssignedXids ignores latter half
- * of XLOG_XACT_ASSIGNMENT wal record and registers all the possible XIDs found
- * at the first half of the wal record. Some of them can be missing and such missing
- * Xids remain in the buffer, causing overflow and the slave stops.
- *
- * It will need various change in the code, while the hot standby does not work correctly.
- *
- * For short term solution for Version 1.0.x, it was determined to disable whole hot
- * hot staydby.
- *
- * Hot standby correction will be done in next major release.
- */
-#else
/* use volatile pointer to prevent code rearrangement */
volatile ProcArrayStruct *pArray = procArray;
int count = 0;
@@ -3828,7 +3802,6 @@ KnownAssignedXidsRemovePreceding(TransactionId removeXid)
/* Opportunistically compress the array */
KnownAssignedXidsCompress(false);
-#endif
}
/*
@@ -3858,26 +3831,6 @@ static int
KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
TransactionId xmax)
{
-#ifdef PGXC
- /*
- * Postgres-XC Version 1.0.x supports log shipping replication but not hot standby
- * because hot standby needs to provide consistent database views for all the
- * datanode, which is not available yet.
- *
- * On the other hand, in the slave, current KnownAssignedXids ignores latter half
- * of XLOG_XACT_ASSIGNMENT wal record and registers all the possible XIDs found
- * at the first half of the wal record. Some of them can be missing and such missing
- * Xids remain in the buffer, causing overflow and the slave stops.
- *
- * It will need various change in the code, while the hot standby does not work correctly.
- *
- * For short term solution for Version 1.0.x, it was determined to disable whole hot
- * hot staydby.
- *
- * Hot standby correction will be done in next major release.
- */
- return 0;
-#else
/* use volatile pointer to prevent code rearrangement */
volatile ProcArrayStruct *pArray = procArray;
int count = 0;
@@ -3928,7 +3881,6 @@ KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
}
return count;
-#endif
}
/*
@@ -3938,26 +3890,6 @@ KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
static TransactionId
KnownAssignedXidsGetOldestXmin(void)
{
-#ifdef PGXC
- /*
- * Postgres-XC Version 1.0.x supports log shipping replication but not hot standby
- * because hot standby needs to provide consistent database views for all the
- * datanode, which is not available yet.
- *
- * On the other hand, in the slave, current KnownAssignedXids ignores latter half
- * of XLOG_XACT_ASSIGNMENT wal record and registers all the possible XIDs found
- * at the first half of the wal record. Some of them can be missing and such missing
- * Xids remain in the buffer, causing overflow and the slave stops.
- *
- * It will need various change in the code, while the hot standby does not work correctly.
- *
- * For short term solution for Version 1.0.x, it was determined to disable whole hot
- * hot staydby.
- *
- * Hot standby correction will be done in next major release.
- */
- return InvalidTransactionId;
-#else
/* use volatile pointer to prevent code rearrangement */
volatile ProcArrayStruct *pArray = procArray;
int head,
@@ -3980,7 +3912,6 @@ KnownAssignedXidsGetOldestXmin(void)
}
return InvalidTransactionId;
-#endif
}
/*
@@ -4028,6 +3959,120 @@ KnownAssignedXidsDisplay(int trace_level)
pfree(buf.data);
}
+
+#ifdef XCP
+/*
+ * GetGlobalSessionInfo
+ *
+ * Determine the global session id of the specified backend process
+ * Returns coordinator node_id and pid of the initiating coordinator session.
+ * If no such backend or global session id is not defined for the backend
+ * return zero values.
+ */
+void
+GetGlobalSessionInfo(int pid, Oid *coordId, int *coordPid)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ *coordId = InvalidOid;
+ *coordPid = 0;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ /*
+ * Scan processes and get from it info about the parent session
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ volatile PGPROC *proc = &allProcs[arrayP->pgprocnos[index]];
+
+ if (proc->pid == pid)
+ {
+ *coordId = proc->coordId;
+ *coordPid = proc->coordPid;
+ break;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+}
+
+
+/*
+ * GetFirstBackendId
+ *
+ * Determine BackendId of the current process.
+ * The caller must hold the ProcArrayLock and the global session id should
+ * be defined.
+ */
+int
+GetFirstBackendId(int *numBackends, int *backends)
+{
+ ProcArrayStruct *arrayP = procArray;
+ Oid coordId = MyProc->coordId;
+ int coordPid = MyProc->coordPid;
+ int bCount = 0;
+ int bPids[MaxBackends];
+ int index;
+
+ Assert(OidIsValid(coordId));
+
+ /* Scan processes */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ volatile PGPROC *proc = &allProcs[arrayP->pgprocnos[index]];
+
+ /* Skip MyProc */
+ if (proc == MyProc)
+ continue;
+
+ if (proc->coordId == coordId && proc->coordPid == coordPid)
+ {
+ /* BackendId is the same for all backends of the session */
+ if (proc->firstBackendId != InvalidBackendId)
+ return proc->firstBackendId;
+
+ bPids[bCount++] = proc->pid;
+ }
+ }
+
+ if (*numBackends > 0)
+ {
+ int i, j;
+ /*
+ * This is not the first invocation, to prevent endless loop in case
+ * if first backend failed to complete initialization check if all the
+		 * processes which were initially found are still here, throw error if
+ * not.
+ */
+ for (i = 0; i < *numBackends; i++)
+ {
+ bool found = false;
+
+ for (j = 0; j < bCount; j++)
+ {
+ if (bPids[j] == backends[i])
+ {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ elog(ERROR, "Failed to determine BackendId for distributed session");
+ }
+ }
+ else
+ {
+ *numBackends = bCount;
+ if (bCount)
+ memcpy(backends, bPids, bCount * sizeof(int));
+ }
+ return InvalidBackendId;
+}
+#endif
+
/*
* KnownAssignedXidsReset
* Resets KnownAssignedXids to be empty
diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c
index 71cbd93efa..9e0ab12c97 100644
--- a/src/backend/storage/ipc/procsignal.c
+++ b/src/backend/storage/ipc/procsignal.c
@@ -4,6 +4,11 @@
* Routines for interprocess signalling
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 1ac46b87f9..61220275a3 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -3,6 +3,11 @@
* lock.c
* POSTGRES primary lock mechanism
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -1233,6 +1238,79 @@ LockCheckConflicts(LockMethod lockMethodTable,
return STATUS_OK;
}
+
+#ifdef XCP
+ /*
+ * So the lock is conflicting with locks held by some other backend.
+ * But the backend may belong to the same distributed session. We need to
+ * detect such cases and either allow the lock or throw error, because
+ * waiting for the lock most probably would cause deadlock.
+ */
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+ if (proc->coordPid > 0)
+ {
+ /* Count locks held by this process and friends */
+ int myHolding[numLockModes + 1];
+ SHM_QUEUE *procLocks;
+ PROCLOCK *nextplock;
+
+ /* Initialize the counters */
+ for (i = 1; i <= numLockModes; i++)
+ myHolding[i] = 0;
+ otherLocks = 0;
+
+ /* Iterate over processes associated with the lock */
+ procLocks = &(lock->procLocks);
+
+ nextplock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, lockLink));
+ while (nextplock)
+ {
+ PGPROC *nextproc = nextplock->tag.myProc;
+
+ if (nextproc->coordPid == proc->coordPid &&
+ nextproc->coordId == proc->coordId)
+ {
+ /*
+ * The process belongs to same distributed session, count locks
+ */
+ myLocks = nextplock->holdMask;
+ for (i = 1; i <= numLockModes; i++)
+ myHolding[i] += ((myLocks & LOCKBIT_ON(i)) ? 1 : 0);
+ }
+ /* get next proclock */
+ nextplock = (PROCLOCK *)
+ SHMQueueNext(procLocks, &nextplock->lockLink,
+ offsetof(PROCLOCK, lockLink));
+ }
+
+ /* Summarize locks held by other processes */
+ for (i = 1; i <= numLockModes; i++)
+ {
+ if (lock->granted[i] > myHolding[i])
+ otherLocks |= LOCKBIT_ON(i);
+ }
+
+ /*
+ * Yet another check.
+ */
+ if (!(lockMethodTable->conflictTab[lockmode] & otherLocks))
+ {
+ LWLockRelease(ProcArrayLock);
+ /* no conflict. OK to get the lock */
+ PROCLOCK_PRINT("LockCheckConflicts: resolved as held by friend",
+ proclock);
+#ifdef LOCK_DEBUG
+ elog(LOG, "Allow lock as held by the same distributed session [%u,%u] %s",
+ lock->tag.locktag_field1, lock->tag.locktag_field2,
+ lockMethodTable->lockModeNames[lockmode]);
+#endif
+ return STATUS_OK;
+ }
+ }
+ LWLockRelease(ProcArrayLock);
+#endif
+
PROCLOCK_PRINT("LockCheckConflicts: conflicting", proclock);
return STATUS_FOUND;
}
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 95d4b37bef..8da345ea0d 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -11,6 +11,11 @@
* LWLocks to protect its shared state.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -31,7 +36,10 @@
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/spin.h"
-
+#ifdef XCP
+#include "pgxc/nodemgr.h"
+#include "pgxc/squeue.h"
+#endif
/* We use the ShmemLock spinlock to protect LWLockAssign */
extern slock_t *ShmemLock;
@@ -201,6 +209,12 @@ NumLWLocks(void)
/* predicate.c needs one per old serializable xid buffer */
numLocks += NUM_OLDSERXID_BUFFERS;
+#ifdef XCP
+ /* squeue.c needs one per consumer node in each shared queue.
+ * Max number of consumers is MaxDataNodes-1 */
+ numLocks += NUM_SQUEUES * (MaxDataNodes-1);
+#endif
+
/*
* Add any requested by loadable modules; for backwards-compatibility
* reasons, allocate at least NUM_USER_DEFINED_LWLOCKS of them even if
@@ -739,6 +753,7 @@ LWLockRelease(LWLockId lockid)
}
if (i < 0)
elog(ERROR, "lock %d is not held", (int) lockid);
+
num_held_lwlocks--;
for (; i < num_held_lwlocks; i++)
held_lwlocks[i] = held_lwlocks[i + 1];
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 16fe9dfb0f..66c021f0a2 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -3,6 +3,11 @@
* proc.c
* routines to manage per-process shared memory data structure
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -357,6 +362,10 @@ InitProcess(void)
MyProc->backendId = InvalidBackendId;
MyProc->databaseId = InvalidOid;
MyProc->roleId = InvalidOid;
+#ifdef XCP
+ MyProc->coordId = InvalidOid;
+ MyProc->coordPid = 0;
+#endif
MyPgXact->inCommit = false;
MyPgXact->vacuumFlags = 0;
#ifdef PGXC
@@ -518,6 +527,10 @@ InitAuxiliaryProcess(void)
MyProc->backendId = InvalidBackendId;
MyProc->databaseId = InvalidOid;
MyProc->roleId = InvalidOid;
+#ifdef XCP
+ MyProc->coordId = InvalidOid;
+ MyProc->coordPid = 0;
+#endif
#ifdef PGXC
MyProc->isPooler = false;
if (IsPGXCPoolerProcess())
diff --git a/src/backend/tcop/dest.c b/src/backend/tcop/dest.c
index c6ab54aa3b..e7fc308e7d 100644
--- a/src/backend/tcop/dest.c
+++ b/src/backend/tcop/dest.c
@@ -4,6 +4,11 @@
* support for communication destinations
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -33,6 +38,9 @@
#include "commands/copy.h"
#include "commands/createas.h"
#include "executor/functions.h"
+#ifdef XCP
+#include "executor/producerReceiver.h"
+#endif
#include "executor/tstoreReceiver.h"
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
@@ -125,6 +133,11 @@ CreateDestReceiver(CommandDest dest)
case DestSQLFunction:
return CreateSQLFunctionDestReceiver();
+
+#ifdef XCP
+ case DestProducer:
+ return CreateProducerDestReceiver();
+#endif
}
/* should never get here */
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index b99320f529..633b69b8db 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -3,6 +3,11 @@
* postgres.c
* POSTGRES C Backend Interface
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -89,10 +94,14 @@
/* PGXC_COORD */
#include "pgxc/execRemote.h"
#include "pgxc/barrier.h"
-#include "optimizer/pgxcplan.h"
+#include "pgxc/planner.h"
#include "nodes/nodes.h"
#include "pgxc/poolmgr.h"
#include "pgxc/pgxcnode.h"
+#ifdef XCP
+#include "pgxc/pause.h"
+#include "pgxc/squeue.h"
+#endif
#include "commands/copy.h"
/* PGXC_DATANODE */
#include "access/transam.h"
@@ -374,10 +383,105 @@ SocketBackend(StringInfo inBuf)
{
int qtype;
+#ifdef XCP
+ /*
+ * Session from data node may need to do some background work if it is
+ * running producing subplans. So just poll the connection, and if it does
+ * not have input for us do the work.
+ * If we do not have producing portals we should use the blocking read
+ * to avoid loop consuming 100% of CPU
+ */
+ if (IS_PGXC_DATANODE && IsConnFromDatanode())
+ {
+ /*
+ * Advance producing portals or poll client connection until we have
+ * a client command to handle.
+ */
+ while (true)
+ {
+ unsigned char c;
+
+ qtype = pq_getbyte_if_available(&c);
+ if (qtype == 0) /* no commands, do producing */
+ {
+ /*
+ * No command yet, try to advance producing portals, and
+ * depending on result do:
+ * -1 No producing portals, block and wait for client command
+ * 0 All producing portals are paused, sleep for a moment and
+ * then check again either we have client command or some
+ * portal is awaken.
+ * 1 check for client command and more continue advancing
+ * producers immediately
+ */
+ int activePortals = -1;
+ ListCell *lc = list_head(getProducingPortals());
+ while (lc)
+ {
+ Portal p = (Portal) lfirst(lc);
+ int result;
+
+ /*
+ * Get next already, because next call may remove cell from
+ * the list and invalidate next reference
+ */
+ lc = lnext(lc);
+
+ result = AdvanceProducingPortal(p, true);
+ if (result == 0)
+ {
+ /* Portal is paused */
+ if (activePortals < 0)
+ activePortals = 0;
+ }
+ else if (result > 0)
+ {
+ if (activePortals < 0)
+ activePortals = result;
+ else
+ activePortals += result;
+ }
+ }
+ if (activePortals < 0)
+ {
+ /* no producers at all, we may wait while next command */
+ qtype = pq_getbyte();
+ break;
+ }
+ else if (activePortals == 0)
+ {
+ /* all producers are paused, sleep a little to allow other
+ * processes to go */
+ pg_usleep(10000L);
+ }
+ }
+ else if (qtype == 1)
+ {
+ /* command code in c is defined, move it to qtype
+ * and break to handle the command */
+ qtype = c;
+ break;
+ }
+ else
+ {
+ /* error, default handling, qtype is already set to EOF */
+ break;
+ }
+ }
+ }
+ else
+ {
+ /*
+ * Get message type code from the frontend.
+ */
+ qtype = pq_getbyte();
+ }
+#else
/*
* Get message type code from the frontend.
*/
qtype = pq_getbyte();
+#endif
if (qtype == EOF) /* frontend disconnected */
{
@@ -449,6 +553,9 @@ SocketBackend(StringInfo inBuf)
break;
case 'B': /* bind */
+#ifdef XCP /* PGXC_DATANODE */
+ case 'p': /* plan */
+#endif
case 'C': /* close */
case 'D': /* describe */
case 'E': /* execute */
@@ -666,6 +773,7 @@ pg_analyze_and_rewrite(Node *parsetree, const char *query_string,
querytree_list = pg_rewrite_query(query);
#ifdef PGXC
+#ifndef XCP
if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
{
ListCell *lc;
@@ -679,6 +787,7 @@ pg_analyze_and_rewrite(Node *parsetree, const char *query_string,
}
}
#endif
+#endif
TRACE_POSTGRESQL_QUERY_REWRITE_DONE(query_string);
@@ -719,6 +828,6 @@ pg_analyze_and_rewrite_params(Node *parsetree,
 if (post_parse_analyze_hook)
 (*post_parse_analyze_hook) (pstate, query);
free_parsestate(pstate);
if (log_parser_stats)
@@ -953,6 +1065,37 @@ exec_simple_query(const char *query_string)
*/
parsetree_list = pg_parse_query(query_string);
+#ifdef XCP
+ if (IS_PGXC_COORDINATOR && !IsConnFromCoord() &&
+ list_length(parsetree_list) > 1)
+ {
+ /*
+ * There is a bug in old code, if one query contains multiple utility
+ * statements, entire query may be sent multiple times to the Datanodes
+ * for execution. That is becoming a severe problem, if query contains
+ * COMMIT or ROLLBACK. After executed for the first time the transaction
+ * handling statement would write CLOG entry for current xid, but other
+ * executions would be done with the same xid, causing PANIC on the
+ * Datanodes because of already existing CLOG record. Datanode is
+ * restarting all sessions if it PANICs, and affects all cluster users.
+ * Multiple utility statements may result in strange error messages,
+ * but sometimes they work, and are used in many applications, so we do not
+ * want to disable them completely, just protect against severe
+ * vulnerability here.
+ */
+ foreach(parsetree_item, parsetree_list)
+ {
+ Node *parsetree = (Node *) lfirst(parsetree_item);
+
+ if (IsTransactionExitStmt(parsetree))
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ errmsg("COMMIT or ROLLBACK "
+ "in multi-statement queries not allowed")));
+ }
+ }
+#endif
+
/* Log immediately if dictated by log_statement */
if (check_log_statement(parsetree_list))
{
@@ -1423,6 +1566,7 @@ exec_parse_message(const char *query_string, /* string to execute */
querytree_list = pg_rewrite_query(query);
#ifdef PGXC
+#ifndef XCP
if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
{
ListCell *lc;
@@ -1436,6 +1580,7 @@ exec_parse_message(const char *query_string, /* string to execute */
}
}
#endif
+#endif
/* Done with the snapshot used for parsing */
if (snapshot_set)
@@ -1534,6 +1679,143 @@ exec_parse_message(const char *query_string, /* string to execute */
debug_query_string = NULL;
}
+#ifdef XCP
+/*
+ * exec_plan_message
+ *
+ * Execute a "Plan" protocol message - already planned statement.
+ */
+static void
+exec_plan_message(const char *query_string, /* source of the query */
+ const char *stmt_name, /* name for prepared stmt */
+ const char *plan_string, /* encoded plan to execute */
+ char **paramTypeNames, /* parameter type names */
+ int numParams) /* number of parameters */
+{
+ MemoryContext oldcontext;
+ bool save_log_statement_stats = log_statement_stats;
+ char msec_str[32];
+ Oid *paramTypes;
+ CachedPlanSource *psrc;
+
+ /* Statement name should not be empty */
+ Assert(stmt_name[0]);
+
+ /*
+ * Report query to various monitoring facilities.
+ */
+ debug_query_string = query_string;
+
+ pgstat_report_activity(STATE_RUNNING, query_string);
+
+ set_ps_display("PLAN", false);
+
+ if (save_log_statement_stats)
+ ResetUsage();
+
+ ereport(DEBUG2,
+ (errmsg("plan %s: %s",
+ *stmt_name ? stmt_name : "<unnamed>",
+ query_string)));
+
+ /*
+ * Start up a transaction command so we can decode plan etc. (Note
+ * that this will normally change current memory context.) Nothing happens
+ * if we are already in one.
+ */
+ start_xact_command();
+
+ /*
+ * XXX
+ * Postgres decides about memory context to use based on "named/unnamed"
+ * assuming named statement is executed multiple times and unnamed is
+ * executed once.
+ * Plan message always provide statement name, but we may use different
+ * criteria, like if plan is referencing "internal" parameters it probably
+ * will be executed multiple times, if not - once.
+ * So far optimize for multiple executions.
+ */
+ /* Named prepared statement --- parse in MessageContext */
+ oldcontext = MemoryContextSwitchTo(MessageContext);
+// unnamed_stmt_context =
+// AllocSetContextCreate(CacheMemoryContext,
+// "unnamed prepared statement",
+// ALLOCSET_DEFAULT_MINSIZE,
+// ALLOCSET_DEFAULT_INITSIZE,
+// ALLOCSET_DEFAULT_MAXSIZE);
+// oldcontext = MemoryContextSwitchTo(unnamed_stmt_context);
+
+ /*
+ * Determine parameter types
+ */
+ if (numParams > 0)
+ {
+ int cnt_param;
+ paramTypes = (Oid *) palloc(numParams * sizeof(Oid));
+ /* we don't expect type mod */
+ for (cnt_param = 0; cnt_param < numParams; cnt_param++)
+ parseTypeString(paramTypeNames[cnt_param], &paramTypes[cnt_param],
+ NULL);
+
+ }
+
+ /* If we got a cancel signal, quit */
+ CHECK_FOR_INTERRUPTS();
+
+ psrc = CreateCachedPlan(NULL, query_string, stmt_name, "REMOTE SUBPLAN");
+
+ CompleteCachedPlan(psrc, NIL, NULL, paramTypes, numParams, NULL, NULL,
+ CURSOR_OPT_GENERIC_PLAN, false);
+
+ /*
+ * Store the query as a prepared statement. See above comments.
+ */
+ StorePreparedStatement(stmt_name, psrc, false);
+
+ SetRemoteSubplan(psrc, plan_string);
+
+ MemoryContextSwitchTo(oldcontext);
+
+ /*
+ * We do NOT close the open transaction command here; that only happens
+ * when the client sends Sync. Instead, do CommandCounterIncrement just
+ * in case something happened during parse/plan.
+ */
+ CommandCounterIncrement();
+
+ /*
+ * Send ParseComplete.
+ */
+ if (whereToSendOutput == DestRemote)
+ pq_putemptymessage('1');
+
+ /*
+ * Emit duration logging if appropriate.
+ */
+ switch (check_log_duration(msec_str, false))
+ {
+ case 1:
+ ereport(LOG,
+ (errmsg("duration: %s ms", msec_str),
+ errhidestmt(true)));
+ break;
+ case 2:
+ ereport(LOG,
+ (errmsg("duration: %s ms parse %s: %s",
+ msec_str,
+ *stmt_name ? stmt_name : "<unnamed>",
+ query_string),
+ errhidestmt(true)));
+ break;
+ }
+
+ if (save_log_statement_stats)
+ ShowUsage("PLAN MESSAGE STATISTICS");
+
+ debug_query_string = NULL;
+}
+#endif
+
/*
* exec_bind_message
*
@@ -2741,6 +3023,14 @@ die(SIGNAL_ARGS)
}
}
+#ifdef XCP
+ /* release cluster lock if holding it */
+ if (cluster_ex_lock_held)
+ {
+ ReleaseClusterLock(true);
+ }
+#endif
+
/* If we're still here, waken anything waiting on the process latch */
if (MyProc)
SetLatch(&MyProc->procLatch);
@@ -3628,7 +3918,12 @@ process_postgres_switches(int argc, char *argv[], GucContext ctx)
{
ereport(FATAL,
(errcode(ERRCODE_SYNTAX_ERROR),
+#ifdef XCP
+ errmsg("Postgres-XL: must start as either a Coordinator (--coordinator) or Datanode (-datanode)\n")));
+#else
errmsg("Postgres-XC: must start as either a Coordinator (--coordinator) or Datanode (-datanode)\n")));
+#endif
+
}
if (!IsPostmasterEnvironment)
{
@@ -3705,10 +4000,17 @@ PostgresMain(int argc, char *argv[], const char *username)
int *xip;
/* Timestamp info */
TimestampTz timestamp;
+#ifndef XCP
PoolHandle *pool_handle;
+#endif
remoteConnType = REMOTE_CONN_APP;
#endif
+#ifdef XCP
+ parentPGXCNode = NULL;
+ cluster_lock_held = false;
+ cluster_ex_lock_held = false;
+#endif /* XCP */
/*
* Initialize globals (already done if under postmaster, but not if
@@ -3960,7 +4262,39 @@ PostgresMain(int argc, char *argv[], const char *username)
if (!IsUnderPostmaster)
PgStartTime = GetCurrentTimestamp();
-#ifdef PGXC /* PGXC_COORD */
+#ifdef PGXC
+ /*
+ * Initialize key pair to be used as object id while using advisory lock
+ * for backup
+ */
+ xc_lockForBackupKey1 = Int32GetDatum(XC_LOCK_FOR_BACKUP_KEY_1);
+ xc_lockForBackupKey2 = Int32GetDatum(XC_LOCK_FOR_BACKUP_KEY_2);
+
+#ifdef XCP
+ if (IsUnderPostmaster)
+ {
+ /*
+ * Prepare to handle distributed requests.
+ * Do that after sending down ReadyForQuery, to avoid pooler
+ * blocking.
+ */
+ start_xact_command();
+ InitMultinodeExecutor(false);
+ finish_xact_command();
+ }
+
+ /* Set up the post parse analyze hook */
+ post_parse_analyze_hook = ParseAnalyze_callback;
+
+ /* if we exit, try to release cluster lock properly */
+ on_shmem_exit(PGXCCleanClusterLock, 0);
+
+ /* if we exit, try to release shared queues */
+ on_shmem_exit(SharedQueuesCleanup, 0);
+
+ /* If we exit, first try and clean connections and send to pool */
+ on_proc_exit(PGXCNodeCleanAndRelease, 0);
+#else
/* If this postmaster is launched from another Coord, do not initialize handles. skip it */
if (IS_PGXC_COORDINATOR && !IsPoolHandle())
{
@@ -3987,6 +4321,7 @@ PostgresMain(int argc, char *argv[], const char *username)
/* If we exit, first try and clean connections and send to pool */
on_proc_exit (PGXCNodeCleanAndRelease, 0);
}
+#endif /* XCP */
if (IS_PGXC_DATANODE)
{
/* If we exit, first try and clean connection to GTM */
@@ -4142,6 +4477,15 @@ PostgresMain(int argc, char *argv[], const char *username)
}
ReadyForQuery(whereToSendOutput);
+#ifdef XCP
+ /*
+ * Before we read any new command we now should wait while all
+ * already closed portals which are still producing finish their
+ * work.
+ */
+ if (IS_PGXC_DATANODE && IsConnFromDatanode())
+ cleanupClosedProducers();
+#endif
#ifdef PGXC
/*
* Helps us catch any problems where we did not send down a snapshot
@@ -4190,6 +4534,24 @@ PostgresMain(int argc, char *argv[], const char *username)
if (ignore_till_sync && firstchar != EOF)
continue;
+#ifdef XCP
+ /*
+ * Acquire the ClusterLock before starting query processing.
+ *
+ * If we are inside a transaction block, this lock will be already held
+ * when the transaction began
+ *
+ * If the session has invoked a PAUSE CLUSTER earlier, then this lock
+ * will be held already in exclusive mode. No need to lock in that case
+ */
+ if (IsUnderPostmaster && IS_PGXC_COORDINATOR && !cluster_ex_lock_held && !cluster_lock_held)
+ {
+ bool exclusive = false;
+ AcquireClusterLock(exclusive);
+ cluster_lock_held = true;
+ }
+#endif /* XCP */
+
switch (firstchar)
{
case 'Q': /* simple query */
@@ -4247,6 +4609,38 @@ PostgresMain(int argc, char *argv[], const char *username)
}
break;
+#ifdef XCP
+ case 'p': /* plan */
+ {
+ const char *stmt_name;
+ const char *query_string;
+ const char *plan_string;
+ int numParams;
+ char **paramTypes = NULL;
+
+ /* Set statement_timestamp() */
+ SetCurrentStatementStartTimestamp();
+
+ stmt_name = pq_getmsgstring(&input_message);
+ query_string = pq_getmsgstring(&input_message);
+ plan_string = pq_getmsgstring(&input_message);
+ numParams = pq_getmsgint(&input_message, 2);
+ paramTypes = (char **)palloc(numParams * sizeof(char *));
+ if (numParams > 0)
+ {
+ int i;
+ for (i = 0; i < numParams; i++)
+ paramTypes[i] = (char *)
+ pq_getmsgstring(&input_message);
+ }
+ pq_getmsgend(&input_message);
+
+ exec_plan_message(query_string, stmt_name, plan_string,
+ paramTypes, numParams);
+ }
+ break;
+#endif
+
case 'B': /* bind */
/* Set statement_timestamp() */
SetCurrentStatementStartTimestamp();
@@ -4463,7 +4857,7 @@ PostgresMain(int argc, char *argv[], const char *username)
if (xcnt > 0)
{
int i;
- xip = malloc(xcnt * 4);
+ xip = malloc(xcnt * sizeof(int));
if (xip == NULL)
{
ereport(ERROR,
@@ -4528,6 +4922,22 @@ PostgresMain(int argc, char *argv[], const char *username)
errmsg("invalid frontend message type %d",
firstchar)));
}
+
+#ifdef XCP
+ /*
+ * If the connection is going idle, release the cluster lock. However
+ * if the session had invoked a PAUSE CLUSTER earlier, then wait for a
+ * subsequent UNPAUSE to release this lock
+ */
+ if (IsUnderPostmaster && IS_PGXC_COORDINATOR && !IsAbortedTransactionBlockState()
+ && !IsTransactionOrTransactionBlock()
+ && cluster_lock_held && !cluster_ex_lock_held)
+ {
+ bool exclusive = false;
+ ReleaseClusterLock(exclusive);
+ cluster_lock_held = false;
+ }
+#endif /* XCP */
} /* end of input-reading loop */
/* can't get here because the above loop never exits */
diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c
index 3524410025..afc4d0f774 100644
--- a/src/backend/tcop/pquery.c
+++ b/src/backend/tcop/pquery.c
@@ -3,6 +3,11 @@
* pquery.c
* POSTGRES process query command code
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -20,9 +25,14 @@
#include "executor/tstoreReceiver.h"
#include "miscadmin.h"
#include "pg_trace.h"
+#ifdef XCP
+#include "catalog/pgxc_node.h"
+#include "executor/producerReceiver.h"
+#include "pgxc/nodemgr.h"
+#endif
#ifdef PGXC
#include "pgxc/pgxc.h"
-#include "optimizer/pgxcplan.h"
+#include "pgxc/planner.h"
#include "pgxc/execRemote.h"
#include "access/relscan.h"
#endif
@@ -60,7 +70,6 @@ static long DoPortalRunFetch(Portal portal,
DestReceiver *dest);
static void DoPortalRewind(Portal portal);
-
/*
* CreateQueryDesc
*/
@@ -93,6 +102,11 @@ CreateQueryDesc(PlannedStmt *plannedstmt,
qd->planstate = NULL;
qd->totaltime = NULL;
+#ifdef XCP
+ qd->squeue = NULL;
+ qd->myindex = -1;
+#endif
+
return qd;
}
@@ -347,6 +361,11 @@ ChoosePortalStrategy(List *stmts)
{
PlannedStmt *pstmt = (PlannedStmt *) stmt;
+#ifdef XCP
+ if (list_length(pstmt->distributionRestrict) > 1)
+ return PORTAL_DISTRIBUTED;
+#endif
+
if (pstmt->canSetTag)
{
if (pstmt->commandType == CMD_SELECT &&
@@ -530,7 +549,11 @@ PortalStart(Portal portal, ParamListInfo params,
ResourceOwner saveResourceOwner;
MemoryContext savePortalContext;
MemoryContext oldContext;
+#ifdef XCP
+ QueryDesc *queryDesc = NULL;
+#else
QueryDesc *queryDesc;
+#endif
int myeflags;
AssertArg(PortalIsValid(portal));
@@ -563,6 +586,201 @@ PortalStart(Portal portal, ParamListInfo params,
*/
switch (portal->strategy)
{
+#ifdef XCP
+ case PORTAL_DISTRIBUTED:
+ /* No special ability is needed */
+ eflags = 0;
+ /* Must set snapshot before starting executor. */
+ if (use_active_snapshot)
+ PushActiveSnapshot(GetActiveSnapshot());
+ else
+ PushActiveSnapshot(GetTransactionSnapshot());
+
+ /*
+ * Create QueryDesc in portal's context; for the moment, set
+ * the destination to DestNone.
+ */
+ queryDesc = CreateQueryDesc((PlannedStmt *) linitial(portal->stmts),
+ portal->sourceText,
+ GetActiveSnapshot(),
+ InvalidSnapshot,
+ None_Receiver,
+ params,
+ 0);
+ /*
+ * If parent node have sent down parameters, and at least one
+ * of them is PARAM_EXEC we should avoid "single execution"
+ * model. All parent nodes deliver the same values for
+ * PARAM_EXTERN since these values are provided by client and
+ * they are not changed during the query execution.
+ * On the contrary, values of PARAM_EXEC are results of execution
+ * on the parent node and in general different parents send to
+ * this node different values and executions are not equivalent.
+ * Since PARAM_EXECs are always at the end of the list we just
+ * need to check last item to figure out if there are any
+ * PARAM_EXECs.
+ * NB: Check queryDesc->plannedstmt->nParamExec > 0 is incorrect
+ * here since queryDesc->plannedstmt->nParamExec may be used
+ * just to allocate space for them and no actual values passed.
+ */
+ if (queryDesc->plannedstmt->nParamRemote > 0 &&
+ queryDesc->plannedstmt->remoteparams[queryDesc->plannedstmt->nParamRemote-1].paramkind == PARAM_EXEC)
+ {
+ int *consMap;
+ int len;
+ int selfid; /* Node Id of the parent data node */
+ char ntype = PGXC_NODE_DATANODE;
+ ListCell *lc;
+ int i;
+ Locator *locator;
+ Oid keytype;
+ DestReceiver *dest;
+
+ len = list_length(queryDesc->plannedstmt->distributionNodes);
+ consMap = (int *) palloc0(len * sizeof(int));
+ queryDesc->squeue = NULL;
+ queryDesc->myindex = -1;
+ selfid = PGXCNodeGetNodeIdFromName(PGXC_PARENT_NODE,
+ &ntype);
+ i = 0;
+ foreach(lc, queryDesc->plannedstmt->distributionNodes)
+ {
+ if (selfid == lfirst_int(lc))
+ consMap[i] = SQ_CONS_SELF;
+ else
+ consMap[i] = SQ_CONS_NONE;
+ i++;
+ }
+ /*
+ * Multiple executions of the RemoteSubplan may lead to name
+ * conflict of SharedQueue, if the subplan has more
+ * RemoteSubplan nodes in the execution plan tree.
+ * We need to make them unique.
+ */
+ RemoteSubplanMakeUnique(
+ (Node *) queryDesc->plannedstmt->planTree,
+ selfid);
+ /*
+ * Call ExecutorStart to prepare the plan for execution
+ */
+ ExecutorStart(queryDesc, eflags);
+
+ /*
+ * Set up locator if result distribution is requested
+ */
+ keytype = queryDesc->plannedstmt->distributionKey == InvalidAttrNumber ?
+ InvalidOid :
+ queryDesc->tupDesc->attrs[queryDesc->plannedstmt->distributionKey-1]->atttypid;
+ locator = createLocator(
+ queryDesc->plannedstmt->distributionType,
+ RELATION_ACCESS_INSERT,
+ keytype,
+ LOCATOR_LIST_INT,
+ len,
+ consMap,
+ NULL,
+ false);
+ dest = CreateDestReceiver(DestProducer);
+ SetProducerDestReceiverParams(dest,
+ queryDesc->plannedstmt->distributionKey,
+ locator, queryDesc->squeue);
+ queryDesc->dest = dest;
+ }
+ else
+ {
+ int *consMap;
+ int len;
+
+ /* Distributed data requested, bind shared queue for data exchange */
+ len = list_length(queryDesc->plannedstmt->distributionNodes);
+ consMap = (int *) palloc(len * sizeof(int));
+ queryDesc->squeue = SharedQueueBind(portal->name,
+ queryDesc->plannedstmt->distributionRestrict,
+ queryDesc->plannedstmt->distributionNodes,
+ &queryDesc->myindex, consMap);
+ if (queryDesc->myindex == -1)
+ {
+ /* producer */
+ Locator *locator;
+ Oid keytype;
+ DestReceiver *dest;
+
+ PG_TRY();
+ {
+ /*
+ * Call ExecutorStart to prepare the plan for execution
+ */
+ ExecutorStart(queryDesc, eflags);
+ }
+ PG_CATCH();
+ {
+ /* Ensure SharedQueue is released */
+ SharedQueueUnBind(queryDesc->squeue);
+ queryDesc->squeue = NULL;
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ /*
+ * This tells PortalCleanup to shut down the executor
+ */
+ portal->queryDesc = queryDesc;
+
+ /*
+ * Set up locator if result distribution is requested
+ */
+ keytype = queryDesc->plannedstmt->distributionKey == InvalidAttrNumber ?
+ InvalidOid :
+ queryDesc->tupDesc->attrs[queryDesc->plannedstmt->distributionKey-1]->atttypid;
+ locator = createLocator(
+ queryDesc->plannedstmt->distributionType,
+ RELATION_ACCESS_INSERT,
+ keytype,
+ LOCATOR_LIST_INT,
+ len,
+ consMap,
+ NULL,
+ false);
+ dest = CreateDestReceiver(DestProducer);
+ SetProducerDestReceiverParams(dest,
+ queryDesc->plannedstmt->distributionKey,
+ locator, queryDesc->squeue);
+ queryDesc->dest = dest;
+
+ addProducingPortal(portal);
+ }
+ else
+ {
+ /*
+ * We do not need to initialize executor, but need
+ * a tuple descriptor
+ */
+ queryDesc->tupDesc = ExecCleanTypeFromTL(
+ queryDesc->plannedstmt->planTree->targetlist,
+ false);
+ }
+ pfree(consMap);
+ }
+
+ portal->queryDesc = queryDesc;
+
+ /*
+ * Remember tuple descriptor (computed by ExecutorStart)
+ */
+ portal->tupDesc = queryDesc->tupDesc;
+
+ /*
+ * Reset cursor position data to "start of query"
+ */
+ portal->atStart = true;
+ portal->atEnd = false; /* allow fetches */
+ portal->portalPos = 0;
+ portal->posOverflow = false;
+
+ PopActiveSnapshot();
+ break;
+#endif
+
case PORTAL_ONE_SELECT:
/* Must set snapshot before starting executor. */
@@ -678,6 +896,17 @@ PortalStart(Portal portal, ParamListInfo params,
/* Uncaught error while executing portal: mark it dead */
MarkPortalFailed(portal);
+#ifdef XCP
+ if (queryDesc && queryDesc->squeue)
+ {
+ /*
+ * Associate the query desc with the portal so it is unbound upon
+ * transaction end.
+ */
+ portal->queryDesc = queryDesc;
+ }
+#endif
+
/* Restore global vars and propagate error */
ActivePortal = saveActivePortal;
CurrentResourceOwner = saveResourceOwner;
@@ -888,6 +1117,175 @@ PortalRun(Portal portal, long count, bool isTopLevel,
result = true;
break;
+#ifdef XCP
+ case PORTAL_DISTRIBUTED:
+ if (count == FETCH_ALL)
+ count = 0;
+ nprocessed = 0;
+
+ if (portal->queryDesc->myindex == -1)
+ {
+ long oldPos;
+
+ if (portal->queryDesc->squeue)
+ {
+ /* Make sure the producer is advancing */
+ while (count == 0 || nprocessed < count)
+ {
+ if (!portal->queryDesc->estate->es_finished)
+ AdvanceProducingPortal(portal, false);
+ /* make read pointer active */
+ tuplestore_select_read_pointer(portal->holdStore, 1);
+ /* perform reads */
+ nprocessed += RunFromStore(portal,
+ ForwardScanDirection,
+ count ? count - nprocessed : 0,
+ dest);
+ /*
+ * Switch back to the write pointer
+ * We do not want to seek if the tuplestore operates
+ * with a file, so copy pointer before.
+ * Also advancing write pointer would allow to free some
+ * memory.
+ */
+ tuplestore_copy_read_pointer(portal->holdStore, 1, 0);
+ tuplestore_select_read_pointer(portal->holdStore, 0);
+ /* try to release occupied memory */
+ tuplestore_trim(portal->holdStore);
+ /* Break if we can not get more rows */
+ if (portal->queryDesc->estate->es_finished)
+ break;
+ }
+ if (nprocessed > 0)
+ portal->atStart = false; /* OK to go backward now */
+ portal->atEnd = portal->queryDesc->estate->es_finished &&
+ tuplestore_ateof(portal->holdStore);
+ oldPos = portal->portalPos;
+ portal->portalPos += nprocessed;
+ /* portalPos doesn't advance when we fall off the end */
+ if (portal->portalPos < oldPos)
+ portal->posOverflow = true;
+ }
+ else
+ {
+ DestReceiver *olddest;
+
+ Assert(portal->queryDesc->dest->mydest == DestProducer);
+ olddest = SetSelfConsumerDestReceiver(
+ portal->queryDesc->dest, dest);
+ /*
+ * Now fetch desired portion of results.
+ */
+ nprocessed = PortalRunSelect(portal, true, count,
+ portal->queryDesc->dest);
+ SetSelfConsumerDestReceiver(
+ portal->queryDesc->dest, olddest);
+ }
+ }
+ else
+ {
+ QueryDesc *queryDesc = portal->queryDesc;
+ SharedQueue squeue = queryDesc->squeue;
+ int myindex = queryDesc->myindex;
+ TupleTableSlot *slot;
+ long oldPos;
+
+ /*
+ * We are the consumer.
+ * We have skipped plan initialization, hence we do not have
+ * a tuple table to get a slot to receive tuples, so prepare
+ * standalone slot.
+ */
+ slot = MakeSingleTupleTableSlot(queryDesc->tupDesc);
+
+ (*dest->rStartup) (dest, CMD_SELECT, queryDesc->tupDesc);
+
+ /*
+ * Loop until we've processed the proper number of tuples
+ * from the plan.
+ */
+ for (;;)
+ {
+ List *producing = getProducingPortals();
+ bool done;
+
+ /*
+ * Obtain a tuple from the queue.
+ * If the session is running producing cursors it is
+ * not safe to wait for available tuple. Two sessions
+ * may deadlock each other. So if session is producing
+ * it should keep advancing producing cursors.
+ */
+ done = SharedQueueRead(squeue, myindex, slot,
+ list_length(producing) == 0);
+
+ /*
+ * if the tuple is null, then we assume there is nothing
+ * more to process so we end the loop...
+ * Also if null tuple is returned the squeue is reset
+ * already, we want to prevent resetting it again
+ */
+ if (TupIsNull(slot))
+ {
+ if (!done && producing)
+ {
+ /* No data to read, advance producing portals */
+ ListCell *lc = list_head(producing);
+ while (lc)
+ {
+ Portal p = (Portal) lfirst(lc);
+ /* Get reference to next entry before
+ * advancing current portal, because the
+ * function may remove current entry from
+ * the list.
+ */
+ lc = lnext(lc);
+
+ AdvanceProducingPortal(p, false);
+ }
+ continue;
+ }
+ else
+ {
+ queryDesc->squeue = NULL;
+ break;
+ }
+ }
+ /*
+ * Send the tuple
+ */
+ (*dest->receiveSlot) (slot, dest);
+
+ /*
+ * increment the number of processed tuples and check count.
+ * If we've processed the proper number then quit, else
+ * loop again and process more tuples. Zero count means
+ * no limit.
+ */
+ if (count && count == ++nprocessed)
+ break;
+ }
+ (*dest->rShutdown) (dest);
+
+ ExecDropSingleTupleTableSlot(slot);
+
+ if (nprocessed > 0)
+ portal->atStart = false; /* OK to go backward now */
+ if (count == 0 ||
+ (unsigned long) nprocessed < (unsigned long) count)
+ portal->atEnd = true; /* we retrieved 'em all */
+ oldPos = portal->portalPos;
+ portal->portalPos += nprocessed;
+ /* portalPos doesn't advance when we fall off the end */
+ if (portal->portalPos < oldPos)
+ portal->posOverflow = true;
+ }
+ /* Mark portal not active */
+ portal->status = PORTAL_READY;
+ result = portal->atEnd;
+ break;
+#endif
+
default:
elog(ERROR, "unrecognized portal strategy: %d",
(int) portal->strategy);
@@ -1010,6 +1408,7 @@ PortalRunSelect(Portal portal,
PushActiveSnapshot(queryDesc->snapshot);
#ifdef PGXC
+#ifndef XCP
if (portal->name != NULL &&
portal->name[0] != '\0' &&
IsA(queryDesc->planstate, RemoteQueryState))
@@ -1035,6 +1434,7 @@ PortalRunSelect(Portal portal,
rqs->cursor = pstrdup(portal->name);
}
#endif
+#endif
ExecutorRun(queryDesc, direction, count);
nprocessed = queryDesc->estate->es_processed;
@@ -1270,7 +1670,13 @@ PortalRunUtility(Portal portal, Node *utilityStmt, bool isTopLevel,
IsA(utilityStmt, NotifyStmt) ||
IsA(utilityStmt, UnlistenStmt) ||
#ifdef PGXC
+#ifdef XCP
+ IsA(utilityStmt, PauseClusterStmt) ||
+ IsA(utilityStmt, BarrierStmt) ||
+ (IsA(utilityStmt, CheckPointStmt) && IS_PGXC_DATANODE)))
+#else
(IsA(utilityStmt, CheckPointStmt) && IS_PGXC_DATANODE)))
+#endif
#else
IsA(utilityStmt, CheckPointStmt)))
#endif
@@ -1795,3 +2201,355 @@ DoPortalRewind(Portal portal)
portal->portalPos = 0;
portal->posOverflow = false;
}
+
+#ifdef XCP
+/*
+ * Execute the specified portal's query and distribute tuples to consumers.
+ * Returns 1 if the portal should keep producing, 0 if all consumers have enough
+ * rows in the buffers to pause producing temporarily, -1 if the query is
+ * completed.
+ */
+int
+AdvanceProducingPortal(Portal portal, bool can_wait)
+{
+ Portal saveActivePortal;
+ ResourceOwner saveResourceOwner;
+ MemoryContext savePortalContext;
+ MemoryContext oldContext;
+ QueryDesc *queryDesc;
+ SharedQueue squeue;
+ DestReceiver *treceiver;
+ int result;
+
+ queryDesc = PortalGetQueryDesc(portal);
+ Assert(queryDesc);
+ /* Dereference only after the Assert above has validated queryDesc */
+ squeue = queryDesc->squeue;
+ /* Make sure the portal is producing */
+ Assert(squeue && queryDesc->myindex == -1);
+ /* Make sure there is proper receiver */
+ Assert(queryDesc->dest && queryDesc->dest->mydest == DestProducer);
+
+ /*
+ * Set up global portal context pointers.
+ */
+ saveActivePortal = ActivePortal;
+ saveResourceOwner = CurrentResourceOwner;
+ savePortalContext = PortalContext;
+ PG_TRY();
+ {
+ ActivePortal = portal;
+ CurrentResourceOwner = portal->resowner;
+ PortalContext = PortalGetHeapMemory(portal);
+
+ oldContext = MemoryContextSwitchTo(PortalGetHeapMemory(portal));
+
+ /*
+ * That is the first pass thru if the hold store is not initialized yet,
+ * Need to initialize stuff.
+ */
+ if (portal->holdStore == NULL && portal->status != PORTAL_FAILED)
+ {
+ int idx;
+ char storename[64];
+
+ PortalCreateProducerStore(portal);
+ treceiver = CreateDestReceiver(DestTuplestore);
+ SetTuplestoreDestReceiverParams(treceiver,
+ portal->holdStore,
+ portal->holdContext,
+ false);
+ SetSelfConsumerDestReceiver(queryDesc->dest, treceiver);
+ SetProducerTempMemory(queryDesc->dest, portal->tmpContext);
+ snprintf(storename, 64, "%s producer store", portal->name);
+ tuplestore_collect_stat(portal->holdStore, storename);
+ /*
+ * Tuplestore does not clear eof flag on the active read pointer,
+ * causing the store is always in EOF state once reached when
+ * there is a single read pointer. We do not want behavior like this
+ * and workaround by using secondary read pointer.
+ * Primary read pointer (0) is active when we are writing to
+ * the tuple store, secondary read pointer is for reading, and its
+ * eof flag is cleared if a tuple is written to the store.
+ * We know the extra read pointer has index 1, so do not store it.
+ */
+ idx = tuplestore_alloc_read_pointer(portal->holdStore, 0);
+ Assert(idx == 1);
+ }
+
+ if (queryDesc->estate && !queryDesc->estate->es_finished &&
+ portal->status != PORTAL_FAILED)
+ {
+ /*
+ * If the portal's hold store has tuples available for read and
+ * all consumer queues are not empty we skip advancing the portal
+ * (pause it) to prevent buffering too many rows at the producer.
+ * NB just created portal store would not be in EOF state, but in
+ * this case consumer queues will be empty and do not allow
+ * erroneous pause. After the first call to AdvanceProducingPortal
+ * portal will try to read the hold store and EOF flag will be set
+ * correctly.
+ */
+ tuplestore_select_read_pointer(portal->holdStore, 1);
+ if (!tuplestore_ateof(portal->holdStore) &&
+ SharedQueueCanPause(squeue))
+ result = 0;
+ else
+ result = 1;
+ tuplestore_select_read_pointer(portal->holdStore, 0);
+
+ if (result)
+ {
+ /* Execute query and dispatch tuples via dest receiver */
+#define PRODUCE_TUPLES 100
+ PushActiveSnapshot(queryDesc->snapshot);
+ ExecutorRun(queryDesc, ForwardScanDirection, PRODUCE_TUPLES);
+ PopActiveSnapshot();
+
+ if (queryDesc->estate->es_processed < PRODUCE_TUPLES)
+ {
+ /*
+ * Finish the executor, but we may still have some tuples
+ * in the local storages.
+ * We should keep trying pushing them into the squeue, so do not
+ * remove the portal from the list of producers.
+ */
+ ExecutorFinish(queryDesc);
+ }
+ }
+ }
+
+ /* Try to dump local tuplestores */
+ if ((queryDesc->estate == NULL || queryDesc->estate->es_finished) &&
+ ProducerReceiverPushBuffers(queryDesc->dest))
+ {
+ if (can_wait && queryDesc->estate == NULL)
+ {
+ (*queryDesc->dest->rDestroy) (queryDesc->dest);
+ queryDesc->dest = NULL;
+ portal->queryDesc = NULL;
+ squeue = NULL;
+
+ removeProducingPortal(portal);
+ FreeQueryDesc(queryDesc);
+
+ /*
+ * Current context is the portal context, which is going
+ * to be deleted
+ */
+ MemoryContextSwitchTo(TopTransactionContext);
+
+ ActivePortal = saveActivePortal;
+ CurrentResourceOwner = saveResourceOwner;
+ PortalContext = savePortalContext;
+
+ if (portal->resowner)
+ {
+ bool isCommit = (portal->status != PORTAL_FAILED);
+
+ ResourceOwnerRelease(portal->resowner,
+ RESOURCE_RELEASE_BEFORE_LOCKS,
+ isCommit, false);
+ ResourceOwnerRelease(portal->resowner,
+ RESOURCE_RELEASE_LOCKS,
+ isCommit, false);
+ ResourceOwnerRelease(portal->resowner,
+ RESOURCE_RELEASE_AFTER_LOCKS,
+ isCommit, false);
+ ResourceOwnerDelete(portal->resowner);
+ }
+ portal->resowner = NULL;
+
+ /*
+ * Delete tuplestore if present. We should do this even under error
+ * conditions; since the tuplestore would have been using cross-
+ * transaction storage, its temp files need to be explicitly deleted.
+ */
+ if (portal->holdStore)
+ {
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(portal->holdContext);
+ tuplestore_end(portal->holdStore);
+ MemoryContextSwitchTo(oldcontext);
+ portal->holdStore = NULL;
+ }
+
+ /* delete tuplestore storage, if any */
+ if (portal->holdContext)
+ MemoryContextDelete(portal->holdContext);
+
+ /* release subsidiary storage */
+ MemoryContextDelete(PortalGetHeapMemory(portal));
+
+ /* release portal struct (it's in PortalMemory) */
+ pfree(portal);
+ }
+ /* report portal is not producing */
+ result = -1;
+ }
+ else
+ {
+ result = SharedQueueCanPause(queryDesc->squeue) ? 0 : 1;
+ }
+ }
+ PG_CATCH();
+ {
+ /* Uncaught error while executing portal: mark it dead */
+ portal->status = PORTAL_FAILED;
+ /*
+ * Reset producer to allow consumers to finish, so receiving node will
+ * handle the error.
+ */
+ if (squeue)
+ SharedQueueReset(squeue, -1);
+
+ /* Restore global vars and propagate error */
+ ActivePortal = saveActivePortal;
+ CurrentResourceOwner = saveResourceOwner;
+ PortalContext = savePortalContext;
+
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ MemoryContextSwitchTo(oldContext);
+
+ ActivePortal = saveActivePortal;
+ CurrentResourceOwner = saveResourceOwner;
+ PortalContext = savePortalContext;
+
+ return result;
+}
+
+
+/*
+ * Iterate over producing portals, determine which are already closed, and
+ * clean them up, waiting while consumers finish their work. Closed producers
+ * should be cleaned up and their resources released before proceeding with
+ * handling of the next request.
+ */
+void
+cleanupClosedProducers(void)
+{
+ ListCell *lc = list_head(getProducingPortals());
+ while (lc)
+ {
+ Portal p = (Portal) lfirst(lc);
+ QueryDesc *queryDesc = PortalGetQueryDesc(p);
+ SharedQueue squeue = queryDesc->squeue;
+
+ /*
+ * Get next already, because next call may remove cell from
+ * the list and invalidate next reference
+ */
+ lc = lnext(lc);
+
+ /* When portal is closed executor state is not set */
+ if (queryDesc->estate == NULL)
+ {
+ /*
+ * Set up global portal context pointers.
+ */
+ Portal saveActivePortal = ActivePortal;
+ ResourceOwner saveResourceOwner = CurrentResourceOwner;
+ MemoryContext savePortalContext = PortalContext;
+
+ PG_TRY();
+ {
+ MemoryContext oldContext;
+ ActivePortal = p;
+ CurrentResourceOwner = p->resowner;
+ PortalContext = PortalGetHeapMemory(p);
+
+ oldContext = MemoryContextSwitchTo(PortalGetHeapMemory(p));
+
+ (*queryDesc->dest->rDestroy) (queryDesc->dest);
+ queryDesc->dest = NULL;
+ p->queryDesc = NULL;
+ squeue = NULL;
+
+ removeProducingPortal(p);
+ FreeQueryDesc(queryDesc);
+
+ /*
+ * Current context is the portal context, which is going
+ * to be deleted
+ */
+ MemoryContextSwitchTo(TopTransactionContext);
+
+ ActivePortal = saveActivePortal;
+ CurrentResourceOwner = saveResourceOwner;
+ PortalContext = savePortalContext;
+
+ if (p->resowner)
+ {
+ bool isCommit = (p->status != PORTAL_FAILED);
+
+ ResourceOwnerRelease(p->resowner,
+ RESOURCE_RELEASE_BEFORE_LOCKS,
+ isCommit, false);
+ ResourceOwnerRelease(p->resowner,
+ RESOURCE_RELEASE_LOCKS,
+ isCommit, false);
+ ResourceOwnerRelease(p->resowner,
+ RESOURCE_RELEASE_AFTER_LOCKS,
+ isCommit, false);
+ ResourceOwnerDelete(p->resowner);
+ }
+ p->resowner = NULL;
+
+ /*
+ * Delete tuplestore if present. We should do this even under error
+ * conditions; since the tuplestore would have been using cross-
+ * transaction storage, its temp files need to be explicitly deleted.
+ */
+ if (p->holdStore)
+ {
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(p->holdContext);
+ tuplestore_end(p->holdStore);
+ MemoryContextSwitchTo(oldcontext);
+ p->holdStore = NULL;
+ }
+
+ /* delete tuplestore storage, if any */
+ if (p->holdContext)
+ MemoryContextDelete(p->holdContext);
+
+ /* release subsidiary storage */
+ MemoryContextDelete(PortalGetHeapMemory(p));
+
+ /* release portal struct (it's in PortalMemory) */
+ pfree(p);
+
+ MemoryContextSwitchTo(oldContext);
+ }
+ PG_CATCH();
+ {
+ /* Uncaught error while executing portal: mark it dead */
+ p->status = PORTAL_FAILED;
+ /*
+ * Reset producer to allow consumers to finish, so receiving node will
+ * handle the error.
+ */
+ if (squeue)
+ SharedQueueReset(squeue, -1);
+
+ /* Restore global vars and propagate error */
+ ActivePortal = saveActivePortal;
+ CurrentResourceOwner = saveResourceOwner;
+ PortalContext = savePortalContext;
+
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ ActivePortal = saveActivePortal;
+ CurrentResourceOwner = saveResourceOwner;
+ PortalContext = savePortalContext;
+ }
+ }
+}
+#endif
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index eaf510e5f5..cc3daecd62 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -5,6 +5,11 @@
* commands. At one time acted as an interface between the Lisp and C
* systems.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -67,7 +72,7 @@
#include "pgxc/execRemote.h"
#include "pgxc/locator.h"
#include "pgxc/pgxc.h"
-#include "optimizer/pgxcplan.h"
+#include "pgxc/planner.h"
#include "pgxc/poolutils.h"
#include "nodes/nodes.h"
#include "pgxc/poolmgr.h"
@@ -75,8 +80,12 @@
#include "pgxc/groupmgr.h"
#include "utils/lsyscache.h"
#include "utils/rel.h"
+#include "utils/builtins.h"
#include "utils/snapmgr.h"
#include "pgxc/xc_maintenance_mode.h"
+#ifdef XCP
+#include "pgxc/pause.h"
+#endif
static void ExecUtilityStmtOnNodes(const char *queryString, ExecNodes *nodes, bool sentToRemote,
bool force_autocommit, RemoteQueryExecType exec_type,
@@ -89,6 +98,7 @@ static RemoteQueryExecType GetNodesForCommentUtility(CommentStmt *stmt, bool *is
static RemoteQueryExecType GetNodesForRulesUtility(RangeVar *relation, bool *is_temp);
static void DropStmtPreTreatment(DropStmt *stmt, const char *queryString, bool sentToRemote,
bool *is_temp, RemoteQueryExecType *exec_type);
+static bool IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString);
static void ExecUtilityWithMessage(const char *queryString, bool sentToRemote, bool is_temp);
#endif
@@ -389,6 +399,39 @@ standard_ProcessUtility(Node *parsetree,
#endif /* PGXC */
char *completionTag)
{
+#ifdef PGXC
+ /*
+ * For more detail see comments in function pgxc_lock_for_backup.
+ *
+ * Consider the following scenario:
+ * Imagine a two coordinator cluster CO1, CO2
+ * Suppose a client connected to CO1 issues select pgxc_lock_for_backup()
+ * Now assume that a client connected to CO2 issues a create table
+ * select pgxc_lock_for_backup() would try to acquire the advisory lock
+ * in exclusive mode, whereas create table would try to acquire the same
+ * lock in shared mode. Both these requests will always try to acquire the
+ * lock in the same order i.e. they would both direct the request first to
+ * CO1 and then to CO2. One of the two requests would therefore pass
+ * and the other would fail.
+ *
+ * Consider another scenario:
+ * Suppose we have a two coordinator cluster CO1 and CO2
+ * Assume one client connected to each coordinator
+ * Further assume one client starts a transaction
+ * and issues a DDL. This is an unfinished transaction.
+ * Now assume the second client issues
+ * select pgxc_lock_for_backup()
+ * This request would fail because the unfinished transaction
+ * would already hold the advisory lock.
+ */
+ if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && IsNormalProcessingMode())
+ {
+ /* Is the statement a prohibited one? */
+ if (!IsStmtAllowedInLockedMode(parsetree, queryString))
+ pgxc_lock_for_utility_stmt(parsetree);
+ }
+#endif
+
check_xact_readonly(parsetree);
if (completionTag)
@@ -413,6 +456,7 @@ standard_ProcessUtility(Node *parsetree,
case TRANS_STMT_START:
{
ListCell *lc;
+
BeginTransactionBlock();
foreach(lc, stmt->options)
{
@@ -606,8 +650,20 @@ standard_ProcessUtility(Node *parsetree,
#endif
/* Run parse analysis ... */
+#ifdef XCP
+ /*
+ * If sentToRemote is set it is either EXECUTE DIRECT or part
+ * of an extension definition script, that is a kind of extension-
+ * specific metadata table. So it makes sense not to distribute
+ * the relation. If someone is sure they need the table distributed,
+ * they should explicitly specify distribution.
+ */
+ stmts = transformCreateStmt((CreateStmt *) parsetree,
+ queryString, !sentToRemote);
+#else
stmts = transformCreateStmt((CreateStmt *) parsetree,
queryString);
+#endif
#ifdef PGXC
if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
{
@@ -667,8 +723,12 @@ standard_ProcessUtility(Node *parsetree,
* Coordinator, if not already done so
*/
if (!sentToRemote)
+#ifdef XCP
+ stmts = AddRemoteQueryNode(stmts, queryString, is_temp ? EXEC_ON_DATANODES : EXEC_ON_ALL_NODES);
+#else
stmts = AddRemoteQueryNode(stmts, queryString, EXEC_ON_ALL_NODES, is_temp);
#endif
+#endif
/* ... and do it */
foreach(l, stmts)
@@ -681,15 +741,18 @@ standard_ProcessUtility(Node *parsetree,
static char *validnsps[] = HEAP_RELOPT_NAMESPACES;
#ifdef PGXC
+#ifndef XCP
/* Set temporary object object flag in pooler */
if (is_temp)
PoolManagerSetCommand(POOL_CMD_TEMP, NULL);
#endif
+#endif
/* Create the table itself */
relOid = DefineRelation((CreateStmt *) stmt,
RELKIND_RELATION,
InvalidOid);
+
/*
* Let AlterTableCreateToastTable decide if this one
* needs a secondary relation too.
@@ -702,7 +765,6 @@ standard_ProcessUtility(Node *parsetree,
"toast",
validnsps,
true, false);
-
(void) heap_reloptions(RELKIND_TOASTVALUE, toast_options,
true);
@@ -800,7 +862,11 @@ standard_ProcessUtility(Node *parsetree,
#ifdef PGXC
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+#ifdef XCP
+ errmsg("Postgres-XL does not support FOREIGN DATA WRAPPER yet"),
+#else
errmsg("Postgres-XC does not support FOREIGN DATA WRAPPER yet"),
+#endif
errdetail("The feature is not currently supported")));
#endif
CreateForeignDataWrapper((CreateFdwStmt *) parsetree);
@@ -814,7 +880,11 @@ standard_ProcessUtility(Node *parsetree,
#ifdef PGXC
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+#ifdef XCP
+ errmsg("Postgres-XL does not support SERVER yet"),
+#else
errmsg("Postgres-XC does not support SERVER yet"),
+#endif
errdetail("The feature is not currently supported")));
#endif
CreateForeignServer((CreateForeignServerStmt *) parsetree);
@@ -828,7 +898,11 @@ standard_ProcessUtility(Node *parsetree,
#ifdef PGXC
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+#ifdef XCP
+ errmsg("Postgres-XL does not support USER MAPPING yet"),
+#else
errmsg("Postgres-XC does not support USER MAPPING yet"),
+#endif
errdetail("The feature is not currently supported")));
#endif
CreateUserMapping((CreateUserMappingStmt *) parsetree);
@@ -895,15 +969,34 @@ standard_ProcessUtility(Node *parsetree,
break;
case T_TruncateStmt:
+ ExecuteTruncate((TruncateStmt *) parsetree);
#ifdef PGXC
/*
- * In Postgres-XC, TRUNCATE needs to be launched to remote nodes
- * before AFTER triggers. As this needs an internal control it is
- * managed by this function internally.
+ * Check details of the object being truncated.
+ * If at least one temporary table is truncated truncate cannot use 2PC
+ * at commit.
*/
- ExecuteTruncate((TruncateStmt *) parsetree, queryString);
-#else
- ExecuteTruncate((TruncateStmt *) parsetree);
+ if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
+ {
+ bool is_temp = false;
+ ListCell *cell;
+ TruncateStmt *stmt = (TruncateStmt *) parsetree;
+
+ foreach(cell, stmt->relations)
+ {
+ Oid relid;
+ RangeVar *rel = (RangeVar *) lfirst(cell);
+
+ relid = RangeVarGetRelid(rel, NoLock, false);
+ if (IsTempTable(relid))
+ {
+ is_temp = true;
+ break;
+ }
+ }
+
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_DATANODES, is_temp);
+ }
#endif
break;
@@ -929,6 +1022,7 @@ standard_ProcessUtility(Node *parsetree,
case T_CopyStmt:
{
uint64 processed;
+
processed = DoCopy((CopyStmt *) parsetree, queryString);
if (completionTag)
snprintf(completionTag, COMPLETION_TAG_BUFSIZE,
@@ -1056,7 +1150,7 @@ standard_ProcessUtility(Node *parsetree,
{
AlterTableStmt *atstmt = (AlterTableStmt *) parsetree;
Oid relid;
- List *stmts = NIL;
+ List *stmts;
ListCell *l;
LOCKMODE lockmode;
@@ -1092,7 +1186,11 @@ standard_ProcessUtility(Node *parsetree,
relid,
&is_temp);
+#ifdef XCP
+ stmts = AddRemoteQueryNode(stmts, queryString, exec_type);
+#else
stmts = AddRemoteQueryNode(stmts, queryString, exec_type, is_temp);
+#endif
}
}
#endif
@@ -1367,7 +1465,13 @@ standard_ProcessUtility(Node *parsetree,
#ifdef PGXC
if (IS_PGXC_COORDINATOR)
{
+#ifdef XCP
+ ViewStmt *stmt = (ViewStmt *) parsetree;
+
+ if (stmt->view->relpersistence != RELPERSISTENCE_TEMP)
+#else
if (!ExecIsTempObjectIncluded())
+#endif
ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_COORDS, false);
}
#endif
@@ -1479,9 +1583,11 @@ standard_ProcessUtility(Node *parsetree,
{
bool is_temp = stmt->sequence->relpersistence == RELPERSISTENCE_TEMP;
+#ifndef XCP
/* Set temporary object flag in pooler */
if (is_temp)
PoolManagerSetCommand(POOL_CMD_TEMP, NULL);
+#endif
ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, is_temp);
}
@@ -1572,7 +1678,11 @@ standard_ProcessUtility(Node *parsetree,
/* Clean also remote Coordinators */
sprintf(query, "CLEAN CONNECTION TO ALL FOR DATABASE %s;", stmt->dbname);
+#ifdef XCP
+ ExecUtilityStmtOnNodes(query, NULL, sentToRemote, true, EXEC_ON_ALL_NODES, false);
+#else
ExecUtilityStmtOnNodes(query, NULL, sentToRemote, true, EXEC_ON_COORDS, false);
+#endif
}
#endif
@@ -1651,12 +1761,12 @@ standard_ProcessUtility(Node *parsetree,
/* we choose to allow this during "read only" transactions */
PreventCommandDuringRecovery("VACUUM");
#ifdef PGXC
- /*
- * We have to run the command on nodes before Coordinator because
- * vacuum() pops active snapshot and we can not send it to nodes
- */
- if (IS_PGXC_COORDINATOR)
- ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, true, EXEC_ON_DATANODES, false);
+ /*
+ * We have to run the command on nodes before Coordinator because
+ * vacuum() pops active snapshot and we can not send it to nodes
+ */
+ if (IS_PGXC_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, true, EXEC_ON_DATANODES, false);
#endif
vacuum((VacuumStmt *) parsetree, InvalidOid, true, NULL, false,
isTopLevel);
@@ -1674,6 +1784,7 @@ standard_ProcessUtility(Node *parsetree,
case T_VariableSetStmt:
ExecSetVariableStmt((VariableSetStmt *) parsetree);
#ifdef PGXC
+#ifndef XCP
/* Let the pooler manage the statement */
if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
{
@@ -1697,6 +1808,7 @@ standard_ProcessUtility(Node *parsetree,
}
}
#endif
+#endif
break;
case T_VariableShowStmt:
@@ -1726,25 +1838,18 @@ standard_ProcessUtility(Node *parsetree,
(void) CreateTrigger((CreateTrigStmt *) parsetree, queryString,
InvalidOid, InvalidOid, false);
#ifdef PGXC
- if (IS_PGXC_COORDINATOR)
- {
- CreateTrigStmt *stmt = (CreateTrigStmt *) parsetree;
- RemoteQueryExecType exec_type;
- bool is_temp;
-
- /* Postgres-XC does not support yet FOR EACH ROW yet */
- if (stmt->row)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("Postgres-XC does not support ROW TRIGGER yet"),
- errdetail("The feature is not currently supported")));
-
- exec_type = ExecUtilityFindNodes(OBJECT_TABLE,
- RangeVarGetRelid(stmt->relation, NoLock, false),
- &is_temp);
+ /* Postgres-XC does not support triggers yet */
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+#ifdef XCP
+ errmsg("Postgres-XL does not support TRIGGER yet"),
+#else
+ errmsg("Postgres-XC does not support TRIGGER yet"),
+#endif
+ errdetail("The feature is not currently supported")));
- ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, exec_type, is_temp);
- }
+ if (IS_PGXC_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
#endif
break;
@@ -1835,6 +1940,18 @@ standard_ProcessUtility(Node *parsetree,
case T_ConstraintsSetStmt:
AfterTriggerSetState((ConstraintsSetStmt *) parsetree);
#ifdef PGXC
+#ifdef XCP
+ /*
+ * Just send statement to all the datanodes. It is effectively noop
+ * if no transaction, because transaction will be committed and
+ * changes will be cleared after completion.
+ * Side effect of that command is that session takes a connection
+ * to each Datanode and holds it while transaction lasts, even if
+ * subsequent statements won't use some of them.
+ */
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false,
+ EXEC_ON_DATANODES, false);
+#else
/*
* Let the pooler manage the statement, SET CONSTRAINT can just be used
* inside a transaction block, hence it has no effect outside that, so use
@@ -1846,6 +1963,7 @@ standard_ProcessUtility(Node *parsetree,
elog(ERROR, "Postgres-XC: ERROR SET query");
}
#endif
+#endif
break;
case T_CheckPointStmt:
@@ -1872,6 +1990,12 @@ standard_ProcessUtility(Node *parsetree,
case T_BarrierStmt:
RequestBarrier(((BarrierStmt *) parsetree)->id, completionTag);
break;
+#ifdef XCP
+ case T_PauseClusterStmt:
+ RequestClusterPause(((PauseClusterStmt *) parsetree)->pause, completionTag);
+ break;
+#endif
+
/*
* Node DDL is an operation local to Coordinator.
@@ -1880,6 +2004,10 @@ standard_ProcessUtility(Node *parsetree,
*/
case T_AlterNodeStmt:
PgxcNodeAlter((AlterNodeStmt *) parsetree);
+#ifdef XCP
+ if (((AlterNodeStmt *) parsetree)->cluster)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, false, EXEC_ON_ALL_NODES, false);
+#endif
break;
case T_CreateNodeStmt:
@@ -2007,11 +2135,21 @@ standard_ProcessUtility(Node *parsetree,
break;
case T_CleanConnStmt:
- Assert(IS_PGXC_COORDINATOR);
+#ifdef XCP
+ /*
+ * First send command to other nodes via probably existing
+ * connections, then clean local pooler
+ */
+ if (IS_PGXC_COORDINATOR)
+ ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, true, EXEC_ON_ALL_NODES, false);
+ CleanConnection((CleanConnStmt *) parsetree);
+#else
+ Assert(IS_PGXC_COORDINATOR);
CleanConnection((CleanConnStmt *) parsetree);
if (IS_PGXC_COORDINATOR)
ExecUtilityStmtOnNodes(queryString, NULL, sentToRemote, true, EXEC_ON_COORDS, false);
+#endif
break;
#endif
default:
@@ -2093,7 +2231,9 @@ ExecUtilityStmtOnNodes(const char *queryString, ExecNodes *nodes, bool sentToRem
step->sql_statement = pstrdup(queryString);
step->force_autocommit = force_autocommit;
step->exec_type = exec_type;
+#ifndef XCP
step->is_temp = is_temp;
+#endif
ExecRemoteUtility(step);
pfree(step->sql_statement);
pfree(step);
@@ -2124,9 +2264,7 @@ ExecUtilityFindNodes(ObjectType object_type,
exec_type = EXEC_ON_ALL_NODES;
break;
- /* Triggers are evaluated based on the relation they are defined on */
case OBJECT_TABLE:
- case OBJECT_TRIGGER:
/* Do the check on relation kind */
exec_type = ExecUtilityFindNodesRelkind(object_id, is_temp);
break;
@@ -2176,13 +2314,21 @@ ExecUtilityFindNodesRelkind(Oid relid, bool *is_temp)
switch (relkind_str)
{
case RELKIND_SEQUENCE:
+#ifndef XCP
*is_temp = IsTempTable(relid);
exec_type = EXEC_ON_ALL_NODES;
break;
-
+#endif
case RELKIND_RELATION:
+#ifdef XCP
+ if ((*is_temp = IsTempTable(relid)))
+ exec_type = EXEC_ON_DATANODES;
+ else
+ exec_type = EXEC_ON_ALL_NODES;
+#else
*is_temp = IsTempTable(relid);
exec_type = EXEC_ON_ALL_NODES;
+#endif
break;
case RELKIND_VIEW:
@@ -3050,6 +3196,12 @@ CreateCommandTag(Node *parsetree)
case T_DropGroupStmt:
tag = "DROP NODE GROUP";
break;
+
+#ifdef XCP
+ case T_PauseClusterStmt:
+ tag = "PAUSE/UNPAUSE CLUSTER";
+ break;
+#endif
#endif
case T_ReindexStmt:
@@ -3202,12 +3354,14 @@ CreateCommandTag(Node *parsetree)
}
break;
+#ifdef PGXC
case T_ExecDirectStmt:
tag = "EXECUTE DIRECT";
break;
case T_CleanConnStmt:
tag = "CLEAN CONNECTION";
break;
+#endif
default:
elog(WARNING, "unrecognized node type: %d",
@@ -3651,7 +3805,18 @@ GetCommandLogLevel(Node *parsetree)
lev = LOGSTMT_DDL;
break;
#endif
-
+#ifdef XCP
+ case T_AlterNodeStmt:
+ case T_CreateNodeStmt:
+ case T_DropNodeStmt:
+ case T_CreateGroupStmt:
+ case T_DropGroupStmt:
+ lev = LOGSTMT_DDL;
+ break;
+ case T_ExecDirectStmt:
+ lev = LOGSTMT_ALL;
+ break;
+#endif
default:
elog(WARNING, "unrecognized node type: %d",
(int) nodeTag(parsetree));
@@ -3664,6 +3829,97 @@ GetCommandLogLevel(Node *parsetree)
#ifdef PGXC
/*
+ * IsStmtAllowedInLockedMode
+ *
+ * Allow/Disallow a utility command while cluster is locked
+ * A statement will be disallowed if it makes such changes
+ * in catalog that are backed up by pg_dump except
+ * CREATE NODE that has to be allowed because
+ * a new node has to be created while the cluster is still
+ * locked for backup
+ */
+static bool
+IsStmtAllowedInLockedMode(Node *parsetree, const char *queryString)
+{
+#define ALLOW 1
+#define DISALLOW 0
+
+ switch (nodeTag(parsetree))
+ {
+ /* To allow creation of temp tables */
+ case T_CreateStmt: /* CREATE TABLE */
+ {
+ CreateStmt *stmt = (CreateStmt *) parsetree;
+ if (stmt->relation->relpersistence == RELPERSISTENCE_TEMP)
+ return ALLOW;
+ return DISALLOW;
+ }
+ break;
+
+ case T_ExecuteStmt: /*
+ * Prepared statements can only have
+ * SELECT, INSERT, UPDATE, DELETE,
+ * or VALUES statement, there is no
+ * point stopping EXECUTE.
+ */
+ case T_CreateNodeStmt: /*
+ * This has to be allowed so that the new node
+ * can be created, while the cluster is still
+ * locked for backup
+ */
+ case T_DropNodeStmt: /*
+ * This has to be allowed so that DROP NODE
+ * can be issued to drop a node that has crashed.
+ * Otherwise system would try to acquire a shared
+ * advisory lock on the crashed node.
+ */
+
+ case T_TransactionStmt:
+ case T_PlannedStmt:
+ case T_ClosePortalStmt:
+ case T_FetchStmt:
+ case T_TruncateStmt:
+ case T_CopyStmt:
+ case T_PrepareStmt: /*
+ * Prepared statements can only have
+ * SELECT, INSERT, UPDATE, DELETE,
+ * or VALUES statement, there is no
+ * point stopping PREPARE.
+ */
+ case T_DeallocateStmt: /*
+ * If prepare is allowed the deallocate should
+ * be allowed also
+ */
+ case T_DoStmt:
+ case T_NotifyStmt:
+ case T_ListenStmt:
+ case T_UnlistenStmt:
+ case T_LoadStmt:
+ case T_ClusterStmt:
+ case T_VacuumStmt:
+ case T_ExplainStmt:
+ case T_VariableSetStmt:
+ case T_VariableShowStmt:
+ case T_DiscardStmt:
+ case T_LockStmt:
+ case T_ConstraintsSetStmt:
+ case T_CheckPointStmt:
+ case T_BarrierStmt:
+ case T_ReindexStmt:
+ case T_RemoteQuery:
+ case T_CleanConnStmt:
+#ifdef XCP
+ case T_PauseClusterStmt:
+#endif
+ return ALLOW;
+
+ default:
+ return DISALLOW;
+ }
+ return DISALLOW;
+}
+
+/*
* GetCommentObjectId
* TODO Change to return the nodes to execute the utility on
*
@@ -3831,18 +4087,17 @@ DropStmtPreTreatment(DropStmt *stmt, const char *queryString, bool sentToRemote,
}
break;
- /*
- * Those objects are dropped depending on the nature of the relationss
- * they are defined on. This evaluation uses the temporary behavior
- * and the relkind of the relation used.
- */
case OBJECT_RULE:
- case OBJECT_TRIGGER:
{
+ /*
+ * In the case of a rule we need to find the object on
+ * which the rule is dependent and define if this rule
+ * has a dependency with a temporary object or not.
+ */
List *objname = linitial(stmt->objects);
Relation relation = NULL;
- get_object_address(stmt->removeType,
+ get_object_address(OBJECT_RULE,
objname, NIL,
&relation,
AccessExclusiveLock,
@@ -3850,7 +4105,7 @@ DropStmtPreTreatment(DropStmt *stmt, const char *queryString, bool sentToRemote,
/* Do nothing if no relation */
if (relation && OidIsValid(relation->rd_id))
- res_exec_type = ExecUtilityFindNodes(stmt->removeType,
+ res_exec_type = ExecUtilityFindNodes(OBJECT_RULE,
relation->rd_id,
&res_is_temp);
else
diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c
index 5582a06c7f..e33e33f67a 100644
--- a/src/backend/utils/adt/arrayfuncs.c
+++ b/src/backend/utils/adt/arrayfuncs.c
@@ -3,6 +3,11 @@
* arrayfuncs.c
* Support functions for arrays.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -161,6 +166,40 @@ array_in(PG_FUNCTION_ARGS)
lBound[MAXDIM];
ArrayMetaState *my_extra;
+#ifdef XCP
+ /* Make a modifiable copy of the input */
+ string_save = pstrdup(string);
+ if (*string_save == '(')
+ {
+ /*
+ * String representation contains prefix defining data type of array
+ * elements, if array has been output as anyarray.
+ */
+ char *typnspname;
+ char *typname;
+
+ /* Type namespace is started after '(' and terminated by a '.' */
+ typnspname = string_save + 1;
+ for (p = typnspname; *p != '.'; p++)
+ if (*p == ')' || *p == '\0') /* dot not found */
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+ errmsg("invalid element type definition")));
+ /* it is OK to modify the copy */
+ *p = '\0';
+ typname = p + 1;
+ for (p = typname; *p != ')'; p++)
+ if (*p == '\0') /* closing paren not found */
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+ errmsg("invalid element type definition")));
+ *p = '\0';
+ p++;
+ element_type = get_typname_typid(typname, get_namespaceid(typnspname));
+ }
+ else
+ p = string_save;
+#endif
/*
* We arrange to look up info about element type, including its input
* conversion proc, only once per series of calls, assuming the element
@@ -194,6 +233,7 @@ array_in(PG_FUNCTION_ARGS)
typdelim = my_extra->typdelim;
typioparam = my_extra->typioparam;
+#ifndef XCP
/* Make a modifiable copy of the input */
string_save = pstrdup(string);
@@ -206,6 +246,7 @@ array_in(PG_FUNCTION_ARGS)
* outer loop iterates once per dimension item.
*/
p = string_save;
+#endif
ndim = 0;
for (;;)
{
diff --git a/src/backend/utils/adt/date.c b/src/backend/utils/adt/date.c
index 6e29ebb784..03024cc242 100644
--- a/src/backend/utils/adt/date.c
+++ b/src/backend/utils/adt/date.c
@@ -3,6 +3,11 @@
* date.c
* implements DATE and TIME data types specified in SQL-92 standard
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994-5, Regents of the University of California
*
@@ -24,6 +29,9 @@
#include "libpq/pqformat.h"
#include "miscadmin.h"
#include "parser/scansup.h"
+#ifdef XCP
+#include "pgxc/pgxc.h"
+#endif
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/date.h"
@@ -191,7 +199,15 @@ date_out(PG_FUNCTION_ARGS)
{
j2date(date + POSTGRES_EPOCH_JDATE,
&(tm->tm_year), &(tm->tm_mon), &(tm->tm_mday));
+#ifdef XCP
+ /*
+ * We want other nodes to be able to parse encoded dates correctly.
+ * ISO date style is best suited for that.
+ */
+ EncodeDateOnly(tm, IS_PGXC_DATANODE ? USE_ISO_DATES : DateStyle, buf);
+#else
EncodeDateOnly(tm, DateStyle, buf);
+#endif
}
result = pstrdup(buf);
diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c
index 78fc657207..35e171017e 100644
--- a/src/backend/utils/adt/dbsize.c
+++ b/src/backend/utils/adt/dbsize.c
@@ -2,6 +2,11 @@
* dbsize.c
* Database object size functions, and related inquiries
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Copyright (c) 2002-2012, PostgreSQL Global Development Group
*
* IDENTIFICATION
@@ -36,6 +41,14 @@
#include "utils/relmapper.h"
#include "utils/lsyscache.h"
#include "utils/syscache.h"
+#ifdef XCP
+#include "catalog/pg_type.h"
+#include "catalog/pgxc_node.h"
+#include "executor/executor.h"
+#include "nodes/makefuncs.h"
+#include "pgxc/execRemote.h"
+#include "utils/snapmgr.h"
+#endif
#ifdef PGXC
static Datum pgxc_database_size(Oid dbOid);
@@ -885,7 +898,11 @@ pg_relation_filepath(PG_FUNCTION_ARGS)
break;
case RELPERSISTENCE_TEMP:
if (isTempOrToastNamespace(relform->relnamespace))
+#ifdef XCP
+ backend = OidIsValid(MyCoordId) ? InvalidBackendId : MyBackendId;
+#else
backend = MyBackendId;
+#endif
else
{
/* Do it the hard way. */
@@ -971,16 +988,72 @@ pgxc_database_size(Oid dbOid)
Datum
pgxc_execute_on_nodes(int numnodes, Oid *nodelist, char *query)
{
+#ifndef XCP
StringInfoData buf;
int ret;
TupleDesc spi_tupdesc;
+#endif
int i;
int64 total_size = 0;
int64 size = 0;
+#ifndef XCP
bool isnull;
char *nodename;
+#endif
Datum datum;
+#ifdef XCP
+ EState *estate;
+ MemoryContext oldcontext;
+ RemoteQuery *plan;
+ RemoteQueryState *pstate;
+ TupleTableSlot *result;
+ Var *dummy;
+
+ /*
+ * Make up RemoteQuery plan node
+ */
+ plan = makeNode(RemoteQuery);
+ plan->combine_type = COMBINE_TYPE_NONE;
+ plan->exec_nodes = makeNode(ExecNodes);
+ for (i = 0; i < numnodes; i++)
+ {
+ char ntype = PGXC_NODE_NONE;
+ plan->exec_nodes->nodeList = lappend_int(plan->exec_nodes->nodeList,
+ PGXCNodeGetNodeId(nodelist[i], &ntype));
+ if (ntype == PGXC_NODE_NONE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Unknown node Oid: %u", nodelist[i])));
+ }
+ plan->sql_statement = query;
+ plan->force_autocommit = false;
+ plan->exec_type = EXEC_ON_DATANODES;
+ /*
+ * We only need the target entry to determine result data type.
+ * So create dummy even if real expression is a function.
+ */
+ dummy = makeVar(1, 1, INT8OID, 0, InvalidOid, 0);
+ plan->scan.plan.targetlist = lappend(plan->scan.plan.targetlist,
+ makeTargetEntry((Expr *) dummy, 1, NULL, false));
+ /* prepare to execute */
+ estate = CreateExecutorState();
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+ estate->es_snapshot = GetActiveSnapshot();
+ pstate = ExecInitRemoteQuery(plan, estate, 0);
+ MemoryContextSwitchTo(oldcontext);
+
+ result = ExecRemoteQuery(pstate);
+ while (!TupIsNull(result))
+ {
+ bool isnull;
+ datum = slot_getattr(result, 1, &isnull);
+ size = DatumGetInt64(datum);
+ total_size += size;
+ result = ExecRemoteQuery(pstate);
+ }
+ ExecEndRemoteQuery(pstate);
+#else
/*
* Connect to SPI manager
*/
@@ -1022,6 +1095,7 @@ pgxc_execute_on_nodes(int numnodes, Oid *nodelist, char *query)
}
SPI_finish();
+#endif
if (numnodes == 1)
PG_RETURN_DATUM(datum);
diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c
index 21d26def79..1c05301c7c 100644
--- a/src/backend/utils/adt/lockfuncs.c
+++ b/src/backend/utils/adt/lockfuncs.c
@@ -19,6 +19,8 @@
#include "pgxc/pgxc.h"
#include "pgxc/pgxcnode.h"
#include "pgxc/nodemgr.h"
+#include "executor/spi.h"
+#include "tcop/utility.h"
#endif
#include "storage/predicate_internals.h"
#include "utils/builtins.h"
@@ -1061,3 +1063,128 @@ pg_advisory_unlock_all(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
+
+#ifdef PGXC
+/*
+ * pgxc_lock_for_backup
+ *
+ * Lock the cluster for taking backup
+ * To lock the cluster, try to acquire a session level advisory lock exclusively.
+ * By lock we mean to disallow any statements that change
+ * the portions of the catalog which are backed up by pg_dump/pg_dumpall
+ * Returns true or fails with an error message.
+ */
+Datum
+pgxc_lock_for_backup(PG_FUNCTION_ARGS)
+{
+ bool lockAcquired = false;
+ int prepared_xact_count = 0;
+
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("only superuser can lock the cluster for backup")));
+
+ /*
+ * The system cannot be locked for backup if there is an uncommitted
+ * prepared transaction, the reason is as follows:
+ * Utility statements are divided into two groups, one is allowed group
+ * and the other is disallowed group. A statement is put in allowed group
+ * if it does not make changes to the catalog or makes such changes which
+ * are not backed up by pg_dump or pg_dumpall, otherwise it is put in
+ * disallowed group. Every time a disallowed statement is issued we try to
+ * hold an advisory lock in shared mode and if the lock can be acquired
+ * only then the statement is allowed.
+ * In case of prepared transactions suppose the lock is not released at
+ * prepare transaction 'txn_id'
+ * Consider the following scenario:
+ *
+ * begin;
+ * create table abc_def(a int, b int);
+ * insert into abc_def values(1,2),(3,4);
+ * prepare transaction 'abc';
+ *
+ * Now assume that the server is restarted for any reason.
+ * When prepared transactions are saved on disk, session level locks are
+ * ignored; hence when the prepared transactions are retrieved, all
+ * the other locks are reclaimed, but session level advisory locks are
+ * not reclaimed.
+ * Hence we made the following decisions
+ * a) Transaction level advisory locks should be used for DDLs which are
+ * automatically released at prepare transaction 'txn_id'
+ * b) If there is any uncommitted prepared transaction, it is assumed
+ * that it must be issuing a statement that belongs to disallowed
+ * group and hence the request to hold the advisory lock exclusively
+ * is denied.
+ */
+
+ /* Connect to SPI manager to check any prepared transactions */
+ if (SPI_connect() < 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_CONNECTION_EXCEPTION),
+ errmsg("internal error while locking the cluster for backup")));
+ }
+
+ /* Are there any prepared transactions that have not yet been committed? */
+ SPI_execute("select gid from pg_catalog.pg_prepared_xacts limit 1", true, 0);
+ prepared_xact_count = SPI_processed;
+ SPI_finish();
+
+ if (prepared_xact_count > 0)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("cannot lock cluster for backup in presence of %d uncommitted prepared transactions",
+ prepared_xact_count)));
+ }
+
+ /* try to acquire the advisory lock in exclusive mode */
+ lockAcquired = DatumGetBool(DirectFunctionCall2(
+ pg_try_advisory_lock_int4,
+ xc_lockForBackupKey1,
+ xc_lockForBackupKey2));
+
+ if (!lockAcquired)
+ ereport(ERROR,
+ (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
+ errmsg("cannot lock cluster for backup, lock is already held")));
+
+ /*
+ * session level advisory locks stay for only as long as the session
+ * that issues them does
+ */
+ elog(INFO, "please do not close this session until you are done adding the new node");
+
+ /* will be true always */
+ PG_RETURN_BOOL(lockAcquired);
+}
+
+/*
+ * pgxc_lock_for_utility_stmt
+ *
+ * Check whether the cluster has been locked for backup by trying to
+ * acquire, in shared mode, the transaction level advisory lock used
+ * for backup locking. Fails with an error if the lock cannot be
+ * acquired (i.e. the cluster is locked for backup); returns true
+ * otherwise.
+ */
+bool
+pgxc_lock_for_utility_stmt(Node *parsetree)
+{
+ bool lockAcquired;
+
+ lockAcquired = DatumGetBool(DirectFunctionCall2(
+ pg_try_advisory_xact_lock_shared_int4,
+ xc_lockForBackupKey1,
+ xc_lockForBackupKey2));
+
+ if (!lockAcquired)
+ ereport(ERROR,
+ (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION),
+ errmsg("cannot execute %s in a locked cluster",
+ CreateCommandTag(parsetree))));
+
+ return lockAcquired;
+}
+#endif
diff --git a/src/backend/utils/adt/pseudotypes.c b/src/backend/utils/adt/pseudotypes.c
index d7770b829a..39e84bf863 100644
--- a/src/backend/utils/adt/pseudotypes.c
+++ b/src/backend/utils/adt/pseudotypes.c
@@ -11,6 +11,11 @@
* we do better?)
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -26,7 +31,12 @@
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/rangetypes.h"
-
+#ifdef XCP
+#include "access/htup.h"
+#include "catalog/pg_type.h"
+#include "utils/lsyscache.h"
+#include "utils/syscache.h"
+#endif
/*
* cstring_in - input routine for pseudo-type CSTRING.
@@ -117,22 +127,80 @@ any_out(PG_FUNCTION_ARGS)
Datum
anyarray_in(PG_FUNCTION_ARGS)
{
+#ifdef XCP
+ /*
+ * XCP version of array_in() understands prefix describing element type
+ */
+ return array_in(fcinfo);
+#else
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot accept a value of type anyarray")));
PG_RETURN_VOID(); /* keep compiler quiet */
+#endif
}
/*
* anyarray_out - output routine for pseudo-type ANYARRAY.
*
* We may as well allow this, since array_out will in fact work.
+ * XCP needs to send from data nodes to coordinator values of that type.
+ * To be able to restore values at the destination node we need to know
+ * actual element type.
*/
Datum
anyarray_out(PG_FUNCTION_ARGS)
{
+#ifdef XCP
+ /*
+ * Output prefix: (type_namespace_name.typename) to look up actual element
+ * type at the destination node then output in usual format for array
+ */
+ ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
+ Oid element_type = ARR_ELEMTYPE(v);
+ Form_pg_type typeForm;
+ HeapTuple typeTuple;
+ char *typname,
+ *typnspname;
+ /* two identifiers, parenthesis, dot and trailing \0 */
+ char prefix[2*NAMEDATALEN+4],
+ *retval,
+ *newval;
+ int prefixlen, retvallen;
+ Datum array_out_result;
+ MemoryContext save_context;
+
+ save_context = MemoryContextSwitchTo(fcinfo->flinfo->fn_mcxt);
+ /* Figure out type name and type namespace */
+ typeTuple = SearchSysCache(TYPEOID,
+ ObjectIdGetDatum(element_type),
+ 0, 0, 0);
+ if (!HeapTupleIsValid(typeTuple))
+ elog(ERROR, "cache lookup failed for type %u", element_type);
+ typeForm = (Form_pg_type) GETSTRUCT(typeTuple);
+ typname = NameStr(typeForm->typname);
+ typnspname = get_namespace_name(typeForm->typnamespace);
+
+ sprintf(prefix, "(%s.%s)", typnspname, typname);
+ ReleaseSysCache(typeTuple);
+ MemoryContextSwitchTo(save_context);
+
+ /* Get standard output and make up prefixed result */
+ array_out_result = array_out(fcinfo);
+ retval = DatumGetCString(array_out_result);
+ prefixlen = strlen(prefix);
+ retvallen = strlen(retval);
+ newval = (char *) palloc(prefixlen + retvallen + 1);
+ strcpy(newval, prefix);
+ strcpy(newval + prefixlen, retval);
+
+ pfree(retval);
+
+ PG_RETURN_CSTRING(newval);
+#else
return array_out(fcinfo);
+#endif
}
/*
diff --git a/src/backend/utils/adt/ri_triggers.c b/src/backend/utils/adt/ri_triggers.c
index 721f2d7e65..f4d06305e0 100644
--- a/src/backend/utils/adt/ri_triggers.c
+++ b/src/backend/utils/adt/ri_triggers.c
@@ -13,6 +13,11 @@
* plan --- consider improving this someday.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
*
* src/backend/utils/adt/ri_triggers.c
@@ -267,7 +272,7 @@ RI_FKey_check(PG_FUNCTION_ARGS)
int i;
#ifdef PGXC
- /*
+ /*
* Referential integrity is not supported on Coordinator as it has no data, so
* we just come out of the function without actually performing any integrity checks.
*/
@@ -2646,7 +2651,9 @@ RI_Initial_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
const char *sep;
int i;
int save_nestlevel;
+#ifndef XCP
char workmembuf[32];
+#endif
int spi_result;
SPIPlanPtr qplan;
@@ -2790,10 +2797,17 @@ RI_Initial_Check(Trigger *trigger, Relation fk_rel, Relation pk_rel)
*/
save_nestlevel = NewGUCNestLevel();
+#ifndef XCP
+ /*
+ * In multitenant extension we restrict permission on work_mem.
+ * This code may be executed by ordinary user, so skip this optimization.
+ * XXX look for workaround
+ */
snprintf(workmembuf, sizeof(workmembuf), "%d", maintenance_work_mem);
(void) set_config_option("work_mem", workmembuf,
PGC_USERSET, PGC_S_SESSION,
GUC_ACTION_SAVE, true, 0);
+#endif
if (SPI_connect() != SPI_OK_CONNECT)
elog(ERROR, "SPI_connect failed");
diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c
index ad3fbb3b4b..7fbaeef351 100644
--- a/src/backend/utils/adt/ruleutils.c
+++ b/src/backend/utils/adt/ruleutils.c
@@ -4,6 +4,11 @@
* Functions to convert stored expressions/querytrees back to
* source text
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -56,7 +61,7 @@
#include "parser/parsetree.h"
#ifdef PGXC
#include "pgxc/pgxc.h"
-#include "optimizer/pgxcplan.h"
+#include "pgxc/planner.h"
#endif
#include "rewrite/rewriteHandler.h"
#include "rewrite/rewriteManip.h"
@@ -71,6 +76,7 @@
#include "utils/typcache.h"
#include "utils/xml.h"
+
/* ----------
* Pretty formatting constants
* ----------
@@ -109,10 +115,9 @@ typedef struct
int indentLevel; /* current indent level for prettyprint */
bool varprefix; /* TRUE to print prefixes on Vars */
#ifdef PGXC
+#ifndef XCP
bool finalise_aggs; /* should Datanode finalise the aggregates? */
- bool sortgroup_colno;/* instead of expression use resno for
- * sortgrouprefs.
- */
+#endif /* XCP */
#endif /* PGXC */
} deparse_context;
@@ -210,11 +215,7 @@ static void make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc,
static void make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc,
int prettyFlags);
static void get_query_def(Query *query, StringInfo buf, List *parentnamespace,
- TupleDesc resultDesc, int prettyFlags, int startIndent
-#ifdef PGXC
- , bool finalise_aggregates, bool sortgroup_colno
-#endif /* PGXC */
- );
+ TupleDesc resultDesc, int prettyFlags, int startIndent);
static void get_values_def(List *values_lists, deparse_context *context);
static void get_with_clause(Query *query, deparse_context *context);
static void get_select_query_def(Query *query, deparse_context *context,
@@ -729,7 +730,9 @@ pg_get_triggerdef_worker(Oid trigid, bool pretty)
context.varprefix = true;
context.prettyFlags = pretty ? PRETTYFLAG_PAREN : 0;
#ifdef PGXC
+#ifndef XCP
context.finalise_aggs = false;
+#endif /* XCP */
#endif /* PGXC */
context.indentLevel = PRETTYINDENT_STD;
@@ -2179,7 +2182,9 @@ deparse_expression_pretty(Node *expr, List *dpcontext,
context.varprefix = forceprefix;
context.prettyFlags = prettyFlags;
#ifdef PGXC
+#ifndef XCP
context.finalise_aggs = false;
+#endif /* XCP */
#endif /* PGXC */
context.indentLevel = startIndent;
@@ -2224,6 +2229,36 @@ deparse_context_for(const char *aliasname, Oid relid)
return list_make1(dpns);
}
+#ifdef PGXC
+List *
+deparse_context_for_remotequery(Alias *aliasname, Oid relid)
+{
+ deparse_namespace *dpns;
+ RangeTblEntry *rte;
+
+ dpns = (deparse_namespace *) palloc(sizeof(deparse_namespace));
+
+ /* Build a minimal RTE for the rel */
+ rte = makeNode(RangeTblEntry);
+ rte->rtekind = RTE_RELATION;
+ rte->relid = relid;
+ rte->eref = aliasname;
+ rte->inh = false;
+ rte->inFromCl = true;
+
+ /* Build one-element rtable */
+ dpns->rtable = list_make1(rte);
+ dpns->ctes = NIL;
+ dpns->planstate = NULL;
+ dpns->ancestors = NIL;
+ dpns->outer_planstate = dpns->inner_planstate = NULL;
+ dpns->remotequery = true;
+
+ /* Return a one-deep namespace stack */
+ return list_make1(dpns);
+}
+#endif
+
/*
* deparse_context_for_planstate - Build deparse context for a plan
*
@@ -2663,7 +2698,9 @@ make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc,
context.prettyFlags = prettyFlags;
context.indentLevel = PRETTYINDENT_STD;
#ifdef PGXC
+#ifndef XCP
context.finalise_aggs = false;
+#endif /* XCP */
#endif /* PGXC */
memset(&dpns, 0, sizeof(dpns));
@@ -2691,11 +2728,7 @@ make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc,
foreach(action, actions)
{
query = (Query *) lfirst(action);
- get_query_def(query, buf, NIL, NULL, prettyFlags, 0
-#ifdef PGXC
- , false, false
-#endif /* PGXC */
- );
+ get_query_def(query, buf, NIL, NULL, prettyFlags, 0);
if (prettyFlags)
appendStringInfo(buf, ";\n");
else
@@ -2712,11 +2745,7 @@ make_ruledef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc,
Query *query;
query = (Query *) linitial(actions);
- get_query_def(query, buf, NIL, NULL, prettyFlags, 0
-#ifdef PGXC
- , false, false
-#endif /* PGXC */
- );
+ get_query_def(query, buf, NIL, NULL, prettyFlags, 0);
appendStringInfo(buf, ";");
}
}
@@ -2784,11 +2813,7 @@ make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc,
ev_relation = heap_open(ev_class, AccessShareLock);
get_query_def(query, buf, NIL, RelationGetDescr(ev_relation),
- prettyFlags, 0
-#ifdef PGXC
- , false, false
-#endif /* PGXC */
- );
+ prettyFlags, 0);
appendStringInfo(buf, ";");
heap_close(ev_relation, AccessShareLock);
@@ -2804,11 +2829,168 @@ make_viewdef(StringInfo buf, HeapTuple ruletup, TupleDesc rulettc,
* ----------
*/
void
-deparse_query(Query *query, StringInfo buf, List *parentnamespace,
- bool finalise_aggs, bool sortgroup_colno)
+deparse_query(Query *query, StringInfo buf, List *parentnamespace)
+{
+ get_query_def(query, buf, parentnamespace, NULL, 0, 0);
+}
+
+/* code borrowed from get_insert_query_def */
+void
+get_query_def_from_valuesList(Query *query, StringInfo buf)
{
- get_query_def(query, buf, parentnamespace, NULL, 0, 0, finalise_aggs,
- sortgroup_colno);
+
+ RangeTblEntry *select_rte = NULL;
+ RangeTblEntry *values_rte = NULL;
+ RangeTblEntry *rte;
+ char *sep;
+ ListCell *values_cell;
+ ListCell *l;
+ List *strippedexprs;
+ deparse_context context;
+ deparse_namespace dpns;
+
+ /*
+ * Before we begin to examine the query, acquire locks on referenced
+ * relations, and fix up deleted columns in JOIN RTEs. This ensures
+ * consistent results. Note we assume it's OK to scribble on the passed
+ * querytree!
+ */
+ AcquireRewriteLocks(query, false);
+
+ context.buf = buf;
+ context.namespaces = NIL;
+ context.windowClause = NIL;
+ context.windowTList = NIL;
+ context.varprefix = (list_length(query->rtable) != 1);
+ context.prettyFlags = 0;
+ context.indentLevel = 0;
+#ifdef PGXC
+#ifndef XCP
+ context.finalise_aggs = query->qry_finalise_aggs;
+#endif /* XCP */
+#endif /* PGXC */
+
+ dpns.rtable = query->rtable;
+ dpns.ctes = query->cteList;
+ dpns.planstate = NULL;
+ dpns.ancestors = NIL;
+ dpns.outer_planstate = dpns.inner_planstate = NULL;
+ dpns.remotequery = false;
+
+ /*
+ * If it's an INSERT ... SELECT or VALUES (...), (...), ... there will be
+ * a single RTE for the SELECT or VALUES.
+ */
+ foreach(l, query->rtable)
+ {
+ rte = (RangeTblEntry *) lfirst(l);
+
+ if (rte->rtekind == RTE_SUBQUERY)
+ {
+ if (select_rte)
+ elog(ERROR, "too many subquery RTEs in INSERT");
+ select_rte = rte;
+ }
+
+ if (rte->rtekind == RTE_VALUES)
+ {
+ if (values_rte)
+ elog(ERROR, "too many values RTEs in INSERT");
+ values_rte = rte;
+ }
+ }
+ if (select_rte && values_rte)
+ elog(ERROR, "both subquery and values RTEs in INSERT");
+
+ /*
+ * Start the query with INSERT INTO relname
+ */
+ rte = rt_fetch(query->resultRelation, query->rtable);
+ Assert(rte->rtekind == RTE_RELATION);
+
+ appendStringInfo(buf, "INSERT INTO %s (",
+ generate_relation_name(rte->relid, NIL));
+
+ /*
+ * Add the insert-column-names list. To handle indirection properly, we
+ * need to look for indirection nodes in the top targetlist (if it's
+ * INSERT ... SELECT or INSERT ... single VALUES), or in the first
+ * expression list of the VALUES RTE (if it's INSERT ... multi VALUES). We
+ * assume that all the expression lists will have similar indirection in
+ * the latter case.
+ */
+ if (values_rte)
+ values_cell = list_head((List *) linitial(values_rte->values_lists));
+ else
+ values_cell = NULL;
+ strippedexprs = NIL;
+ sep = "";
+ foreach(l, query->targetList)
+ {
+ TargetEntry *tle = (TargetEntry *) lfirst(l);
+
+ elog(DEBUG1, "targetEntry type is %d\n)", tle->expr->type);
+ if (tle->resjunk || !IsA(tle->expr, Var))
+ continue; /* ignore junk entries */
+
+ appendStringInfoString(buf, sep);
+ sep = ", ";
+
+ /*
+ * Put out name of target column; look in the catalogs, not at
+ * tle->resname, since resname will fail to track RENAME.
+ */
+ appendStringInfoString(buf,quote_identifier(get_relid_attribute_name(rte->relid, tle->resno)));
+
+ /*
+ * Print any indirection needed (subfields or subscripts), and strip
+ * off the top-level nodes representing the indirection assignments.
+ */
+ if (values_cell)
+ {
+ /* we discard the stripped expression in this case */
+ processIndirection((Node *) lfirst(values_cell), &context, true);
+ values_cell = lnext(values_cell);
+ }
+ else
+ {
+ /* we keep a list of the stripped expressions in this case */
+ strippedexprs = lappend(strippedexprs, processIndirection((Node *) tle->expr, &context, true));
+ }
+ }
+ appendStringInfo(buf, ") ");
+
+ if (select_rte)
+ {
+ /* Add the SELECT */
+ get_query_def(select_rte->subquery, buf, NIL, NULL,
+ context.prettyFlags, context.indentLevel);
+ }
+ else if (values_rte)
+ {
+ /* A WITH clause is possible here */
+ get_with_clause(query, &context);
+ /* Add the multi-VALUES expression lists */
+ get_values_def(values_rte->values_lists, &context);
+ }
+ else
+ {
+ /* A WITH clause is possible here */
+ get_with_clause(query, &context);
+ /* Add the single-VALUES expression list */
+ appendContextKeyword(&context, "VALUES (",
+ -PRETTYINDENT_STD, PRETTYINDENT_STD, 2);
+ get_rule_expr((Node *) strippedexprs, &context, false);
+ appendStringInfoChar(buf, ')');
+ }
+
+ /* Add RETURNING if present */
+ if (query->returningList)
+ {
+ appendContextKeyword(&context, " RETURNING",
+ -PRETTYINDENT_STD, PRETTYINDENT_STD, 1);
+ get_target_list(query->returningList, &context, NULL);
+ }
}
#endif
/* ----------
@@ -2820,11 +3002,7 @@ deparse_query(Query *query, StringInfo buf, List *parentnamespace,
*/
static void
get_query_def(Query *query, StringInfo buf, List *parentnamespace,
- TupleDesc resultDesc, int prettyFlags, int startIndent
-#ifdef PGXC
- , bool finalise_aggs, bool sortgroup_colno
-#endif /* PGXC */
- )
+ TupleDesc resultDesc, int prettyFlags, int startIndent)
{
deparse_context context;
deparse_namespace dpns;
@@ -2846,8 +3024,9 @@ get_query_def(Query *query, StringInfo buf, List *parentnamespace,
context.prettyFlags = prettyFlags;
context.indentLevel = startIndent;
#ifdef PGXC
- context.finalise_aggs = finalise_aggs;
- context.sortgroup_colno = sortgroup_colno;
+#ifndef XCP
+ context.finalise_aggs = query->qry_finalise_aggs;
+#endif /* XCP */
#endif /* PGXC */
memset(&dpns, 0, sizeof(dpns));
@@ -2986,11 +3165,7 @@ get_with_clause(Query *query, deparse_context *context)
if (PRETTY_INDENT(context))
appendContextKeyword(context, "", 0, 0, 0);
get_query_def((Query *) cte->ctequery, buf, context->namespaces, NULL,
- context->prettyFlags, context->indentLevel
-#ifdef PGXC
- , context->finalise_aggs, context->sortgroup_colno
-#endif /* PGXC */
- );
+ context->prettyFlags, context->indentLevel);
if (PRETTY_INDENT(context))
appendContextKeyword(context, "", 0, 0, 0);
appendStringInfoChar(buf, ')');
@@ -3393,11 +3568,7 @@ get_setop_query(Node *setOp, Query *query, deparse_context *context,
if (need_paren)
appendStringInfoChar(buf, '(');
get_query_def(subquery, buf, context->namespaces, resultDesc,
- context->prettyFlags, context->indentLevel
-#ifdef PGXC
- , context->finalise_aggs, context->sortgroup_colno
-#endif /* PGXC */
- );
+ context->prettyFlags, context->indentLevel);
if (need_paren)
appendStringInfoChar(buf, ')');
}
@@ -3493,7 +3664,7 @@ get_rule_sortgroupclause(SortGroupClause *srt, List *tlist, bool force_colno,
* dump it without any decoration. Otherwise, just dump the expression
* normally.
*/
- if (force_colno || context->sortgroup_colno)
+ if (force_colno)
{
Assert(!tle->resjunk);
appendStringInfo(buf, "%d", tle->resno);
@@ -3716,6 +3887,7 @@ get_insert_query_def(Query *query, deparse_context *context)
get_with_clause(query, context);
#ifdef PGXC
+#ifndef XCP
/*
* In the case of "INSERT ... DEFAULT VALUES" analyzed in pgxc planner,
* return the sql statement directly if the table has no default values.
@@ -3725,32 +3897,9 @@ get_insert_query_def(Query *query, deparse_context *context)
appendStringInfo(buf, "%s", query->sql_statement);
return;
}
-
- /*
- * select_rte and values_rte are not required by INSERT queries in XC
- * Both these should stay null for INSERT queries to work corretly
- * Consider an example
- * create table tt as values(1,'One'),(2,'Two');
- * This query uses values_rte, but we do not need them in XC
- * because it gets broken down into two queries
- * CREATE TABLE tt(column1 int4, column2 text)
- * and
- * INSERT INTO tt (column1, column2) VALUES ($1, $2)
- * Note that the insert query does not need values_rte
- *
- * Now consider another example
- * insert into tt select * from tt
- * This query uses select_rte, but again that is not required in XC
- * Again here the query gets broken down into two queries
- * SELECT column1, column2 FROM ONLY tt WHERE true
- * and
- * INSERT INTO tt (column1, column2) VALUES ($1, $2)
- * Note again that the insert query does not need select_rte
- * Hence we keep both select_rte and values_rte NULL.
- */
- if (!(IS_PGXC_COORDINATOR && !IsConnFromCoord()))
- {
#endif
+#endif
+
/*
* If it's an INSERT ... SELECT or VALUES (...), (...), ... there will be
* a single RTE for the SELECT or VALUES.
@@ -3773,11 +3922,23 @@ get_insert_query_def(Query *query, deparse_context *context)
values_rte = rte;
}
}
+ if (select_rte && values_rte)
+ elog(ERROR, "both subquery and values RTEs in INSERT");
+
#ifdef PGXC
+#ifndef XCP
+ /*
+ * If it's an INSERT ... SELECT or VALUES (...), (...), ...
+ * sql_statement is rewritten and assigned in RewriteQuery.
+ * Just return it here.
+ */
+ if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && values_rte != NULL)
+ {
+ appendStringInfo(buf, "%s", query->sql_statement);
+ return;
}
#endif
- if (select_rte && values_rte)
- elog(ERROR, "both subquery and values RTEs in INSERT");
+#endif
/*
* Start the query with INSERT INTO relname
*/
@@ -3848,11 +4009,7 @@ get_insert_query_def(Query *query, deparse_context *context)
{
/* Add the SELECT */
get_query_def(select_rte->subquery, buf, NIL, NULL,
- context->prettyFlags, context->indentLevel
-#ifdef PGXC
- , context->finalise_aggs, context->sortgroup_colno
-#endif /* PGXC */
- );
+ context->prettyFlags, context->indentLevel);
}
else if (values_rte)
{
@@ -4073,6 +4230,12 @@ get_utility_query_def(Query *query, deparse_context *context)
{
ColumnDef *coldef = (ColumnDef *) node;
TypeName *typename = coldef->typeName;
+#ifdef XCP
+ appendStringInfo(buf, "%s %s",
+ quote_identifier(coldef->colname),
+ format_type_with_typemod(typename->typeOid,
+ typename->typemod));
+#else
Type type;
/* error out if we have no recourse at all */
@@ -4092,6 +4255,7 @@ get_utility_query_def(Query *query, deparse_context *context)
appendStringInfo(buf, "%s %s", quote_identifier(coldef->colname),
typeTypeName(type));
ReleaseSysCache(type);
+#endif
}
else
elog(ERROR, "Invalid table column definition.");
@@ -6563,6 +6727,7 @@ get_agg_expr(Aggref *aggref, deparse_context *context)
}
#ifdef PGXC
+#ifndef XCP
/*
* Datanode should send finalised aggregate results. Datanodes evaluate only
* transition results. In order to get the finalised aggregate, we enclose
@@ -6589,6 +6754,7 @@ get_agg_expr(Aggref *aggref, deparse_context *context)
}
ReleaseSysCache(aggTuple);
}
+#endif /* XCP */
#endif /* PGXC */
appendStringInfo(buf, "%s(%s",
@@ -7018,11 +7184,7 @@ get_sublink_expr(SubLink *sublink, deparse_context *context)
appendStringInfoChar(buf, '(');
get_query_def(query, buf, context->namespaces, NULL,
- context->prettyFlags, context->indentLevel
-#ifdef PGXC
- , context->finalise_aggs, context->sortgroup_colno
-#endif /* PGXC */
- );
+ context->prettyFlags, context->indentLevel);
if (need_paren)
appendStringInfo(buf, "))");
@@ -7144,11 +7306,7 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context)
/* Subquery RTE */
appendStringInfoChar(buf, '(');
get_query_def(rte->subquery, buf, context->namespaces, NULL,
- context->prettyFlags, context->indentLevel,
-#ifdef PGXC
- context->finalise_aggs, context->sortgroup_colno
-#endif /* PGXC */
- );
+ context->prettyFlags, context->indentLevel);
appendStringInfoChar(buf, ')');
break;
case RTE_FUNCTION:
diff --git a/src/backend/utils/adt/version.c b/src/backend/utils/adt/version.c
index df0bab74f5..5dbd742b6f 100644
--- a/src/backend/utils/adt/version.c
+++ b/src/backend/utils/adt/version.c
@@ -3,6 +3,11 @@
* version.c
* Returns the PostgreSQL version string
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Copyright (c) 1998-2012, PostgreSQL Global Development Group
*
* IDENTIFICATION
@@ -24,9 +29,11 @@ pgsql_version(PG_FUNCTION_ARGS)
}
#ifdef PGXC
+#ifndef XCP
Datum
pgxc_version(PG_FUNCTION_ARGS)
{
PG_RETURN_TEXT_P(cstring_to_text(PGXC_VERSION_STR));
}
#endif
+#endif
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
index 9ccfc4f114..1c189fab6a 100644
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -85,6 +85,11 @@
* problems can be overcome cheaply.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -98,6 +103,9 @@
#include "access/xact.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
+#ifdef XCP
+#include "pgxc/pgxc.h"
+#endif
#include "storage/sinval.h"
#include "storage/smgr.h"
#include "utils/inval.h"
@@ -831,7 +839,18 @@ ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs,
void
AtEOXact_Inval(bool isCommit)
{
+#ifdef XCP
+ /*
+ * In our code, the distributed session may run on multiple backends,
+ * and we need to broadcast invalidation messages so they reach other
+ * backends even in case of rollback. If the session runs on a single
+ * backend the invalidation messages may still be applied locally.
+ * So the criteria may be more complex.
+ */
+ if (isCommit || IS_PGXC_DATANODE)
+#else
if (isCommit)
+#endif
{
/* Must be at top of stack */
Assert(transInvalInfo != NULL && transInvalInfo->parent == NULL);
diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c
index db996829b0..8971b05b3a 100644
--- a/src/backend/utils/cache/lsyscache.c
+++ b/src/backend/utils/cache/lsyscache.c
@@ -3,6 +3,11 @@
* lsyscache.c
* Convenience routines for common queries in the system catalog cache.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -1005,6 +1010,78 @@ get_collation_name(Oid colloid)
return NULL;
}
+
+#ifdef XCP
+/*
+ * get_collation_namespace
+ * Returns the namespace id of a given pg_collation entry.
+ *
+ * Returns an Oid of the collation's namespace.
+ */
+Oid
+get_collation_namespace(Oid colloid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(colloid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_collation colltup = (Form_pg_collation) GETSTRUCT(tp);
+ Oid result;
+
+ result = colltup->collnamespace;
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return InvalidOid;
+}
+
+
+/*
+ * get_collation_encoding
+ * Returns the encoding of a given pg_collation entry.
+ *
+ * Returns the collation's encoding, or -1 if entry does not exist.
+ */
+int32
+get_collation_encoding(Oid colloid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(colloid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_collation colltup = (Form_pg_collation) GETSTRUCT(tp);
+ int32 result;
+
+ result = colltup->collencoding;
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return -1;
+}
+
+
+/*
+ * get_collid
+ * Given a collation name, encoding and namespace OID, look up
+ * the collation OID.
+ *
+ * Returns InvalidOid if there is no such collation
+ */
+Oid
+get_collid(const char *collname, int32 collencoding, Oid collnsp)
+{
+ return GetSysCacheOid(COLLNAMEENCNSP,
+ CStringGetDatum(collname),
+ Int32GetDatum(collencoding),
+ ObjectIdGetDatum(collnsp),
+ 0);
+}
+#endif
+
/* ---------- CONSTRAINT CACHE ---------- */
/*
@@ -3172,6 +3249,159 @@ get_namespace_name(Oid nspid)
return NULL;
}
+
+#ifdef XCP
+/*
+ * Routines to get info to encode/decode oids when sending between nodes
+ */
+
+/*
+ * get_namespaceid
+ * Given a namespace name, look up the namespace OID.
+ *
+ * Returns InvalidOid if there is no such namespace
+ */
+Oid
+get_namespaceid(const char *nspname)
+{
+ return GetSysCacheOid(NAMESPACENAME,
+ CStringGetDatum(nspname),
+ 0, 0, 0);
+}
+
+/*
+ * get_typ_name
+ *
+ * Given the type OID, find the type name
+ * It returns palloc'd copy of the name or NULL if the cache lookup fails...
+ */
+char *
+get_typ_name(Oid typid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache(TYPEOID,
+ ObjectIdGetDatum(typid),
+ 0, 0, 0);
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_type typtup = (Form_pg_type) GETSTRUCT(tp);
+ char *result;
+
+ result = pstrdup(NameStr(typtup->typname));
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return NULL;
+}
+
+/*
+ * get_typ_namespace
+ *
+ * Given the type OID, find the namespace
+ * It returns InvalidOid if the cache lookup fails...
+ */
+Oid
+get_typ_namespace(Oid typid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache(TYPEOID,
+ ObjectIdGetDatum(typid),
+ 0, 0, 0);
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_type typtup = (Form_pg_type) GETSTRUCT(tp);
+ Oid result;
+
+ result = typtup->typnamespace;
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return InvalidOid;
+}
+
+/*
+ * get_typname_typid
+ * Given a type name and namespace OID, look up the type OID.
+ *
+ * Returns InvalidOid if there is no such type
+ */
+Oid
+get_typname_typid(const char *typname, Oid typnamespace)
+{
+ return GetSysCacheOid(TYPENAMENSP,
+ CStringGetDatum(typname),
+ ObjectIdGetDatum(typnamespace),
+ 0, 0);
+}
+
+/*
+ * get_funcid
+ * Given a function name, argument types and namespace OID, look up
+ * the function OID.
+ *
+ * Returns InvalidOid if there is no such function
+ */
+Oid
+get_funcid(const char *funcname, oidvector *argtypes, Oid funcnsp)
+{
+ return GetSysCacheOid(PROCNAMEARGSNSP,
+ CStringGetDatum(funcname),
+ PointerGetDatum(argtypes),
+ ObjectIdGetDatum(funcnsp),
+ 0);
+}
+
+/*
+ * get_opnamespace
+ * Given an opno, find the namespace
+ *
+ * Returns InvalidOid if there is no such operator
+ */
+Oid
+get_opnamespace(Oid opno)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache(OPEROID,
+ ObjectIdGetDatum(opno),
+ 0, 0, 0);
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_operator optup = (Form_pg_operator) GETSTRUCT(tp);
+ Oid result;
+
+ result = optup->oprnamespace;
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return InvalidOid;
+}
+
+/*
+ * get_operid
+ * Given an operator name, argument types and namespace OID, look up
+ * the operator OID.
+ *
+ * Returns InvalidOid if there is no such operator
+ */
+Oid
+get_operid(const char *oprname, Oid oprleft, Oid oprright, Oid oprnsp)
+{
+ return GetSysCacheOid(OPERNAMENSP,
+ CStringGetDatum(oprname),
+ ObjectIdGetDatum(oprleft),
+ ObjectIdGetDatum(oprright),
+ ObjectIdGetDatum(oprnsp));
+}
+
+#endif
+
+
/* ---------- PG_RANGE CACHE ---------- */
/*
diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c
index ab8d8a491e..c34d514997 100644
--- a/src/backend/utils/cache/plancache.c
+++ b/src/backend/utils/cache/plancache.c
@@ -35,6 +35,11 @@
* be infrequent enough that more-detailed tracking is not worth the effort.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -66,6 +71,9 @@
#ifdef PGXC
#include "commands/prepare.h"
#include "pgxc/execRemote.h"
+#ifdef XCP
+#include "pgxc/squeue.h"
+#endif
#include "pgxc/pgxc.h"
@@ -319,8 +327,10 @@ CompleteCachedPlan(CachedPlanSource *plansource,
plansource->cursor_options = cursor_options;
plansource->fixed_result = fixed_result;
#ifdef PGXC
+#ifndef XCP
plansource->stmt_name = NULL;
#endif
+#endif
plansource->resultDesc = PlanCacheComputeResultDesc(querytree_list);
MemoryContextSwitchTo(oldcxt);
@@ -438,6 +448,7 @@ ReleaseGenericPlan(CachedPlanSource *plansource)
CachedPlan *plan = plansource->gplan;
#ifdef PGXC
+#ifndef XCP
/* Drop this plan on remote nodes */
if (plan)
{
@@ -456,6 +467,19 @@ ReleaseGenericPlan(CachedPlanSource *plansource)
}
}
#endif
+#endif
+
+#ifdef XCP
+ /* Release SharedQueue if still held */
+ if (IsConnFromDatanode() && plan && list_length(plan->stmt_list) == 1)
+ {
+ PlannedStmt *pstmt;
+
+ pstmt = (PlannedStmt *) linitial(plan->stmt_list);
+ if (IsA(pstmt, PlannedStmt) && pstmt->pname)
+ SharedQueueRelease(pstmt->pname);
+ }
+#endif
Assert(plan->magic == CACHEDPLAN_MAGIC);
plansource->gplan = NULL;
@@ -535,6 +559,9 @@ RevalidateCachedQuery(CachedPlanSource *plansource)
MemoryContextDelete(qcxt);
}
+ /* Drop the generic plan reference if any */
+ ReleaseGenericPlan(plansource);
+
/*
* Now re-do parse analysis and rewrite. This not incidentally acquires
* the locks we need to do planning safely.
@@ -1141,9 +1168,7 @@ ReleaseCachedPlan(CachedPlan *plan, bool useResOwner)
Assert(plan->refcount > 0);
plan->refcount--;
if (plan->refcount == 0)
- {
MemoryContextDelete(plan->context);
- }
}
/*
@@ -1536,6 +1561,9 @@ PlanCacheComputeResultDesc(List *stmt_list)
switch (ChoosePortalStrategy(stmt_list))
{
+#ifdef XCP
+ case PORTAL_DISTRIBUTED:
+#endif
case PORTAL_ONE_SELECT:
case PORTAL_ONE_MOD_WITH:
query = (Query *) linitial(stmt_list);
@@ -1758,3 +1786,99 @@ ResetPlanCache(void)
}
}
}
+
+
+#ifdef XCP
+void
+SetRemoteSubplan(CachedPlanSource *plansource, const char *plan_string)
+{
+ CachedPlan *plan;
+ MemoryContext plan_context;
+ MemoryContext oldcxt;
+ RemoteStmt *rstmt;
+ PlannedStmt *stmt;
+
+ Assert(IS_PGXC_DATANODE);
+ Assert(plansource->raw_parse_tree == NULL);
+ Assert(plansource->query_list == NIL);
+
+ /*
+	 * Make a dedicated context to store the cached plan. It is created in the
+	 * current memory context for now; later it will be reparented to
+	 * CacheMemoryContext. If it were in CacheMemoryContext initially we would
+	 * have to destroy it explicitly in case of error.
+ */
+ plan_context = AllocSetContextCreate(CurrentMemoryContext,
+ "CachedPlan",
+ ALLOCSET_SMALL_MINSIZE,
+ ALLOCSET_SMALL_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ oldcxt = MemoryContextSwitchTo(plan_context);
+
+ /*
+ * Restore query plan.
+ */
+ set_portable_input(true);
+ rstmt = (RemoteStmt *) stringToNode((char *) plan_string);
+ set_portable_input(false);
+
+ stmt = makeNode(PlannedStmt);
+
+ stmt->commandType = rstmt->commandType;
+ stmt->hasReturning = rstmt->hasReturning;
+ stmt->canSetTag = true;
+ stmt->transientPlan = false; // ???
+ stmt->planTree = rstmt->planTree;
+ stmt->rtable = rstmt->rtable;
+ stmt->resultRelations = rstmt->resultRelations;
+ stmt->utilityStmt = NULL;
+ stmt->subplans = rstmt->subplans;
+ stmt->rewindPlanIDs = NULL;
+ stmt->rowMarks = rstmt->rowMarks;
+ stmt->relationOids = NIL;
+ stmt->invalItems = NIL;
+ stmt->nParamExec = rstmt->nParamExec;
+ stmt->nParamRemote = rstmt->nParamRemote;
+ stmt->remoteparams = rstmt->remoteparams;
+ stmt->pname = plansource->stmt_name;
+ stmt->distributionType = rstmt->distributionType;
+ stmt->distributionKey = rstmt->distributionKey;
+ stmt->distributionNodes = rstmt->distributionNodes;
+ stmt->distributionRestrict = rstmt->distributionRestrict;
+
+ /*
+ * Set up SharedQueue if intermediate results need to be distributed
+ * on multiple destination Datanodes.
+ */
+ if (IsConnFromDatanode() && stmt->pname &&
+ list_length(stmt->distributionRestrict) > 1)
+ SharedQueueAcquire(stmt->pname,
+ list_length(stmt->distributionRestrict) - 1);
+
+ /*
+ * Create and fill the CachedPlan struct within the new context.
+ */
+ plan = (CachedPlan *) palloc(sizeof(CachedPlan));
+ plan->magic = CACHEDPLAN_MAGIC;
+ plan->stmt_list = list_make1(stmt);
+ plan->saved_xmin = InvalidTransactionId;
+ plan->refcount = 1; /* will be referenced by plansource */
+ plan->context = plan_context;
+ if (plansource->is_saved)
+ {
+ MemoryContextSetParent(plan_context, CacheMemoryContext);
+ plan->is_saved = true;
+ }
+ else
+ {
+ MemoryContextSetParent(plan_context,
+ MemoryContextGetParent(plansource->context));
+ plan->is_saved = false;
+ }
+ plan->is_valid = true;
+
+ plansource->gplan = plan;
+
+ MemoryContextSwitchTo(oldcxt);
+}
+#endif
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 4df7547e1c..dcdae41fdf 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -3,6 +3,11 @@
* relcache.c
* POSTGRES relation descriptor cache code
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -860,7 +865,15 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
break;
case RELPERSISTENCE_TEMP:
if (isTempOrToastNamespace(relation->rd_rel->relnamespace))
+ {
+#ifdef XCP
+ relation->rd_backend = OidIsValid(MyCoordId) ?
+ MyFirstBackendId : MyBackendId;
+#else
+
relation->rd_backend = MyBackendId;
+#endif
+ }
else
{
/*
@@ -901,9 +914,14 @@ RelationBuildDesc(Oid targetRelId, bool insertIt)
relation->trigdesc = NULL;
#ifdef PGXC
+#ifdef XCP
+ if (IS_PGXC_COORDINATOR &&
+ relation->rd_id >= FirstNormalObjectId)
+#else
if (IS_PGXC_COORDINATOR &&
relation->rd_id >= FirstNormalObjectId &&
!IsAutoVacuumWorkerProcess())
+#endif
RelationBuildLocator(relation);
#endif
/*
@@ -2542,6 +2560,11 @@ RelationBuildLocalRelation(const char *relname,
rel->rd_backend = InvalidBackendId;
break;
case RELPERSISTENCE_TEMP:
+#ifdef XCP
+ if (OidIsValid(MyCoordId))
+ rel->rd_backend = MyFirstBackendId;
+ else
+#endif
rel->rd_backend = MyBackendId;
break;
default:
@@ -2905,6 +2928,7 @@ RelationCacheInitializePhase3(void)
TriggerRelationId);
#define NUM_CRITICAL_LOCAL_INDEXES 7 /* fix if you change list above */
+
criticalRelcachesBuilt = true;
}
diff --git a/src/backend/utils/errcodes.txt b/src/backend/utils/errcodes.txt
index 3e04164956..ca8a543314 100644
--- a/src/backend/utils/errcodes.txt
+++ b/src/backend/utils/errcodes.txt
@@ -461,3 +461,4 @@ Section: Class XX - Internal Error
XX000 E ERRCODE_INTERNAL_ERROR internal_error
XX001 E ERRCODE_DATA_CORRUPTED data_corrupted
XX002 E ERRCODE_INDEX_CORRUPTED index_corrupted
+XX010 E ERRCODE_PRODUCER_ERROR
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index fb46ab7218..3a78d7ab17 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -3,6 +3,11 @@
* globals.c
* global variable declarations
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -59,6 +64,14 @@ char postgres_exec_path[MAXPGPATH]; /* full path to backend */
/* note: currently this is not valid in backend processes */
#endif
+#ifdef XCP
+Oid MyCoordId = InvalidOid;
+
+int MyCoordPid = 0;
+
+BackendId MyFirstBackendId = InvalidBackendId;
+#endif
+
BackendId MyBackendId = InvalidBackendId;
Oid MyDatabaseId = InvalidOid;
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index 6b2833e1cb..ce4bb71366 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -3,6 +3,11 @@
* miscinit.c
* miscellaneous initialization support stuff
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -29,9 +34,15 @@
#include <utime.h>
#endif
+#ifdef XCP
+#include "catalog/namespace.h"
+#endif
#include "catalog/pg_authid.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
+#ifdef XCP
+#include "pgxc/execRemote.h"
+#endif
#include "postmaster/autovacuum.h"
#include "postmaster/postmaster.h"
#include "storage/fd.h"
@@ -543,6 +554,117 @@ SetSessionAuthorization(Oid userid, bool is_superuser)
PGC_INTERNAL, PGC_S_OVERRIDE);
}
+
+#ifdef XCP
+void
+SetGlobalSession(Oid coordid, int coordpid)
+{
+ bool reset = false;
+ BackendId firstBackend = InvalidBackendId;
+ int bCount = 0;
+ int bPids[MaxBackends];
+
+ /* If nothing changed do nothing */
+ if (MyCoordId == coordid && MyCoordPid == coordpid)
+ return;
+
+ /*
+ * Need to reset pool manager agent if the backend being assigned to
+ * different global session or assignment is canceled.
+ */
+ if (OidIsValid(MyCoordId) &&
+ (MyCoordId != coordid || MyCoordPid != coordpid))
+ reset = true;
+
+retry:
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ /* Expose distributed session id in the PGPROC structure */
+ MyProc->coordId = coordid;
+ MyProc->coordPid = coordpid;
+ /*
+ * Determine first backend id.
+ * If this backend is the first backend of the distributed session on the
+ * node we should clean up the temporary namespace.
+	 * A backend is the first if there are no other backends with the same
+	 * distributed session id; otherwise we can copy the first valid firstBackendId.
+ * If none of them valid that means the first is still cleaning up the
+ * temporary namespace.
+ */
+ if (OidIsValid(coordid))
+ firstBackend = GetFirstBackendId(&bCount, bPids);
+ else
+ firstBackend = InvalidBackendId;
+ /* If first backend id is defined set it right now */
+ if (firstBackend != InvalidBackendId)
+ MyProc->firstBackendId = firstBackend;
+ LWLockRelease(ProcArrayLock);
+
+ if (OidIsValid(coordid) && firstBackend == InvalidBackendId)
+ {
+ /*
+ * We are the first or need to retry
+ */
+ if (bCount > 0)
+ {
+ /* XXX sleep ? */
+ goto retry;
+ }
+ else
+ {
+ /* Set globals for this backend */
+ MyCoordId = coordid;
+ MyCoordPid = coordpid;
+ MyFirstBackendId = MyBackendId;
+ /* XXX Maybe this lock is not needed because of atomic operation? */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ MyProc->firstBackendId = MyBackendId;
+ LWLockRelease(ProcArrayLock);
+ }
+ }
+ else
+ {
+ /* Set globals for this backend */
+ MyCoordId = coordid;
+ MyCoordPid = coordpid;
+ MyFirstBackendId = firstBackend;
+ }
+
+ if (reset)
+ {
+ /*
+ * Next time when backend will be assigned to a global session it will
+ * be referencing different temp namespace
+ */
+ ForgetTempTableNamespace();
+ /*
+ * Forget all local and session parameters cached for the Datanodes.
+ * They do not belong to that session.
+ */
+ PGXCNodeResetParams(false);
+ /*
+ * Release node connections, if still held.
+ */
+ release_handles();
+ /*
+ * XXX Do other stuff like release secondary Datanode connections,
+ * clean up shared queues ???
+ */
+ }
+}
+
+
+/*
+ * Returns the name of the role that should be used to access other cluster
+ * nodes.
+ */
+char *
+GetClusterUserName(void)
+{
+ return GetUserNameFromId(AuthenticatedUserId);
+}
+#endif
+
+
/*
* Report current role id
* This follows the semantics of SET ROLE, ie return the outer-level ID
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 93da70681b..282a74666a 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -3,6 +3,11 @@
* postinit.c
* postgres initialization utilities
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -34,6 +39,9 @@
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "pgstat.h"
+#ifdef XCP
+#include "pgxc/pgxc.h"
+#endif
#include "postmaster/autovacuum.h"
#include "postmaster/postmaster.h"
#include "replication/walsender.h"
@@ -305,6 +313,9 @@ CheckMyDatabase(const char *name, bool am_superuser)
* just document that the connection limit is approximate.
*/
if (dbform->datconnlimit >= 0 &&
+#ifdef XCP
+ IS_PGXC_COORDINATOR &&
+#endif
!am_superuser &&
CountDBBackends(MyDatabaseId) > dbform->datconnlimit)
ereport(FATAL,
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index e5d95457d7..cdd82bcdc4 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -6,6 +6,11 @@
* See src/backend/utils/misc/README for more information.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Copyright (c) 2000-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
* Written by Peter Eisentraut <[email protected]>.
@@ -59,14 +64,20 @@
#ifdef PGXC
#include "commands/tablecmds.h"
#include "nodes/nodes.h"
-#include "optimizer/pgxcship.h"
#include "pgxc/execRemote.h"
#include "pgxc/locator.h"
-#include "optimizer/pgxcplan.h"
+#include "pgxc/planner.h"
#include "pgxc/poolmgr.h"
#include "pgxc/nodemgr.h"
#include "pgxc/xc_maintenance_mode.h"
#endif
+#ifdef XCP
+#include "commands/sequence.h"
+#include "pgxc/nodemgr.h"
+#include "pgxc/squeue.h"
+#include "utils/snapmgr.h"
+#include "parser/parse_utilcmd.h"
+#endif
#include "postmaster/autovacuum.h"
#include "postmaster/bgwriter.h"
#include "postmaster/postmaster.h"
@@ -201,8 +212,10 @@ static bool check_ssl(bool *newval, void **extra, GucSource source);
static bool check_stage_log_stats(bool *newval, void **extra, GucSource source);
static bool check_log_stats(bool *newval, void **extra, GucSource source);
#ifdef PGXC
+#ifndef XCP
static bool check_pgxc_maintenance_mode(bool *newval, void **extra, GucSource source);
#endif
+#endif
static bool check_canonical_path(char **newval, void **extra, GucSource source);
static bool check_timezone_abbreviations(char **newval, void **extra, GucSource source);
static void assign_timezone_abbreviations(const char *newval, void *extra);
@@ -229,6 +242,10 @@ static const char *show_log_file_mode(void);
static char *config_enum_get_options(struct config_enum * record,
const char *prefix, const char *suffix,
const char *separator);
+#ifdef XCP
+static bool check_storm_catalog_remap_string(char **newval,
+ void **extra, GucSource source);
+#endif
/*
@@ -479,6 +496,10 @@ int tcp_keepalives_idle;
int tcp_keepalives_interval;
int tcp_keepalives_count;
+#ifdef XCP
+char *storm_catalog_remap_string;
+#endif
+
/*
* These variables are all dummies that don't do anything, except in some
* cases provide the value for SHOW to display. The real state is elsewhere
@@ -502,6 +523,9 @@ static char *log_timezone_string;
static char *timezone_abbreviations_string;
static char *XactIsoLevel_string;
static char *session_authorization_string;
+#ifdef XCP
+static char *global_session_string;
+#endif
static int max_function_args;
static int max_index_keys;
static int max_identifier_length;
@@ -808,6 +832,7 @@ static struct config_bool ConfigureNamesBool[] =
NULL, NULL, NULL
},
#ifdef PGXC
+#ifndef XCP
{
{"enable_remotejoin", PGC_USERSET, QUERY_TUNING_METHOD,
gettext_noop("Enables the planner's use of remote join plans."),
@@ -835,25 +860,21 @@ static struct config_bool ConfigureNamesBool[] =
true,
NULL, NULL, NULL
},
+#else
{
- {"enable_remotesort", PGC_USERSET, QUERY_TUNING_METHOD,
- gettext_noop("Enables the planner's use of remote sort plans."),
- NULL
- },
- &enable_remotesort,
- true,
- NULL, NULL, NULL
- },
- {
- {"enable_remotelimit", PGC_USERSET, QUERY_TUNING_METHOD,
- gettext_noop("Enables the planner's use of remote limit plans."),
- NULL
+ {"loose_constraints", PGC_USERSET, COORDINATORS,
+ gettext_noop("Relax enforcing of constraints"),
+ gettext_noop("If enabled then constraints like foreign keys "
+ "are not enforced. It's the users responsibility "
+ "to maintain referential integrity at the application "
+ "level")
},
- &enable_remotelimit,
- true,
+ &loose_constraints,
+ false,
NULL, NULL, NULL
},
#endif
+#endif
{
{"geqo", PGC_USERSET, QUERY_TUNING_GEQO,
gettext_noop("Enables genetic query optimization."),
@@ -1454,7 +1475,11 @@ static struct config_bool ConfigureNamesBool[] =
},
{
+#ifdef XCP
+ {"synchronize_seqscans", PGC_SUSET, COMPAT_OPTIONS_PREVIOUS,
+#else
{"synchronize_seqscans", PGC_USERSET, COMPAT_OPTIONS_PREVIOUS,
+#endif
gettext_noop("Enable synchronized sequential scans."),
NULL
},
@@ -1516,6 +1541,7 @@ static struct config_bool ConfigureNamesBool[] =
NULL, NULL, NULL
},
#ifdef PGXC
+#ifndef XCP
{
{"persistent_datanode_connections", PGC_BACKEND, DEVELOPER_OPTIONS,
gettext_noop("Session never releases acquired connections."),
@@ -1527,6 +1553,15 @@ static struct config_bool ConfigureNamesBool[] =
NULL, NULL, NULL
},
{
+ {"strict_statement_checking", PGC_USERSET, DEVELOPER_OPTIONS,
+ gettext_noop("Forbid statements that are not safe for the cluster"),
+ NULL
+ },
+ &StrictStatementChecking,
+ true,
+ NULL, NULL, NULL
+ },
+ {
{"enforce_two_phase_commit", PGC_SUSET, XC_HOUSEKEEPING_OPTIONS,
gettext_noop("Enforce the use of two-phase commit on transactions that"
"made use of temporary objects"),
@@ -1546,6 +1581,7 @@ static struct config_bool ConfigureNamesBool[] =
check_pgxc_maintenance_mode, NULL, NULL
},
#endif
+#endif
{
{"lo_compat_privileges", PGC_SUSET, COMPAT_OPTIONS_PREVIOUS,
@@ -1750,7 +1786,11 @@ static struct config_int ConfigureNamesInt[] =
},
{
+#ifdef XCP
+ {"temp_buffers", PGC_SUSET, RESOURCES_MEM,
+#else
{"temp_buffers", PGC_USERSET, RESOURCES_MEM,
+#endif
gettext_noop("Sets the maximum number of temporary buffers used by each session."),
NULL,
GUC_UNIT_BLOCKS
@@ -1800,7 +1840,11 @@ static struct config_int ConfigureNamesInt[] =
},
{
+#ifdef XCP
+ {"work_mem", PGC_SUSET, RESOURCES_MEM,
+#else
{"work_mem", PGC_USERSET, RESOURCES_MEM,
+#endif
gettext_noop("Sets the maximum memory to be used for query workspaces."),
gettext_noop("This much memory can be used by each internal "
"sort operation and hash table before switching to "
@@ -1813,7 +1857,11 @@ static struct config_int ConfigureNamesInt[] =
},
{
+#ifdef XCP
+ {"maintenance_work_mem", PGC_SUSET, RESOURCES_MEM,
+#else
{"maintenance_work_mem", PGC_USERSET, RESOURCES_MEM,
+#endif
gettext_noop("Sets the maximum memory to be used for maintenance operations."),
gettext_noop("This includes operations such as VACUUM and CREATE INDEX."),
GUC_UNIT_KB
@@ -2154,7 +2202,11 @@ static struct config_int ConfigureNamesInt[] =
},
{
+#ifdef XCP
+ {"commit_delay", PGC_SUSET, WAL_SETTINGS,
+#else
{"commit_delay", PGC_USERSET, WAL_SETTINGS,
+#endif
gettext_noop("Sets the delay in microseconds between transaction commit and "
"flushing WAL to disk."),
NULL
@@ -2165,7 +2217,11 @@ static struct config_int ConfigureNamesInt[] =
},
{
+#ifdef XCP
+ {"commit_siblings", PGC_SUSET, WAL_SETTINGS,
+#else
{"commit_siblings", PGC_USERSET, WAL_SETTINGS,
+#endif
gettext_noop("Sets the minimum concurrent open transactions before performing "
"commit_delay."),
NULL
@@ -2505,6 +2561,51 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
#ifdef PGXC
+#ifdef XCP
+ {
+ {"sequence_range", PGC_USERSET, COORDINATORS | DATA_NODES,
+ gettext_noop("The range of values to ask from GTM for sequences. "
+ "If CACHE parameter is set then that overrides this."),
+ NULL,
+ },
+ &SequenceRangeVal,
+ 1, 1, INT_MAX,
+ NULL, NULL, NULL
+ },
+
+ {
+ {"pool_conn_keepalive", PGC_SIGHUP, DATA_NODES,
+ gettext_noop("Close connections if they are idle in the pool for that time."),
+ gettext_noop("A value of -1 turns autoclose off."),
+ GUC_UNIT_S
+ },
+ &PoolConnKeepAlive,
+ 600, -1, INT_MAX,
+ NULL, NULL, NULL
+ },
+
+ {
+ {"pool_maintenance_timeout", PGC_SIGHUP, DATA_NODES,
+ gettext_noop("Launch maintenance routine if pooler idle for that time."),
+ gettext_noop("A value of -1 turns feature off."),
+ GUC_UNIT_S
+ },
+ &PoolMaintenanceTimeout,
+ 30, -1, INT_MAX,
+ NULL, NULL, NULL
+ },
+
+ {
+ {"max_pool_size", PGC_SIGHUP, DATA_NODES,
+ gettext_noop("Max pool size."),
+ gettext_noop("If number of active connections reaches this value, "
+ "other connection requests will be refused")
+ },
+ &MaxPoolSize,
+ 100, 1, 65535,
+ NULL, NULL, NULL
+ },
+#else
{
{"min_pool_size", PGC_POSTMASTER, DATA_NODES,
gettext_noop("Initial pool size."),
@@ -2526,6 +2627,7 @@ static struct config_int ConfigureNamesInt[] =
100, 1, 65535,
NULL, NULL, NULL
},
+#endif
{
{"pooler_port", PGC_POSTMASTER, DATA_NODES,
@@ -2568,7 +2670,37 @@ static struct config_int ConfigureNamesInt[] =
16, 2, 65535,
NULL, NULL, NULL
},
+
+#ifdef XCP
+ /*
+	 * Shared queues provide shared memory buffers to stream data from the
+	 * "producer" - the process which executes a subplan - to "consumers" -
+	 * processes that forward data to destination data nodes.
+ */
+ {
+ {"shared_queues", PGC_POSTMASTER, RESOURCES_MEM,
+ gettext_noop("Sets the number of shared memory queues used by the distributed executor."),
+ NULL,
+ GUC_UNIT_BLOCKS
+ },
+ &NSQueues,
+ 64, 16, INT_MAX,
+ NULL, NULL, NULL
+ },
+
+ {
+ {"shared_queue_size", PGC_POSTMASTER, RESOURCES_MEM,
+ gettext_noop("Sets the amount of memory allocated for a shared memory queue."),
+ NULL,
+ GUC_UNIT_BLOCKS
+ },
+ &SQueueSize,
+ 64, 16, MAX_KILOBYTES,
+ NULL, NULL, NULL
+ },
#endif
+#endif /* PGXC */
+
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
@@ -2640,6 +2772,28 @@ static struct config_real ConfigureNamesReal[] =
NULL, NULL, NULL
},
+#ifdef XCP
+ {
+ {"network_byte_cost", PGC_USERSET, QUERY_TUNING_COST,
+ gettext_noop("Sets the planner's estimate of the cost of "
+ "sending data from remote node."),
+ NULL
+ },
+ &network_byte_cost,
+ DEFAULT_NETWORK_BYTE_COST, 0, DBL_MAX, NULL, NULL
+ },
+
+ {
+ {"remote_query_cost", PGC_USERSET, QUERY_TUNING_COST,
+ gettext_noop("Sets the planner's estimate of the cost of "
+ "setting up remote subquery."),
+ NULL
+ },
+ &remote_query_cost,
+ DEFAULT_REMOTE_QUERY_COST, 0, DBL_MAX, NULL, NULL
+ },
+#endif
+
{
{"geqo_selection_bias", PGC_USERSET, QUERY_TUNING_GEQO,
gettext_noop("GEQO: selective pressure within the population."),
@@ -2983,6 +3137,31 @@ static struct config_string ConfigureNamesString[] =
check_session_authorization, assign_session_authorization, NULL
},
+#ifdef XCP
+ {
+ {"global_session", PGC_USERSET, UNGROUPED,
+ gettext_noop("Sets the global session identifier."),
+ NULL,
+ GUC_IS_NAME | GUC_REPORT | GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_NOT_WHILE_SEC_REST
+ },
+ &global_session_string,
+ "none",
+ check_global_session, assign_global_session, NULL
+ },
+
+ {
+ {"pgxc_catalog_remap", PGC_SIGHUP, XC_HOUSEKEEPING_OPTIONS,
+ gettext_noop("List of catalog tables/views that always need to be "
+ "mapped to the storm_catalog."),
+ NULL,
+ GUC_LIST_INPUT | GUC_LIST_QUOTE | GUC_SUPERUSER_ONLY
+ },
+ &storm_catalog_remap_string,
+ "pg_roles, pg_shdescription, pg_database, pg_db_role_setting, pg_tablespace, pg_auth_members, pg_shdepend, pg_stat_database, pg_stat_database_conflicts, pg_stat_activity, pg_locks, pg_prepared_xacts, pg_settings, pg_user, pg_group, pg_shadow, pg_user_mappings, pg_database_size, pg_show_all_settings, pg_stat_get_activity, pg_lock_status",
+ check_storm_catalog_remap_string, NULL, NULL
+ },
+#endif
+
{
{"log_destination", PGC_SIGHUP, LOGGING_WHERE,
gettext_noop("Sets the destination for server log output."),
@@ -3252,6 +3431,17 @@ static struct config_string ConfigureNamesString[] =
NULL, NULL, NULL
},
#endif
+#ifdef XCP
+ {
+ {"parentnode", PGC_BACKEND, CONN_AUTH,
+ gettext_noop("Sets the name of the parent data node"),
+ NULL
+ },
+ &parentPGXCNode,
+ NULL,
+ NULL, NULL, NULL
+ },
+#endif /* XCP */
{
{"ssl_ciphers", PGC_POSTMASTER, CONN_AUTH_SECURITY,
gettext_noop("Sets the list of allowed SSL ciphers."),
@@ -3497,6 +3687,9 @@ static struct config_enum ConfigureNamesEnum[] =
#ifdef PGXC
{
{"remotetype", PGC_BACKEND, CONN_AUTH,
+#ifdef XCP
+ gettext_noop("Sets the type of Postgres-XL remote connection"),
+#endif
gettext_noop("Sets the type of Postgres-XC remote connection"),
NULL
},
@@ -5347,6 +5540,13 @@ set_config_option(const char *name, const char *value,
struct config_generic *record;
bool prohibitValueChange = false;
bool makeDefault;
+#ifdef XCP
+ bool send_to_nodes = false;
+
+ /* Determine now, because source may be changed below in the function */
+ if (source == PGC_S_SESSION && (IS_PGXC_DATANODE || !IsConnFromCoord()))
+ send_to_nodes = true;
+#endif
#ifdef PGXC
/*
@@ -6083,6 +6283,75 @@ set_config_option(const char *name, const char *value,
if (changeVal && (record->flags & GUC_REPORT))
ReportGUCOption(record);
+#ifdef XCP
+ if (send_to_nodes)
+ {
+ RemoteQuery *step;
+ StringInfoData poolcmd;
+
+ initStringInfo(&poolcmd);
+
+ /*
+	 * We get a parse error when sending down
+ * SET transaction_isolation TO read committed;
+ * XXX generic solution?
+ */
+ if (value && strcmp("transaction_isolation", name) == 0)
+ value = quote_identifier(value);
+
+ /*
+ * Quote value if it is including memory or time units
+ */
+ if (value && (record->flags & (GUC_UNIT_MEMORY | GUC_UNIT_TIME)))
+ value = quote_identifier(value);
+
+ /*
+ * Save new parameter value with the node manager.
+ * XXX here we may check: if value equals to configuration default
+ * just reset parameter instead. Minus one table entry, shorter SET
+	 * command sent down... Sounds like an optimization.
+ */
+ if (action == GUC_ACTION_LOCAL)
+ {
+ if (IsTransactionBlock())
+ PGXCNodeSetParam(true, name, value);
+ appendStringInfo(&poolcmd, "SET LOCAL %s TO %s", name,
+ (value ? value : "DEFAULT"));
+ }
+ else
+ {
+ PGXCNodeSetParam(false, name, value);
+ appendStringInfo(&poolcmd, "SET %s TO %s", name,
+ (value ? value : "DEFAULT"));
+ }
+
+ /*
+ * Send new value down to remote nodes if any is connected
+ * XXX here we are creatig a node and invoke a function that is trying
+ * to send some. That introduces some overhead, which may seem to be
+ * significant if application sets a bunch of parameters before doing
+ * anything useful - waste work for for each set statement.
+ * We may want to avoid that, by resetting the remote parameters and
+ * flagging that parameters needs to be updated before sending down next
+ * statement.
+ * On the other hand if session runs with a number of customized
+ * parameters and switching one, that would cause all values are resent.
+ * So let's go with "send immediately" approach: parameters are not set
+ * too often to care about overhead here.
+ */
+ step = makeNode(RemoteQuery);
+ step->combine_type = COMBINE_TYPE_SAME;
+ step->exec_nodes = NULL;
+ step->sql_statement = poolcmd.data;
+	/* force_autocommit actually means it does not start a transaction on nodes */
+ step->force_autocommit = true;
+ step->exec_type = EXEC_ON_CURRENT;
+ ExecRemoteUtility(step);
+ pfree(step);
+ pfree(poolcmd.data);
+ }
+#endif
+
return changeVal ? 1 : -1;
}
@@ -6407,6 +6676,11 @@ ExecSetVariableStmt(VariableSetStmt *stmt)
{
ListCell *head;
+#ifdef XCP
+ /* SET TRANSACTION assumes "local" */
+ stmt->is_local = true;
+#endif
+
foreach(head, stmt->args)
{
DefElem *item = (DefElem *) lfirst(head);
@@ -6429,6 +6703,11 @@ ExecSetVariableStmt(VariableSetStmt *stmt)
{
ListCell *head;
+#ifdef XCP
+ /* SET SESSION CHARACTERISTICS assumes "session" */
+ stmt->is_local = false;
+#endif
+
foreach(head, stmt->args)
{
DefElem *item = (DefElem *) lfirst(head);
@@ -6568,6 +6847,7 @@ set_config_by_name(PG_FUNCTION_ARGS)
#ifdef PGXC
+#ifndef XCP
/*
* Convert this to SET statement and pass it to pooler.
* If command is local and we are not in a transaction block do NOT
@@ -6590,6 +6870,7 @@ set_config_by_name(PG_FUNCTION_ARGS)
}
#endif
+#endif
/* Convert return string to text */
PG_RETURN_TEXT_P(cstring_to_text(new_value));
@@ -8718,6 +8999,7 @@ check_log_stats(bool *newval, void **extra, GucSource source)
}
#ifdef PGXC
+#ifndef XCP
/*
* Only a warning is printed to log.
* Returning false will cause FATAL error and it will not be good.
@@ -8763,6 +9045,7 @@ check_pgxc_maintenance_mode(bool *newval, void **extra, GucSource source)
}
}
#endif
+#endif
static bool
check_canonical_path(char **newval, void **extra, GucSource source)
@@ -9060,4 +9343,73 @@ show_log_file_mode(void)
return buf;
}
+#ifdef XCP
+/*
+ * remove all unwanted spaces from the input, lowercase all the characters and
+ * also add a ',' towards the end if it does not exist. This makes calling
+ * strstr easier on it
+ */
+static bool
+check_storm_catalog_remap_string(char **newval, void **extra, GucSource source)
+{
+ /*
+ * Check syntax. newval must be a comma separated list of identifiers.
+ * Whitespace is allowed but removed from the result.
+ */
+ bool hasSpaceAfterToken = false;
+ const char *cp = *newval;
+ int symLen = 0;
+ char c;
+ StringInfoData buf;
+
+ /* Default NULL is OK */
+ if (cp == NULL)
+ return true;
+
+ initStringInfo(&buf);
+ while ((c = *cp++) != '\0')
+ {
+ if (isspace((unsigned char) c))
+ {
+ if (symLen > 0)
+ hasSpaceAfterToken = true;
+ continue;
+ }
+
+ if (c == ',')
+ {
+ if (symLen > 0) /* terminate identifier */
+ {
+ appendStringInfoChar(&buf, ',');
+ symLen = 0;
+ }
+ hasSpaceAfterToken = false;
+ continue;
+ }
+
+ if (hasSpaceAfterToken)
+ {
+ /*
+ * Syntax error due to token following space after token
+ */
+ pfree(buf.data);
+ return false;
+ }
+ /* We lower case everything */
+ appendStringInfoChar(&buf, pg_tolower(c));
+ symLen++;
+ }
+
+ /* Append ',' at end if not present already */
+ if (symLen != 0 && buf.len > 0)
+ appendStringInfoChar(&buf, ',');
+
+ /* GUC wants the result malloc'd */
+ free(*newval);
+ *newval = guc_strdup(LOG, buf.data);
+
+ pfree(buf.data);
+ return true;
+}
+#endif
#include "guc-file.c"
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 4d9121814b..9bb47f967a 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -152,6 +152,10 @@
#effective_io_concurrency = 1 # 1-1000; 0 disables prefetching
+# - Shared queues -
+
+#shared_queues = 64 # min 16
+#shared_queue_size = 64KB # min 16KB
#------------------------------------------------------------------------------
# WRITE AHEAD LOG
@@ -263,6 +267,8 @@
#cpu_tuple_cost = 0.01 # same scale as above
#cpu_index_tuple_cost = 0.005 # same scale as above
#cpu_operator_cost = 0.0025 # same scale as above
+#network_byte_cost = 0.001 # same scale as above
+#remote_query_cost = 100.0 # same scale as above
#effective_cache_size = 128MB
# - Genetic Query Optimizer -
@@ -570,10 +576,13 @@
#pooler_port = 6667 # Pool Manager TCP port
# (change requires restart)
-#min_pool_size = 1 # Initial pool size
- # (change requires restart)
#max_pool_size = 100 # Maximum pool size
- # (change requires restart)
+#pool_conn_keepalive = 600 # Close connections if they are idle
+ # in the pool for that time
+ # A value of -1 turns autoclose off
+#pool_maintenance_timeout = 30 # Launch maintenance routine if pooler
+ # is idle for that time
+ # A value of -1 turns feature off
#persistent_datanode_connections = off # Set persistent connection mode for pooler
# if set at on, connections taken for session
# are not put back to pool
@@ -598,20 +607,14 @@
##------------------------------------------------------------------------------
# OTHER PG-XC OPTIONS
#------------------------------------------------------------------------------
+#strict_statement_checking = on # Forbid PG-XC-unsafe SQL
+ # Enabling is useful for development
#enforce_two_phase_commit = on # Enforce the usage of two-phase commit on transactions
# where temporary objects are used or ON COMMIT actions
# are pending.
# Usage of commit instead of two-phase commit may break
# data consistency so use at your own risk.
-# - Postgres-XC specific Planner Method Configuration
-
-#enable_fast_query_shipping = on
-#enable_remotejoin = on
-#enable_remotegroup = on
-#enable_remotelimit = on
-#enable_remotesort = on
-
#------------------------------------------------------------------------------
# CUSTOMIZED OPTIONS
#------------------------------------------------------------------------------
diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c
index 6a1858d2a5..2d28c4e2e3 100644
--- a/src/backend/utils/mmgr/portalmem.c
+++ b/src/backend/utils/mmgr/portalmem.c
@@ -8,6 +8,11 @@
* doesn't actually run the executor for them.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -395,6 +400,52 @@ PortalCreateHoldStore(Portal portal)
MemoryContextSwitchTo(oldcxt);
}
+#ifdef XCP
+void
+PortalCreateProducerStore(Portal portal)
+{
+ MemoryContext oldcxt;
+
+ Assert(portal->holdContext == NULL);
+ Assert(portal->holdStore == NULL);
+
+ /*
+ * Create the memory context that is used for storage of the tuple set.
+ * Note this is NOT a child of the portal's heap memory.
+ */
+ portal->holdContext =
+ AllocSetContextCreate(PortalMemory,
+ "PortalHoldContext",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+
+ /*
+ * Create the tuple store, selecting cross-transaction temp files, and
+ * enabling random access only if cursor requires scrolling.
+ *
+ * XXX: Should maintenance_work_mem be used for the portal size?
+ */
+ oldcxt = MemoryContextSwitchTo(portal->holdContext);
+
+ portal->tmpContext = AllocSetContextCreate(portal->holdContext,
+ "TuplestoreTempContext",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+ /*
+ * We really do not need interXact set to true for the producer store,
+ * but we have to set it as long as we store it in the holdStore variable -
+ * the portal destroys it after the resource owner, invalidating the internal
+ * temporary file if the tuplestore has ever spilled to disk
+ */
+ portal->holdStore = tuplestore_begin_datarow(true, work_mem,
+ portal->tmpContext);
+
+ MemoryContextSwitchTo(oldcxt);
+}
+#endif
+
/*
* PinPortal
* Protect a portal from dropping.
@@ -524,6 +575,17 @@ PortalDrop(Portal portal, bool isTopCommit)
/* drop cached plan reference, if any */
PortalReleaseCachedPlan(portal);
+#ifdef XCP
+ /*
+ * Skip memory release if the portal is still producing, meaning it has tuples
+ * in local memory that it must push to consumers. It would lose those
+ * tuples if the memory were freed now.
+ * The cleanup should be completed if the portal finished producing.
+ */
+ if (portalIsProducing(portal))
+ return;
+#endif
+
/*
* Release any resources still attached to the portal. There are several
* cases being covered here:
@@ -843,16 +905,16 @@ AtCleanup_Portals(void)
if (portal->portalPinned)
portal->portalPinned = false;
-#ifdef PGXC
+#ifdef PGXC
/* XXX This is a PostgreSQL bug (already reported on the list by
* Pavan). We comment out the assertion until the bug is fixed
* upstream.
- */
+ */
/* We had better not be calling any user-defined code here */
/* Assert(portal->cleanup == NULL); */
#endif
-
+
/* Zap it. */
PortalDrop(portal, false);
}
@@ -992,6 +1054,45 @@ AtSubCleanup_Portals(SubTransactionId mySubid)
}
}
+
+#ifdef XCP
+static List *producingPortals = NIL;
+
+List *
+getProducingPortals(void)
+{
+ return producingPortals;
+}
+
+
+void
+addProducingPortal(Portal portal)
+{
+ MemoryContext save_context;
+
+ save_context = MemoryContextSwitchTo(PortalMemory);
+
+ producingPortals = lappend(producingPortals, portal);
+
+ MemoryContextSwitchTo(save_context);
+}
+
+
+void
+removeProducingPortal(Portal portal)
+{
+ producingPortals = list_delete_ptr(producingPortals, portal);
+}
+
+
+bool
+portalIsProducing(Portal portal)
+{
+ return list_member_ptr(producingPortals, portal);
+}
+#endif
+
+
/* Find all available cursors */
Datum
pg_cursor(PG_FUNCTION_ARGS)
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 765ac4cef7..f43ffb8a97 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -87,6 +87,11 @@
* above. Nonetheless, with large workMem we can have many tapes.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -224,8 +229,12 @@ struct Tuplesortstate
MemoryContext sortcontext; /* memory context holding all sort data */
LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */
#ifdef PGXC
+#ifdef XCP
+ ResponseCombiner *combiner; /* tuple source, alternate to tapeset */
+#else
RemoteQueryState *combiner; /* tuple source, alternate to tapeset */
-#endif
+#endif /* XCP */
+#endif /* PGXC */
/*
* These function pointers decouple the routines that must know what kind
@@ -903,7 +912,11 @@ Tuplesortstate *
tuplesort_begin_merge(TupleDesc tupDesc,
int nkeys, AttrNumber *attNums,
Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags,
+#ifdef XCP
+ ResponseCombiner *combiner,
+#else
RemoteQueryState *combiner,
+#endif
int workMem)
{
Tuplesortstate *state = tuplesort_begin_common(workMem, false);
@@ -2958,23 +2971,101 @@ reversedirection_heap(Tuplesortstate *state)
}
#ifdef PGXC
+#ifdef XCP
+static unsigned int
+getlen_datanode(Tuplesortstate *state, int tapenum, bool eofOK)
+{
+ ResponseCombiner *combiner = state->combiner;
+ TupleTableSlot *dstslot = combiner->ss.ps.ps_ResultTupleSlot;
+ TupleTableSlot *slot;
+
+ combiner->current_conn = tapenum;
+ slot = FetchTuple(combiner);
+ if (TupIsNull(slot))
+ {
+ if (eofOK)
+ return 0;
+ else
+ elog(ERROR, "unexpected end of data");
+ }
+
+ if (slot != dstslot)
+ ExecCopySlot(dstslot, slot);
+
+ return 1;
+}
+
+static void
+readtup_datanode(Tuplesortstate *state, SortTuple *stup,
+ int tapenum, unsigned int len)
+{
+ TupleTableSlot *slot = state->combiner->ss.ps.ps_ResultTupleSlot;
+ MinimalTuple tuple;
+ HeapTupleData htup;
+
+ Assert(!TupIsNull(slot));
+
+ /* copy the tuple into sort storage */
+ tuple = ExecCopySlotMinimalTuple(slot);
+ stup->tuple = (void *) tuple;
+ USEMEM(state, GetMemoryChunkSpace(tuple));
+ /* set up first-column key value */
+ htup.t_len = tuple->t_len + MINIMAL_TUPLE_OFFSET;
+ htup.t_data = (HeapTupleHeader) ((char *) tuple - MINIMAL_TUPLE_OFFSET);
+ stup->datum1 = heap_getattr(&htup,
+ state->sortKeys[0].ssup_attno,
+ state->tupDesc,
+ &stup->isnull1);
+}
+#else
static unsigned int
getlen_datanode(Tuplesortstate *state, int tapenum, bool eofOK)
{
- RemoteQueryState *combiner = state->combiner;
- TupleTableSlot *temp_tts;
+ RemoteQueryState *combiner = state->combiner;
+ PGXCNodeHandle *conn = combiner->connections[tapenum];
+ /*
+ * If connection is active (potentially has data to read) we can get node
+ * number from the connection. If connection is not active (we have read all
+ * available data rows) and if we have buffered data from that connection
+ * the node number is stored in combiner->tapenodes[tapenum].
+ * If connection is inactive and no buffered data we have EOF condition
+ */
+ int nid;
+ unsigned int len = 0;
+ ListCell *lc;
+ ListCell *prev = NULL;
- if (combiner->rqs_tapedata)
- elog(ERROR, "wrong state of datanode tape");
+ /* May it ever happen ?! */
+ if (!conn && !combiner->tapenodes)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to fetch from data node cursor")));
+
+ nid = conn ? PGXCNodeGetNodeId(conn->nodeoid, PGXC_NODE_DATANODE) : combiner->tapenodes[tapenum];
- combiner->rqs_tapenum = tapenum;
- temp_tts = ExecProcNode((PlanState *)combiner);
- if (!TupIsNull(temp_tts))
+ if (nid < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Node id %d is incorrect", nid)));
+
+ /*
+ * If there are buffered rows iterate over them and get first from
+ * the requested tape
+ */
+ foreach (lc, combiner->rowBuffer)
{
- combiner->rqs_tapedata = temp_tts;
- return temp_tts->tts_dataLen;
+ RemoteDataRow dataRow = (RemoteDataRow) lfirst(lc);
+ if (dataRow->msgnode == nid)
+ {
+ combiner->currentRow = *dataRow;
+ combiner->rowBuffer = list_delete_cell(combiner->rowBuffer, lc, prev);
+ return dataRow->msglen;
+ }
+ prev = lc;
}
- else
+
+ /* Nothing is found in the buffer, check for EOF */
+ if (conn == NULL)
{
if (eofOK)
return 0;
@@ -2982,22 +3073,98 @@ getlen_datanode(Tuplesortstate *state, int tapenum, bool eofOK)
elog(ERROR, "unexpected end of data");
}
- /* Keep compiler happy */
- return 0;
+ /* Going to get data from connection, buffer if needed */
+ if (conn->state == DN_CONNECTION_STATE_QUERY && conn->combiner != combiner)
+ BufferConnection(conn);
+
+ /* Request more rows if needed */
+ if (conn->state == DN_CONNECTION_STATE_IDLE)
+ {
+ Assert(combiner->cursor);
+ if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to fetch from data node cursor")));
+ if (pgxc_node_send_sync(conn) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to fetch from data node cursor")));
+ conn->state = DN_CONNECTION_STATE_QUERY;
+ conn->combiner = combiner;
+ }
+ /* Read data from the connection until get a row or EOF */
+ for (;;)
+ {
+ switch (handle_response(conn, combiner))
+ {
+ case RESPONSE_SUSPENDED:
+ /* Send Execute to request next row */
+ Assert(combiner->cursor);
+ if (len)
+ return len;
+ if (pgxc_node_send_execute(conn, combiner->cursor, 1000) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to fetch from data node cursor")));
+ if (pgxc_node_send_sync(conn) != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Failed to fetch from data node cursor")));
+ conn->state = DN_CONNECTION_STATE_QUERY;
+ conn->combiner = combiner;
+ /* fallthru */
+ case RESPONSE_EOF:
+ /* receive more data */
+ if (pgxc_node_receive(1, &conn, NULL))
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg(conn->error)));
+ break;
+ case RESPONSE_COMPLETE:
+ /* EOF encountered, close the tape and report EOF */
+ if (combiner->cursor)
+ {
+ combiner->connections[tapenum] = NULL;
+ if (len)
+ return len;
+ }
+ if (eofOK)
+ return 0;
+ else
+ elog(ERROR, "unexpected end of data");
+ break;
+ case RESPONSE_DATAROW:
+ Assert(len == 0);
+ if (state->combiner->cursor)
+ {
+ /*
+ * We are fetching one row at a time when running EQP,
+ * so read the following PortalSuspended or ResponseComplete
+ * to leave the connection clean between the calls
+ */
+ len = state->combiner->currentRow.msglen;
+ break;
+ }
+ else
+ return state->combiner->currentRow.msglen;
+ default:
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("Unexpected response from the data nodes")));
+ }
+ }
}
static void
readtup_datanode(Tuplesortstate *state, SortTuple *stup,
int tapenum, unsigned int len)
{
- TupleTableSlot *slot = state->combiner->rqs_tapedata;
+ TupleTableSlot *slot = state->combiner->ss.ss_ScanTupleSlot;
MinimalTuple tuple;
HeapTupleData htup;
- Assert(!TupIsNull(slot));
- if (slot->tts_dataLen != len)
- elog(ERROR, "Expected a tuple with length %d but got one with length %d",
- len, slot->tts_dataLen);
+ FetchTuple(state->combiner, slot);
+
/* copy the tuple into sort storage */
tuple = ExecCopySlotMinimalTuple(slot);
stup->tuple = (void *) tuple;
@@ -3009,10 +3176,9 @@ readtup_datanode(Tuplesortstate *state, SortTuple *stup,
state->sortKeys[0].ssup_attno,
state->tupDesc,
&stup->isnull1);
- /* Reset the buffer for next read */
- state->combiner->rqs_tapedata = NULL;
}
-#endif
+#endif /* XCP */
+#endif /* PGXC */
/*
* Routines specialized for the CLUSTER case (HeapTuple data, with
diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c
index 8a7931b856..9f064144f0 100644
--- a/src/backend/utils/sort/tuplestore.c
+++ b/src/backend/utils/sort/tuplestore.c
@@ -43,6 +43,11 @@
* before switching to the other state or activating a different read pointer.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -72,6 +77,27 @@ typedef enum
TSS_READFILE /* Reading from temp file */
} TupStoreStatus;
+
+#ifdef XCP
+/*
+ * Supported tuplestore formats
+ */
+typedef enum
+{
+ TSF_MINIMAL, /* Minimal tuples */
+ TSF_DATAROW, /* Datarow tuples */
+ TSF_MESSAGE /* A Postgres protocol message data */
+} TupStoreFormat;
+
+
+typedef struct
+{
+ int32 msglen;
+ char *msg;
+} msg_data;
+#endif
+
+
/*
* State for a single read pointer. If we are in state INMEM then all the
* read pointers' "current" fields denote the read positions. In state
@@ -99,6 +125,9 @@ typedef struct
struct Tuplestorestate
{
TupStoreStatus status; /* enumerated value as shown above */
+#ifdef XCP
+ TupStoreFormat format; /* enumerated value as shown above */
+#endif
int eflags; /* capability flags (OR of pointers' flags) */
bool backward; /* store extra length words in file? */
bool interXact; /* keep open through transactions? */
@@ -106,6 +135,9 @@ struct Tuplestorestate
long availMem; /* remaining memory available, in bytes */
BufFile *myfile; /* underlying file, or NULL if none */
MemoryContext context; /* memory context for holding tuples */
+#ifdef XCP
+ MemoryContext tmpcxt; /* memory context for holding temporary data */
+#endif
ResourceOwner resowner; /* resowner for holding temp files */
/*
@@ -171,6 +203,12 @@ struct Tuplestorestate
int writepos_file; /* file# (valid if READFILE state) */
off_t writepos_offset; /* offset (valid if READFILE state) */
+
+ char *stat_name;
+ long stat_read_count;
+ long stat_write_count;
+ long stat_spill_read;
+ long stat_spill_write;
};
#define COPYTUP(state,tup) ((*(state)->copytup) (state, tup))
@@ -235,7 +273,14 @@ static unsigned int getlen(Tuplestorestate *state, bool eofOK);
static void *copytup_heap(Tuplestorestate *state, void *tup);
static void writetup_heap(Tuplestorestate *state, void *tup);
static void *readtup_heap(Tuplestorestate *state, unsigned int len);
-
+#ifdef XCP
+static void *copytup_datarow(Tuplestorestate *state, void *tup);
+static void writetup_datarow(Tuplestorestate *state, void *tup);
+static void *readtup_datarow(Tuplestorestate *state, unsigned int len);
+static void *copytup_message(Tuplestorestate *state, void *tup);
+static void writetup_message(Tuplestorestate *state, void *tup);
+static void *readtup_message(Tuplestorestate *state, unsigned int len);
+#endif
/*
* tuplestore_begin_xxx
@@ -275,6 +320,12 @@ tuplestore_begin_common(int eflags, bool interXact, int maxKBytes)
state->readptrs[0].eof_reached = false;
state->readptrs[0].current = 0;
+ state->stat_name = NULL;
+ state->stat_write_count = 0;
+ state->stat_read_count = 0;
+ state->stat_spill_write = 0;
+ state->stat_spill_read = 0;
+
return state;
}
@@ -313,9 +364,15 @@ tuplestore_begin_heap(bool randomAccess, bool interXact, int maxKBytes)
state = tuplestore_begin_common(eflags, interXact, maxKBytes);
+#ifdef XCP
+ state->format = TSF_MINIMAL;
+#endif
state->copytup = copytup_heap;
state->writetup = writetup_heap;
state->readtup = readtup_heap;
+#ifdef XCP
+ state->tmpcxt = NULL;
+#endif
return state;
}
@@ -436,6 +493,16 @@ tuplestore_end(Tuplestorestate *state)
{
int i;
+ if (state->stat_name)
+ {
+ elog(LOG, "Tuplestore %s did %ld writes and %ld reads, "
+ "it spilled to disk after %ld writes and %ld reads, "
+ "now deleted %d memtuples out of %d", state->stat_name,
+ state->stat_write_count, state->stat_read_count,
+ state->stat_spill_write, state->stat_spill_read,
+ state->memtupdeleted, state->memtupcount);
+ }
+
if (state->myfile)
BufFileClose(state->myfile);
if (state->memtuples)
@@ -548,6 +615,10 @@ tuplestore_puttupleslot(Tuplestorestate *state,
MinimalTuple tuple;
MemoryContext oldcxt = MemoryContextSwitchTo(state->context);
+#ifdef XCP
+ if (state->format == TSF_MINIMAL)
+ {
+#endif
/*
* Form a MinimalTuple in working memory
*/
@@ -555,6 +626,20 @@ tuplestore_puttupleslot(Tuplestorestate *state,
USEMEM(state, GetMemoryChunkSpace(tuple));
tuplestore_puttuple_common(state, (void *) tuple);
+#ifdef XCP
+ }
+ else if (state->format == TSF_DATAROW)
+ {
+ RemoteDataRow tuple = ExecCopySlotDatarow(slot, state->tmpcxt);
+ USEMEM(state, GetMemoryChunkSpace(tuple));
+
+ tuplestore_puttuple_common(state, (void *) tuple);
+ }
+ else
+ {
+ elog(ERROR, "Unsupported datastore format");
+ }
+#endif
MemoryContextSwitchTo(oldcxt);
}
@@ -568,6 +653,10 @@ tuplestore_puttuple(Tuplestorestate *state, HeapTuple tuple)
{
MemoryContext oldcxt = MemoryContextSwitchTo(state->context);
+#ifdef XCP
+ Assert(state->format == TSF_MINIMAL);
+#endif
+
/*
* Copy the tuple. (Must do this even in WRITEFILE case. Note that
* COPYTUP includes USEMEM, so we needn't do that here.)
@@ -590,6 +679,10 @@ tuplestore_putvalues(Tuplestorestate *state, TupleDesc tdesc,
MinimalTuple tuple;
MemoryContext oldcxt = MemoryContextSwitchTo(state->context);
+#ifdef XCP
+ Assert(state->format == TSF_MINIMAL);
+#endif
+
tuple = heap_form_minimal_tuple(tdesc, values, isnull);
USEMEM(state, GetMemoryChunkSpace(tuple));
@@ -605,6 +698,9 @@ tuplestore_puttuple_common(Tuplestorestate *state, void *tuple)
int i;
ResourceOwner oldowner;
+ if (state->stat_name)
+ state->stat_write_count++;
+
switch (state->status)
{
case TSS_INMEM:
@@ -655,6 +751,12 @@ tuplestore_puttuple_common(Tuplestorestate *state, void *tuple)
if (state->memtupcount < state->memtupsize && !LACKMEM(state))
return;
+ if (state->stat_name)
+ {
+ state->stat_spill_read = state->stat_read_count;
+ state->stat_spill_write = state->stat_write_count;
+ }
+
/*
* Nope; time to switch to tape-based operation. Make sure that
* the temp file(s) are created in suitable temp tablespaces.
@@ -764,6 +866,9 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward,
return NULL;
if (readptr->current < state->memtupcount)
{
+ if (state->stat_name)
+ state->stat_read_count++;
+
/* We have another tuple, so return it */
return state->memtuples[readptr->current++];
}
@@ -795,6 +900,9 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward,
Assert(!state->truncated);
return NULL;
}
+ if (state->stat_name)
+ state->stat_read_count++;
+
return state->memtuples[readptr->current - 1];
}
break;
@@ -824,6 +932,9 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward,
if ((tuplen = getlen(state, true)) != 0)
{
tup = READTUP(state, tuplen);
+ if (state->stat_name && tup)
+ state->stat_read_count++;
+
return tup;
}
else
@@ -892,6 +1003,9 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward,
SEEK_CUR) != 0)
elog(ERROR, "bogus tuple length in backward scan");
tup = READTUP(state, tuplen);
+ if (state->stat_name && tup)
+ state->stat_read_count++;
+
return tup;
default:
@@ -924,12 +1038,37 @@ tuplestore_gettupleslot(Tuplestorestate *state, bool forward,
if (tuple)
{
+#ifdef XCP
+ if (state->format == TSF_MINIMAL)
+ {
+#endif
if (copy && !should_free)
{
tuple = heap_copy_minimal_tuple(tuple);
should_free = true;
}
ExecStoreMinimalTuple(tuple, slot, should_free);
+#ifdef XCP
+ }
+ else if (state->format == TSF_DATAROW)
+ {
+ RemoteDataRow datarow = (RemoteDataRow) tuple;
+ if (copy && !should_free)
+ {
+ RemoteDataRow dup = (RemoteDataRow) palloc(sizeof(RemoteDataRowData) + datarow->msglen);
+ dup->msgnode = datarow->msgnode;
+ dup->msglen = datarow->msglen;
+ memcpy(dup->msg, datarow->msg, datarow->msglen);
+ datarow = dup;
+ should_free = true;
+ }
+ ExecStoreDataRowTuple(datarow, slot, should_free);
+ }
+ else
+ {
+ elog(ERROR, "Unsupported datastore format");
+ }
+#endif
return true;
}
else
@@ -1311,3 +1450,218 @@ readtup_heap(Tuplestorestate *state, unsigned int len)
elog(ERROR, "unexpected end of data");
return (void *) tuple;
}
+
+
+#ifdef XCP
+/*
+ * Routines to support Datarow tuple format, used for exchange between nodes
+ * as well as send data to client
+ */
+Tuplestorestate *
+tuplestore_begin_datarow(bool interXact, int maxKBytes,
+ MemoryContext tmpcxt)
+{
+ Tuplestorestate *state;
+
+ state = tuplestore_begin_common(0, interXact, maxKBytes);
+
+ state->format = TSF_DATAROW;
+ state->copytup = copytup_datarow;
+ state->writetup = writetup_datarow;
+ state->readtup = readtup_datarow;
+ state->tmpcxt = tmpcxt;
+
+ return state;
+}
+
+
+/*
+ * Do we need this at all?
+ */
+static void *
+copytup_datarow(Tuplestorestate *state, void *tup)
+{
+ Assert(false);
+ return NULL;
+}
+
+static void
+writetup_datarow(Tuplestorestate *state, void *tup)
+{
+ RemoteDataRow tuple = (RemoteDataRow) tup;
+
+ /* the part of the MinimalTuple we'll write: */
+ char *tupbody = tuple->msg;
+ unsigned int tupbodylen = tuple->msglen;
+
+ /* total on-disk footprint: */
+ unsigned int tuplen = tupbodylen + sizeof(int) + sizeof(tuple->msgnode);
+
+ if (BufFileWrite(state->myfile, (void *) &tuplen,
+ sizeof(int)) != sizeof(int))
+ elog(ERROR, "write failed");
+ if (BufFileWrite(state->myfile, (void *) &tuple->msgnode,
+ sizeof(tuple->msgnode)) != sizeof(tuple->msgnode))
+ elog(ERROR, "write failed");
+ if (BufFileWrite(state->myfile, (void *) tupbody,
+ tupbodylen) != (size_t) tupbodylen)
+ elog(ERROR, "write failed");
+ if (state->backward) /* need trailing length word? */
+ if (BufFileWrite(state->myfile, (void *) &tuplen,
+ sizeof(tuplen)) != sizeof(tuplen))
+ elog(ERROR, "write failed");
+
+ FREEMEM(state, GetMemoryChunkSpace(tuple));
+ pfree(tuple);
+}
+
+static void *
+readtup_datarow(Tuplestorestate *state, unsigned int len)
+{
+ RemoteDataRow tuple = (RemoteDataRow) palloc(len);
+ unsigned int tupbodylen = len - sizeof(int) - sizeof(tuple->msgnode);
+
+ USEMEM(state, GetMemoryChunkSpace(tuple));
+ /* read in the tuple proper */
+ tuple->msglen = tupbodylen;
+ if (BufFileRead(state->myfile, (void *) &tuple->msgnode,
+ sizeof(tuple->msgnode)) != sizeof(tuple->msgnode))
+ elog(ERROR, "unexpected end of data");
+ if (BufFileRead(state->myfile, (void *) tuple->msg,
+ tupbodylen) != (size_t) tupbodylen)
+ elog(ERROR, "unexpected end of data");
+ if (state->backward) /* need trailing length word? */
+ if (BufFileRead(state->myfile, (void *) &len,
+ sizeof(len)) != sizeof(len))
+ elog(ERROR, "unexpected end of data");
+ return (void *) tuple;
+}
+
+
+/*
+ * Routines to support storage of protocol message data
+ */
+Tuplestorestate *
+tuplestore_begin_message(bool interXact, int maxKBytes)
+{
+ Tuplestorestate *state;
+
+ state = tuplestore_begin_common(0, interXact, maxKBytes);
+
+ state->format = TSF_MESSAGE;
+ state->copytup = copytup_message;
+ state->writetup = writetup_message;
+ state->readtup = readtup_message;
+ state->tmpcxt = NULL;
+
+ return state;
+}
+
+
+void
+tuplestore_putmessage(Tuplestorestate *state, int len, char* msg)
+{
+ msg_data m;
+ void *tuple;
+ MemoryContext oldcxt = MemoryContextSwitchTo(state->context);
+
+ Assert(state->format == TSF_MESSAGE);
+
+ m.msglen = len;
+ m.msg = msg;
+
+ tuple = COPYTUP(state, &m);
+ tuplestore_puttuple_common(state, tuple);
+
+ MemoryContextSwitchTo(oldcxt);
+}
+
+
+char *
+tuplestore_getmessage(Tuplestorestate *state, int *len)
+{
+ bool should_free;
+ void *result;
+ void *tuple = tuplestore_gettuple(state, true, &should_free);
+
+ Assert(state->format == TSF_MESSAGE);
+
+ /* done? */
+ if (!tuple)
+ return NULL;
+
+ *len = *((int *) tuple);
+
+ result = palloc(*len);
+ memcpy(result, ((char *) tuple) + sizeof(int), *len);
+ if (should_free)
+ pfree(tuple);
+
+ return (char *) result;
+}
+
+
+static void *
+copytup_message(Tuplestorestate *state, void *tup)
+{
+ msg_data *m = (msg_data *) tup;
+ void *tuple;
+
+ tuple = palloc(m->msglen + sizeof(int));
+ *((int *) tuple) = m->msglen;
+ memcpy(((char *) tuple) + sizeof(int), m->msg, m->msglen);
+ USEMEM(state, GetMemoryChunkSpace(tuple));
+ return tuple;
+}
+
+
+static void
+writetup_message(Tuplestorestate *state, void *tup)
+{
+ int *msglen = (int *) tup;
+ /* total on-disk footprint: */
+ unsigned int tuplen = *msglen;
+
+ if (BufFileWrite(state->myfile, tup, tuplen) != tuplen)
+ elog(ERROR, "write failed");
+ if (state->backward) /* need trailing length word? */
+ if (BufFileWrite(state->myfile, (void *) &tuplen,
+ sizeof(tuplen)) != sizeof(tuplen))
+ elog(ERROR, "write failed");
+
+ FREEMEM(state, GetMemoryChunkSpace(tup));
+ pfree(tup);
+}
+
+static void *
+readtup_message(Tuplestorestate *state, unsigned int len)
+{
+ void *tuple = palloc(len + sizeof(int));
+ *((int *) tuple) = len;
+
+ USEMEM(state, GetMemoryChunkSpace(tuple));
+ /* read in the tuple proper */
+ if (BufFileRead(state->myfile, ((char *) tuple) + sizeof(int),
+ len) != (size_t) len)
+ elog(ERROR, "unexpected end of data");
+ if (state->backward) /* need trailing length word? */
+ if (BufFileRead(state->myfile, (void *) &len,
+ sizeof(len)) != sizeof(len))
+ elog(ERROR, "unexpected end of data");
+ return tuple;
+}
+#endif
+
+
+void
+tuplestore_collect_stat(Tuplestorestate *state, char *name)
+{
+ if (state->status != TSS_INMEM || state->memtupcount != 0)
+ {
+ elog(WARNING, "tuplestore %s is already in use, to late to get statistics",
+ name);
+ return;
+ }
+
+ state->stat_name = pstrdup(name);
+}
diff --git a/src/backend/utils/time/combocid.c b/src/backend/utils/time/combocid.c
index 5429922d3f..30182ceea6 100644
--- a/src/backend/utils/time/combocid.c
+++ b/src/backend/utils/time/combocid.c
@@ -30,6 +30,11 @@
* destroyed at the end of each transaction.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -270,6 +275,25 @@ GetComboCommandId(CommandId cmin, CommandId cmax)
static CommandId
GetRealCmin(CommandId combocid)
{
+#ifdef XCP
+ /*
+ * Workaround against assertion failure (or segmentation fault if
+ * assertions is disabled) in a secondary datanode session when trying
+ * to check visibility of a tuple with ComboCID.
+ * ComboCID is only valid in a session that did the update, that is the
+ * primary session.
+ * Ideally we should have a solution for sharing ComboCIDs between
+ * sessions; until then we just make tuples with ComboCIDs invisible to
+ * secondary processes. This means we will have visibility issues in rare cases,
+ * if in the same transaction:
+ * 1. Tuples inserted
+ * 2. Cursor is opened
+ * 3. Tuples inserted in step 1 are deleted
+ *
+ */
+ if (combocid >= usedComboCids)
+ return FirstCommandId - 1;
+#endif
Assert(combocid < usedComboCids);
return comboCids[combocid].cmin;
}
@@ -277,6 +301,19 @@ GetRealCmin(CommandId combocid)
static CommandId
GetRealCmax(CommandId combocid)
{
+#ifdef XCP
+ /*
+ * Ugly workaround against assertion failure (or segmentation fault if
+ * assertions is disabled) in a secondary datanode session when trying
+ * to check visibility of a tuple with ComboCID.
+ * ComboCID is only valid in a session that did the update, that is the
+ * primary session. Until we come up with a solution for sharing ComboCIDs
+ * between sessions, we just make tuples with ComboCIDs invisible to
+ * secondary processes.
+ */
+ if (combocid >= usedComboCids)
+ return FirstCommandId;
+#endif
Assert(combocid < usedComboCids);
return comboCids[combocid].cmax;
}
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 2899b94142..c55b947833 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -27,6 +27,11 @@
* for too long.)
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -205,7 +210,11 @@ GetTransactionSnapshot(void)
* The command id should therefore be updated in the
* current snapshot.
*/
+#ifdef XCP
+ if (IsConnFromCoord() || IsConnFromDatanode())
+#else
if (IsConnFromCoord())
+#endif
SnapshotSetCommandId(GetCurrentCommandId(false));
#endif
return CurrentSnapshot;
diff --git a/src/bin/Makefile b/src/bin/Makefile
index b02c3caca6..7498395022 100644
--- a/src/bin/Makefile
+++ b/src/bin/Makefile
@@ -13,7 +13,7 @@ subdir = src/bin
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
-SUBDIRS = gtm_ctl initdb initgtm pg_ctl pg_dump \
+SUBDIRS = initdb initgtm pg_ctl pg_dump \
psql scripts pg_config pg_controldata pg_resetxlog pg_basebackup
ifeq ($(PORTNAME), win32)
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 8ff3a0036c..75c58f760a 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -38,6 +38,11 @@
*
* This code is released under the terms of the PostgreSQL License.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
* Portions Copyright (c) 1994, Regents of the University of California
@@ -136,6 +141,9 @@ static char *conf_file;
static char *conversion_file;
static char *dictionary_file;
static char *info_schema_file;
+#ifdef XCP
+static char *storm_cat_file;
+#endif
static char *features_file;
static char *system_views_file;
static bool made_new_pgdata = false;
@@ -216,6 +224,9 @@ static void setup_dictionary(void);
static void setup_privileges(void);
static void set_info_version(void);
static void setup_schema(void);
+#ifdef XCP
+static void setup_storm(void);
+#endif
static void load_plpgsql(void);
static void vacuum_db(void);
static void make_template0(void);
@@ -1591,7 +1602,11 @@ setup_description(void)
PG_CMD_PRINTF1("COPY tmp_pg_shdescription FROM E'%s';\n",
escape_quotes(shdesc_file));
+#ifdef XCP
+ PG_CMD_PUTS("INSERT INTO pg_catalog.pg_shdescription "
+#else
PG_CMD_PUTS("INSERT INTO pg_shdescription "
+#endif
" SELECT t.objoid, c.oid, t.description "
" FROM tmp_pg_shdescription t, pg_class c "
" WHERE c.relname = t.classname;\n");
@@ -1890,6 +1905,9 @@ setup_privileges(void)
" WHERE relkind IN ('r', 'v', 'S') AND relacl IS NULL;\n",
"GRANT USAGE ON SCHEMA pg_catalog TO PUBLIC;\n",
"GRANT CREATE, USAGE ON SCHEMA public TO PUBLIC;\n",
+#ifdef XCP
+ "GRANT USAGE ON SCHEMA storm_catalog TO PUBLIC;\n",
+#endif
"REVOKE ALL ON pg_largeobject FROM PUBLIC;\n",
NULL
};
@@ -2000,6 +2018,46 @@ setup_schema(void)
check_ok();
}
+#ifdef XCP
+/*
+ * load storm catalog and populate from features file
+ */
+static void
+setup_storm(void)
+{
+ PG_CMD_DECL;
+ char **line;
+ char **lines;
+
+ fputs(_("creating storm catalog... "), stdout);
+ fflush(stdout);
+
+ lines = readfile(storm_cat_file);
+
+ /*
+ * We use -j here to avoid backslashing stuff in storm_catalog.sql
+ */
+ snprintf(cmd, sizeof(cmd),
+ "\"%s\" %s -j template1 >%s",
+ backend_exec, backend_options,
+ DEVNULL);
+
+ PG_CMD_OPEN;
+
+ for (line = lines; *line != NULL; line++)
+ {
+ PG_CMD_PUTS(*line);
+ free(*line);
+ }
+
+ free(lines);
+
+ PG_CMD_CLOSE;
+
+ check_ok();
+}
+#endif
+
/*
* load PL/pgsql server-side language
*/
@@ -2090,7 +2148,11 @@ make_template0(void)
const char **line;
static const char *template0_setup[] = {
"CREATE DATABASE template0;\n",
+#ifdef XCP
+ "UPDATE pg_catalog.pg_database SET "
+#else
"UPDATE pg_database SET "
+#endif
" datistemplate = 't', "
" datallowconn = 'f' "
" WHERE datname = 'template0';\n",
@@ -2098,8 +2160,13 @@ make_template0(void)
/*
* We use the OID of template0 to determine lastsysoid
*/
+#ifdef XCP
+ "UPDATE pg_catalog.pg_database SET datlastsysoid = "
+ " (SELECT oid FROM pg_catalog.pg_database "
+#else
"UPDATE pg_database SET datlastsysoid = "
" (SELECT oid FROM pg_database "
+#endif
" WHERE datname = 'template0');\n",
/*
@@ -2115,7 +2182,11 @@ make_template0(void)
/*
* Finally vacuum to clean up dead rows in pg_database
*/
+#ifdef XCP
+ "VACUUM FULL pg_catalog.pg_database;\n",
+#else
"VACUUM FULL pg_database;\n",
+#endif
NULL
};
@@ -2593,8 +2664,12 @@ usage(const char *progname)
printf(_(" --auth-local=METHOD default authentication method for local-socket connections\n"));
printf(_(" [-D, --pgdata=]DATADIR location for this database cluster\n"));
#ifdef PGXC
+#ifdef XCP
+ printf(_(" --nodename=NODENAME name of Postgres-XL node initialized\n"));
+#else
printf(_(" --nodename=NODENAME name of Postgres-XC node initialized\n"));
#endif
+#endif
printf(_(" -E, --encoding=ENCODING set default encoding for new databases\n"));
printf(_(" --locale=LOCALE set default locale for new databases\n"));
printf(_(" --lc-collate=, --lc-ctype=, --lc-messages=LOCALE\n"
@@ -2877,7 +2952,11 @@ main(int argc, char *argv[])
#ifdef PGXC
if (!nodename)
{
+#ifdef XCP
+ fprintf(stderr, _("%s: Postgres-XL node name is mandatory\n"), progname);
+#else
fprintf(stderr, _("%s: Postgres-XC node name is mandatory\n"), progname);
+#endif
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
progname);
exit(1);
@@ -3025,6 +3104,9 @@ main(int argc, char *argv[])
set_input(&conversion_file, "conversion_create.sql");
set_input(&dictionary_file, "snowball_create.sql");
set_input(&info_schema_file, "information_schema.sql");
+#ifdef XCP
+ set_input(&storm_cat_file, "storm_catalog.sql");
+#endif
set_input(&features_file, "sql_features.txt");
set_input(&system_views_file, "system_views.sql");
@@ -3058,6 +3140,9 @@ main(int argc, char *argv[])
check_input(conversion_file);
check_input(dictionary_file);
check_input(info_schema_file);
+#ifdef XCP
+ check_input(storm_cat_file);
+#endif
check_input(features_file);
check_input(system_views_file);
@@ -3402,6 +3487,10 @@ main(int argc, char *argv[])
load_plpgsql();
+#ifdef XCP
+ setup_storm();
+#endif
+
vacuum_db();
make_template0();
@@ -3423,11 +3512,19 @@ main(int argc, char *argv[])
#ifdef PGXC
+#ifdef XCP
+ printf(_("\nSuccess.\n You can now start the database server of the Postgres-XL coordinator using:\n\n"
+#else
printf(_("\nSuccess.\n You can now start the database server of the Postgres-XC coordinator using:\n\n"
+#endif
" %s%s%spostgres%s --coordinator -D %s%s%s\n"
"or\n"
" %s%s%spg_ctl%s start -D %s%s%s -Z coordinator -l logfile\n\n"
+#ifdef XCP
+ " You can now start the database server of the Postgres-XL datanode using:\n\n"
+#else
" You can now start the database server of the Postgres-XC datanode using:\n\n"
+#endif
" %s%s%spostgres%s --datanode -D %s%s%s\n"
"or \n"
" %s%s%spg_ctl%s start -D %s%s%s -Z datanode -l logfile\n\n"),
diff --git a/src/bin/initgtm/initgtm.c b/src/bin/initgtm/initgtm.c
index 57856e0f2e..d779fff6b9 100644
--- a/src/bin/initgtm/initgtm.c
+++ b/src/bin/initgtm/initgtm.c
@@ -772,7 +772,11 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo)
static void
usage(const char *progname)
{
+#ifdef XCP
+ printf(_("%s initializes GTM for a Postgres-XL database cluster.\n\n"), progname);
+#else
printf(_("%s initializes a GTM for Postgres-XC database cluster.\n\n"), progname);
+#endif
printf(_("Usage:\n"));
printf(_(" %s [NODE-TYPE] [OPTION]... [DATADIR]\n"), progname);
printf(_("\nOptions:\n"));
@@ -823,7 +827,11 @@ main(int argc, char *argv[])
}
if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
{
+#ifdef XCP
+ puts("initgtm (Postgres-XL) " PGXC_VERSION);
+#else
puts("initgtm (Postgres-XC) " PGXC_VERSION);
+#endif
exit(0);
}
}
diff --git a/src/bin/pg_basebackup/streamutil.c b/src/bin/pg_basebackup/streamutil.c
index 1b4a9d240b..e5b3ee06c2 100644
--- a/src/bin/pg_basebackup/streamutil.c
+++ b/src/bin/pg_basebackup/streamutil.c
@@ -154,7 +154,7 @@ GetConnection(void)
if (PQstatus(tmpconn) != CONNECTION_OK)
{
- fprintf(stderr, _("%s: could not connect to server: %s\n"),
+ fprintf(stderr, _("%s: could not connect to server: %s"),
progname, PQerrorMessage(tmpconn));
return NULL;
}
diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c
index 0eb8084053..cf3d0e1d28 100644
--- a/src/bin/pg_ctl/pg_ctl.c
+++ b/src/bin/pg_ctl/pg_ctl.c
@@ -61,8 +61,8 @@ typedef enum
NO_COMMAND = 0,
INIT_COMMAND,
START_COMMAND,
- RESTART_COMMAND,
STOP_COMMAND,
+ RESTART_COMMAND,
RELOAD_COMMAND,
STATUS_COMMAND,
PROMOTE_COMMAND,
@@ -1791,11 +1791,15 @@ do_help(void)
printf(_(" -t, --timeout=SECS seconds to wait when using -w option\n"));
printf(_(" -w wait until operation completes\n"));
printf(_(" -W do not wait until operation completes\n"));
+ printf(_(" --help show this help, then exit\n"));
+ printf(_(" --version output version information, then exit\n"));
#ifdef PGXC
+#ifdef XCP
+ printf(_(" -Z NODE-TYPE can be \"coordinator\" or \"datanode\" (Postgres-XL)\n"));
+#else
printf(_(" -Z NODE-TYPE can be \"coordinator\" or \"datanode\" (Postgres-XC)\n"));
#endif
- printf(_(" --help show this help, then exit\n"));
- printf(_(" --version output version information, then exit\n"));
+#endif
printf(_("(The default is to wait for shutdown, but not for start or restart.)\n\n"));
printf(_("If the -D option is omitted, the environment variable PGDATA is used.\n"));
@@ -2110,6 +2114,8 @@ main(int argc, char **argv)
pgxcCommand = strdup("--coordinator");
else if (strcmp(optarg, "datanode") == 0)
pgxcCommand = strdup("--datanode");
+ else if (strcmp(optarg, "restoremode") == 0)
+ pgxcCommand = strdup("--restoremode");
#endif
case 's':
silent_mode = true;
@@ -2298,12 +2304,12 @@ main(int argc, char **argv)
case START_COMMAND:
do_start();
break;
- case RESTART_COMMAND:
- do_restart();
- break;
case STOP_COMMAND:
do_stop();
break;
+ case RESTART_COMMAND:
+ do_restart();
+ break;
case RELOAD_COMMAND:
do_reload();
break;
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 539bcb9167..0637563a31 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -4,6 +4,11 @@
* pg_dump is a utility for dumping out a postgres database
* into a script file.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -137,7 +142,9 @@ static int column_inserts = 0;
static int no_security_labels = 0;
static int no_unlogged_table_data = 0;
static int serializable_deferrable = 0;
-
+#ifdef PGXC
+static int include_nodes = 0;
+#endif
static void help(const char *progname);
static void setup_connection(Archive *AH, const char *dumpencoding,
@@ -190,6 +197,7 @@ static void dumpTable(Archive *fout, TableInfo *tbinfo);
static void dumpTableSchema(Archive *fout, TableInfo *tbinfo);
static void dumpAttrDef(Archive *fout, AttrDefInfo *adinfo);
static void dumpSequence(Archive *fout, TableInfo *tbinfo);
+static void dumpSequenceData(Archive *fout, TableDataInfo *tdinfo);
static void dumpIndex(Archive *fout, IndxInfo *indxinfo);
static void dumpConstraint(Archive *fout, ConstraintInfo *coninfo);
static void dumpTableConstraintComment(Archive *fout, ConstraintInfo *coninfo);
@@ -340,6 +348,9 @@ main(int argc, char **argv)
{"use-set-session-authorization", no_argument, &use_setsessauth, 1},
{"no-security-labels", no_argument, &no_security_labels, 1},
{"no-unlogged-table-data", no_argument, &no_unlogged_table_data, 1},
+#ifdef PGXC
+ {"include-nodes", no_argument, &include_nodes, 1},
+#endif
{NULL, 0, NULL, 0}
};
@@ -816,6 +827,9 @@ help(const char *progname)
printf(_(" --use-set-session-authorization\n"
" use SET SESSION AUTHORIZATION commands instead of\n"
" ALTER OWNER commands to set ownership\n"));
+#ifdef PGXC
+ printf(_(" --include-nodes include TO NODE clause in the dumped CREATE TABLE commands\n"));
+#endif
printf(_("\nConnection options:\n"));
printf(_(" -h, --host=HOSTNAME database server host or socket directory\n"));
@@ -1049,6 +1063,9 @@ selectDumpableNamespace(NamespaceInfo *nsinfo)
nsinfo->dobj.dump = simple_oid_list_member(&schema_include_oids,
nsinfo->dobj.catId.oid);
else if (strncmp(nsinfo->dobj.name, "pg_", 3) == 0 ||
+#ifdef XCP
+ strncmp(nsinfo->dobj.name, "storm_", 6) == 0 ||
+#endif
strcmp(nsinfo->dobj.name, "information_schema") == 0)
nsinfo->dobj.dump = false;
else
@@ -1808,6 +1825,23 @@ dumpDatabase(Archive *fout)
selectSourceSchema(fout, "pg_catalog");
/* Get the database owner and parameters from pg_database */
+#ifdef XCP
+ if (fout->remoteVersion >= 90100)
+ {
+ appendPQExpBuffer(dbQry, "SELECT 1262::oid as tableoid, oid, "
+ "(%s datdba) AS dba, "
+ "pg_encoding_to_char(encoding) AS encoding, "
+ "datcollate, datctype, datfrozenxid, "
+ "(SELECT spcname FROM pg_tablespace t WHERE t.oid = dattablespace) AS tablespace, "
+ "shobj_description(oid, 'pg_database') AS description "
+
+ "FROM pg_database "
+ "WHERE datname = ",
+ username_subquery);
+ appendStringLiteralAH(dbQry, datname, fout);
+ }
+ else
+#endif
if (fout->remoteVersion >= 80400)
{
appendPQExpBuffer(dbQry, "SELECT tableoid, oid, "
@@ -3832,6 +3866,7 @@ getTables(Archive *fout, int *numTables)
#ifdef PGXC
int i_pgxclocatortype;
int i_pgxcattnum;
+ int i_pgxc_node_names;
#endif
int i_reltablespace;
int i_reloptions;
@@ -3883,6 +3918,7 @@ getTables(Archive *fout, int *numTables)
#ifdef PGXC
"(SELECT pclocatortype from pgxc_class v where v.pcrelid = c.oid) AS pgxclocatortype,"
"(SELECT pcattnum from pgxc_class v where v.pcrelid = c.oid) AS pgxcattnum,"
+ "(SELECT string_agg(node_name,',') AS pgxc_node_names from pgxc_node n where n.oid in (select unnest(nodeoids) from pgxc_class v where v.pcrelid=c.oid) ) , "
#endif
"array_to_string(c.reloptions, ', ') AS reloptions, "
"array_to_string(array(SELECT 'toast.' || x FROM unnest(tc.reloptions) x), ', ') AS toast_reloptions "
@@ -3941,8 +3977,6 @@ getTables(Archive *fout, int *numTables)
/*
* Left join to pick up dependency info linking sequences to their
* owning column, if any (note this dependency is AUTO as of 8.2)
- * PGXC is based on PostgreSQL version 8.4, it is not necessary to
- * to modify the other SQL queries.
*/
appendPQExpBuffer(query,
"SELECT c.tableoid, c.oid, c.relname, "
@@ -3957,7 +3991,7 @@ getTables(Archive *fout, int *numTables)
"d.refobjid AS owning_tab, "
"d.refobjsubid AS owning_col, "
"(SELECT spcname FROM pg_tablespace t WHERE t.oid = c.reltablespace) AS reltablespace, "
- "array_to_string(c.reloptions, ', ') AS reloptions, "
+ "array_to_string(c.reloptions, ', ') AS reloptions, "
"array_to_string(array(SELECT 'toast.' || x FROM unnest(tc.reloptions) x), ', ') AS toast_reloptions "
"FROM pg_class c "
"LEFT JOIN pg_depend d ON "
@@ -4204,6 +4238,7 @@ getTables(Archive *fout, int *numTables)
#ifdef PGXC
i_pgxclocatortype = PQfnumber(res, "pgxclocatortype");
i_pgxcattnum = PQfnumber(res, "pgxcattnum");
+ i_pgxc_node_names = PQfnumber(res, "pgxc_node_names");
#endif
i_reltablespace = PQfnumber(res, "reltablespace");
i_reloptions = PQfnumber(res, "reloptions");
@@ -4274,6 +4309,7 @@ getTables(Archive *fout, int *numTables)
tblinfo[i].pgxclocatortype = *(PQgetvalue(res, i, i_pgxclocatortype));
tblinfo[i].pgxcattnum = atoi(PQgetvalue(res, i, i_pgxcattnum));
}
+ tblinfo[i].pgxc_node_names = pg_strdup(PQgetvalue(res, i, i_pgxc_node_names));
#endif
tblinfo[i].reltablespace = pg_strdup(PQgetvalue(res, i, i_reltablespace));
tblinfo[i].reloptions = pg_strdup(PQgetvalue(res, i, i_reloptions));
@@ -7174,7 +7210,10 @@ dumpDumpableObject(Archive *fout, DumpableObject *dobj)
dumpCast(fout, (CastInfo *) dobj);
break;
case DO_TABLE_DATA:
- dumpTableData(fout, (TableDataInfo *) dobj);
+ if (((TableDataInfo *) dobj)->tdtable->relkind == RELKIND_SEQUENCE)
+ dumpSequenceData(fout, (TableDataInfo *) dobj);
+ else
+ dumpTableData(fout, (TableDataInfo *) dobj);
break;
case DO_DUMMY_TYPE:
/* table rowtypes and array types are never dumped separately */
@@ -12489,6 +12528,12 @@ dumpTableSchema(Archive *fout, TableInfo *tbinfo)
fmtId(tbinfo->attnames[hashkey - 1]));
}
}
+ if (include_nodes &&
+ tbinfo->pgxc_node_names != NULL &&
+ tbinfo->pgxc_node_names[0] != '\0')
+ {
+ appendPQExpBuffer(q, "\nTO NODE (%s)", tbinfo->pgxc_node_names);
+ }
#endif
/* Dump generic options if any */
if (ftoptions && ftoptions[0])
@@ -13446,34 +13491,6 @@ dumpSequence(Archive *fout, TableInfo *tbinfo)
if (!schemaOnly)
{
-#ifdef PGXC
- /*
- * In Postgres-XC it is possible that the current value of a
- * sequence cached on each node is different as several sessions
- * might use the sequence on different nodes. So what we do here
- * to get a consistent dump is to get the next value of sequence.
- * This insures that sequence value is unique as nextval is directly
- * obtained from GTM.
- */
- resetPQExpBuffer(query);
- appendPQExpBuffer(query, "SELECT pg_catalog.nextval(");
- appendStringLiteralAH(query, fmtId(tbinfo->dobj.name), fout);
- appendPQExpBuffer(query, ");\n");
- res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK);
-
- if (PQntuples(res) != 1)
- {
- write_msg(NULL, ngettext("query to get nextval of sequence \"%s\" "
- "returned %d rows (expected 1)\n",
- "query to get nextval of sequence \"%s\" "
- "returned %d rows (expected 1)\n",
- PQntuples(res)),
- tbinfo->dobj.name, PQntuples(res));
- exit_nicely(1);
- }
-
- last = PQgetvalue(res, 0, 0);
-#endif
resetPQExpBuffer(query);
appendPQExpBuffer(query, "SELECT pg_catalog.setval(");
appendStringLiteralAH(query, fmtId(tbinfo->dobj.name), fout);
@@ -13498,6 +13515,88 @@ dumpSequence(Archive *fout, TableInfo *tbinfo)
destroyPQExpBuffer(labelq);
}
+/*
+ * dumpSequenceData
+ * write the data of one user-defined sequence
+ */
+static void
+dumpSequenceData(Archive *fout, TableDataInfo *tdinfo)
+{
+ TableInfo *tbinfo = tdinfo->tdtable;
+ PGresult *res;
+ char *last;
+ bool called;
+ PQExpBuffer query = createPQExpBuffer();
+
+ /* Make sure we are in proper schema */
+ selectSourceSchema(fout, tbinfo->dobj.namespace->dobj.name);
+
+ appendPQExpBuffer(query,
+ "SELECT last_value, is_called FROM %s",
+ fmtId(tbinfo->dobj.name));
+
+ res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK);
+
+ if (PQntuples(res) != 1)
+ {
+ write_msg(NULL, ngettext("query to get data of sequence \"%s\" returned %d row (expected 1)\n",
+ "query to get data of sequence \"%s\" returned %d rows (expected 1)\n",
+ PQntuples(res)),
+ tbinfo->dobj.name, PQntuples(res));
+ exit_nicely(1);
+ }
+
+ last = PQgetvalue(res, 0, 0);
+ called = (strcmp(PQgetvalue(res, 0, 1), "t") == 0);
+#ifdef PGXC
+ /*
+ * In Postgres-XC it is possible that the current value of a
+ * sequence cached on each node is different as several sessions
+ * might use the sequence on different nodes. So what we do here
+ * to get a consistent dump is to get the next value of sequence.
+ * This insures that sequence value is unique as nextval is directly
+ * obtained from GTM.
+ */
+ resetPQExpBuffer(query);
+ appendPQExpBuffer(query, "SELECT pg_catalog.nextval(");
+ appendStringLiteralAH(query, fmtId(tbinfo->dobj.name), fout);
+ appendPQExpBuffer(query, ");\n");
+ res = ExecuteSqlQuery(fout, query->data, PGRES_TUPLES_OK);
+
+ if (PQntuples(res) != 1)
+ {
+ write_msg(NULL, ngettext("query to get nextval of sequence \"%s\" "
+ "returned %d rows (expected 1)\n",
+ "query to get nextval of sequence \"%s\" "
+ "returned %d rows (expected 1)\n",
+ PQntuples(res)),
+ tbinfo->dobj.name, PQntuples(res));
+ exit_nicely(1);
+ }
+
+ last = PQgetvalue(res, 0, 0);
+#endif
+ resetPQExpBuffer(query);
+ appendPQExpBuffer(query, "SELECT pg_catalog.setval(");
+ appendStringLiteralAH(query, fmtId(tbinfo->dobj.name), fout);
+ appendPQExpBuffer(query, ", %s, %s);\n",
+ last, (called ? "true" : "false"));
+
+ ArchiveEntry(fout, nilCatalogId, createDumpId(),
+ tbinfo->dobj.name,
+ tbinfo->dobj.namespace->dobj.name,
+ NULL,
+ tbinfo->rolname,
+ false, "SEQUENCE SET", SECTION_DATA,
+ query->data, "", NULL,
+ &(tbinfo->dobj.dumpId), 1,
+ NULL, NULL);
+
+ PQclear(res);
+
+ destroyPQExpBuffer(query);
+}
+
static void
dumpTrigger(Archive *fout, TriggerInfo *tginfo)
{
diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h
index e52fb8319b..b48a32d12a 100644
--- a/src/bin/pg_dump/pg_dump.h
+++ b/src/bin/pg_dump/pg_dump.h
@@ -263,6 +263,7 @@ typedef struct _tableInfo
/* PGXC table locator Data */
char pgxclocatortype; /* Type of PGXC table locator */
int pgxcattnum; /* Number of the attribute the table is partitioned with */
+ char *pgxc_node_names; /* List of node names where this table is distributed */
#endif
/*
* These fields are computed only if we decide the table is interesting
diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c
index 053e5fd36a..c90de1b18d 100644
--- a/src/bin/pg_dump/pg_dumpall.c
+++ b/src/bin/pg_dump/pg_dumpall.c
@@ -59,6 +59,11 @@ static PGconn *connectDatabase(const char *dbname, const char *pghost, const cha
static PGresult *executeQuery(PGconn *conn, const char *query);
static void executeCommand(PGconn *conn, const char *query);
+#ifdef PGXC
+static void dumpNodes(PGconn *conn);
+static void dumpNodeGroups(PGconn *conn);
+#endif /* PGXC */
+
static char pg_dump_bin[MAXPGPATH];
static PQExpBuffer pgdumpopts;
static bool skip_acls = false;
@@ -78,6 +83,10 @@ static int server_version;
static FILE *OPF;
static char *filename = NULL;
+#ifdef PGXC
+static int dump_nodes = 0;
+static int include_nodes = 0;
+#endif /* PGXC */
int
main(int argc, char *argv[])
@@ -138,7 +147,10 @@ main(int argc, char *argv[])
{"use-set-session-authorization", no_argument, &use_setsessauth, 1},
{"no-security-labels", no_argument, &no_security_labels, 1},
{"no-unlogged-table-data", no_argument, &no_unlogged_table_data, 1},
-
+#ifdef PGXC
+ {"dump-nodes", no_argument, &dump_nodes, 1},
+ {"include-nodes", no_argument, &include_nodes, 1},
+#endif
{NULL, 0, NULL, 0}
};
@@ -360,6 +372,11 @@ main(int argc, char *argv[])
if (no_unlogged_table_data)
appendPQExpBuffer(pgdumpopts, " --no-unlogged-table-data");
+#ifdef PGXC
+ if (include_nodes)
+ appendPQExpBuffer(pgdumpopts, " --include-nodes");
+#endif
+
/*
* If there was a database specified on the command line, use that,
* otherwise try to connect to database "postgres", and failing that
@@ -511,6 +528,15 @@ main(int argc, char *argv[])
if (server_version >= 90000)
dumpDbRoleConfig(conn);
}
+
+#ifdef PGXC
+ /* Dump nodes and node groups */
+ if (dump_nodes)
+ {
+ dumpNodes(conn);
+ dumpNodeGroups(conn);
+ }
+#endif
}
if (!globals_only && !roles_only && !tablespaces_only)
@@ -564,6 +590,10 @@ help(void)
printf(_(" --use-set-session-authorization\n"
" use SET SESSION AUTHORIZATION commands instead of\n"
" ALTER OWNER commands to set ownership\n"));
+#ifdef PGXC
+ printf(_(" --dump-nodes include nodes and node groups in the dump\n"));
+ printf(_(" --include-nodes include TO NODE clause in the dumped CREATE TABLE commands\n"));
+#endif
printf(_("\nConnection options:\n"));
printf(_(" -h, --host=HOSTNAME database server host or socket directory\n"));
@@ -1918,3 +1948,76 @@ doShellQuoting(PQExpBuffer buf, const char *str)
appendPQExpBufferChar(buf, '"');
#endif /* WIN32 */
}
+
+#ifdef PGXC
+static void
+dumpNodes(PGconn *conn)
+{
+ PQExpBuffer query;
+ PGresult *res;
+ int num;
+ int i;
+
+ query = createPQExpBuffer();
+
+ appendPQExpBuffer(query, "select 'CREATE NODE ' || node_name || '"
+ " WITH (TYPE = ' || chr(39) || (case when node_type='C'"
+ " then 'coordinator' else 'datanode' end) || chr(39)"
+ " || ' , HOST = ' || chr(39) || node_host || chr(39)"
+ " || ', PORT = ' || node_port || (case when nodeis_primary='t'"
+ " then ', PRIMARY' else ' ' end) || (case when nodeis_preferred"
+ " then ', PREFERRED' else ' ' end) || ');' "
+ " as node_query from pg_catalog.pgxc_node order by oid");
+
+ res = executeQuery(conn, query->data);
+
+ num = PQntuples(res);
+
+ if (num > 0)
+ fprintf(OPF, "--\n-- Nodes\n--\n\n");
+
+ for (i = 0; i < num; i++)
+ {
+ fprintf(OPF, "%s\n", PQgetvalue(res, i, PQfnumber(res, "node_query")));
+ }
+ fprintf(OPF, "\n");
+
+ PQclear(res);
+ destroyPQExpBuffer(query);
+}
+
+static void
+dumpNodeGroups(PGconn *conn)
+{
+ PQExpBuffer query;
+ PGresult *res;
+ int num;
+ int i;
+
+ query = createPQExpBuffer();
+
+ appendPQExpBuffer(query,
+ "select 'CREATE NODE GROUP ' || pgxc_group.group_name"
+ " || ' WITH(' || string_agg(node_name,',') || ');'"
+ " as group_query from pg_catalog.pgxc_node, pg_catalog.pgxc_group"
+ " where pgxc_node.oid = any (pgxc_group.group_members)"
+ " group by pgxc_group.group_name"
+ " order by pgxc_group.group_name");
+
+ res = executeQuery(conn, query->data);
+
+ num = PQntuples(res);
+
+ if (num > 0)
+ fprintf(OPF, "--\n-- Node groups\n--\n\n");
+
+ for (i = 0; i < num; i++)
+ {
+ fprintf(OPF, "%s\n", PQgetvalue(res, i, PQfnumber(res, "group_query")));
+ }
+ fprintf(OPF, "\n");
+
+ PQclear(res);
+ destroyPQExpBuffer(query);
+}
+#endif
diff --git a/src/bin/pg_resetxlog/po/sv.po b/src/bin/pg_resetxlog/po/sv.po
deleted file mode 100644
index 16e6e051c8..0000000000
--- a/src/bin/pg_resetxlog/po/sv.po
+++ /dev/null
@@ -1,463 +0,0 @@
-# Swedish message translation file for resetxlog.
-# Dennis Bj�rklund <[email protected]>, 2002, 2003, 2004, 2005, 2006.
-# Peter Eisentraut <[email protected]>, 2010.
-#
-msgid ""
-msgstr ""
-"Project-Id-Version: PostgreSQL 9.0\n"
-"Report-Msgid-Bugs-To: [email protected]\n"
-"POT-Creation-Date: 2010-07-02 05:22+0000\n"
-"PO-Revision-Date: 2010-07-02 20:32-0400\n"
-"Last-Translator: Peter Eisentraut <[email protected]>\n"
-"Language-Team: Swedish <[email protected]>\n"
-"MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=ISO-8859-1\n"
-"Content-Transfer-Encoding: 8bit\n"
-
-#: pg_resetxlog.c:135
-#, c-format
-msgid "%s: invalid argument for option -e\n"
-msgstr "%s: felaktigt argument till flagga -e\n"
-
-#: pg_resetxlog.c:136 pg_resetxlog.c:151 pg_resetxlog.c:166 pg_resetxlog.c:181
-#: pg_resetxlog.c:196 pg_resetxlog.c:211 pg_resetxlog.c:218 pg_resetxlog.c:225
-#: pg_resetxlog.c:231 pg_resetxlog.c:239
-#, c-format
-msgid "Try \"%s --help\" for more information.\n"
-msgstr "F�rs�k med \"%s --help\" f�r mer information.\n"
-
-#: pg_resetxlog.c:141
-#, c-format
-msgid "%s: transaction ID epoch (-e) must not be -1\n"
-msgstr "%s: transaktions-ID epoch (-e) f�r inte vara -1\n"
-
-#: pg_resetxlog.c:150
-#, c-format
-msgid "%s: invalid argument for option -x\n"
-msgstr "%s: ogiltigt argument till flaggan -x\n"
-
-#: pg_resetxlog.c:156
-#, c-format
-msgid "%s: transaction ID (-x) must not be 0\n"
-msgstr "%s: transaktions-ID (-x) f�r inte vara 0\n"
-
-#: pg_resetxlog.c:165
-#, c-format
-msgid "%s: invalid argument for option -o\n"
-msgstr "%s: ogiltigt argument till flaggan -o\n"
-
-#: pg_resetxlog.c:171
-#, c-format
-msgid "%s: OID (-o) must not be 0\n"
-msgstr "%s: OID (-o) f�r inte vara 0\n"
-
-#: pg_resetxlog.c:180
-#, c-format
-msgid "%s: invalid argument for option -m\n"
-msgstr "%s: ogiltigt argument till flaggan -m\n"
-
-#: pg_resetxlog.c:186
-#, c-format
-msgid "%s: multitransaction ID (-m) must not be 0\n"
-msgstr "%s: multitransaktions-ID (-m) f�r inte vara 0\n"
-
-#: pg_resetxlog.c:195
-#, c-format
-msgid "%s: invalid argument for option -O\n"
-msgstr "%s: ogiltigt argument till flaggan -O\n"
-
-#: pg_resetxlog.c:201
-#, c-format
-msgid "%s: multitransaction offset (-O) must not be -1\n"
-msgstr "%s: multitransaktionsoffset (-O) f�r inte vara -1\n"
-
-#: pg_resetxlog.c:210 pg_resetxlog.c:217 pg_resetxlog.c:224
-#, c-format
-msgid "%s: invalid argument for option -l\n"
-msgstr "%s: ogiltigt argument till flaggan -l\n"
-
-#: pg_resetxlog.c:238
-#, c-format
-msgid "%s: no data directory specified\n"
-msgstr "%s: ingen datakatalog angiven\n"
-
-#: pg_resetxlog.c:252
-#, c-format
-msgid "%s: cannot be executed by \"root\"\n"
-msgstr "%s: kan inte exekveras av \"root\"\n"
-
-#: pg_resetxlog.c:254
-#, c-format
-msgid "You must run %s as the PostgreSQL superuser.\n"
-msgstr "Du m�ste k�ra %s som PostgreSQLs superanv�ndare.\n"
-
-#: pg_resetxlog.c:264
-#, c-format
-msgid "%s: could not change directory to \"%s\": %s\n"
-msgstr "%s: kunde byta katalog till \"%s\": %s\n"
-
-#: pg_resetxlog.c:279 pg_resetxlog.c:407
-#, c-format
-msgid "%s: could not open file \"%s\" for reading: %s\n"
-msgstr "%s: kunde inte �ppna fil \"%s\" f�r l�sning: %s\n"
-
-#: pg_resetxlog.c:285
-#, c-format
-msgid ""
-"%s: lock file \"%s\" exists\n"
-"Is a server running? If not, delete the lock file and try again.\n"
-msgstr ""
-"%s: l�sfil \"%s\" existerar\n"
-"K�r servern redan? Om inte, radera l�sfilen och f�rs�k igen.\n"
-
-#: pg_resetxlog.c:355
-#, c-format
-msgid ""
-"\n"
-"If these values seem acceptable, use -f to force reset.\n"
-msgstr ""
-"\n"
-"Om dessa v�rden verkar acceptable, anv�nd -f f�r\n"
-"att forcera �terst�llande.\n"
-
-#: pg_resetxlog.c:367
-#, c-format
-msgid ""
-"The database server was not shut down cleanly.\n"
-"Resetting the transaction log might cause data to be lost.\n"
-"If you want to proceed anyway, use -f to force reset.\n"
-msgstr ""
-"Databasservern st�ngdes inte ner korrekt. Att �terst�lla\n"
-"transaktionsloggen kan medf�ra att data f�rloras.\n"
-"Om du vill forts�tta �nd�, anv�nd -f f�r att forcera\n"
-"�terst�llande.\n"
-
-#: pg_resetxlog.c:381
-#, c-format
-msgid "Transaction log reset\n"
-msgstr "�terst�llande fr�n transaktionslogg\n"
-
-#: pg_resetxlog.c:410
-#, c-format
-msgid ""
-"If you are sure the data directory path is correct, execute\n"
-" touch %s\n"
-"and try again.\n"
-msgstr ""
-"Om du �r s�ker p� att datakatalogs�kv�gen �r korrekt s� g�r\n"
-" touch %s\n"
-"och f�rs�k igen.\n"
-
-#: pg_resetxlog.c:423
-#, c-format
-msgid "%s: could not read file \"%s\": %s\n"
-msgstr "%s: kunde inte l�sa fil \"%s\": %s\n"
-
-#: pg_resetxlog.c:446
-#, c-format
-msgid "%s: pg_control exists but has invalid CRC; proceed with caution\n"
-msgstr ""
-"%s: pg_control existerar men har ogiltig CRC; forts�tt med f�rsiktighet\n"
-
-#: pg_resetxlog.c:455
-#, c-format
-msgid "%s: pg_control exists but is broken or unknown version; ignoring it\n"
-msgstr ""
-"%s: pg_control existerar men �r trasig eller har ok�nd version; ignorerar "
-"den\n"
-
-#: pg_resetxlog.c:549
-#, c-format
-msgid ""
-"Guessed pg_control values:\n"
-"\n"
-msgstr ""
-"Gissade pg_control-v�rden:\n"
-"\n"
-
-#: pg_resetxlog.c:551
-#, c-format
-msgid ""
-"pg_control values:\n"
-"\n"
-msgstr ""
-"pg_control-v�rden:\n"
-"\n"
-
-#: pg_resetxlog.c:560
-#, c-format
-msgid "First log file ID after reset: %u\n"
-msgstr "F�rsta loggfil efter nollst�llning: %u\n"
-
-#: pg_resetxlog.c:562
-#, c-format
-msgid "First log file segment after reset: %u\n"
-msgstr "F�rsta loggfilsegment efter nollst.: %u\n"
-
-#: pg_resetxlog.c:564
-#, c-format
-msgid "pg_control version number: %u\n"
-msgstr "pg_control versionsnummer: %u\n"
-
-#: pg_resetxlog.c:566
-#, c-format
-msgid "Catalog version number: %u\n"
-msgstr "Katalogversionsnummer: %u\n"
-
-#: pg_resetxlog.c:568
-#, c-format
-msgid "Database system identifier: %s\n"
-msgstr "Databasens systemidentifierare: %s\n"
-
-#: pg_resetxlog.c:570
-#, c-format
-msgid "Latest checkpoint's TimeLineID: %u\n"
-msgstr "Senaste kontrollpunktens TimeLineID: %u\n"
-
-#: pg_resetxlog.c:572
-#, c-format
-msgid "Latest checkpoint's NextXID: %u/%u\n"
-msgstr "Senaste kontrollpunktens NextXID: %u/%u\n"
-
-#: pg_resetxlog.c:575
-#, c-format
-msgid "Latest checkpoint's NextOID: %u\n"
-msgstr "Senaste kontrollpunktens NextOID: %u\n"
-
-# FIXME: too wide
-#: pg_resetxlog.c:577
-#, c-format
-msgid "Latest checkpoint's NextMultiXactId: %u\n"
-msgstr "Senaste kontrollpunktens NextMultiXactId: %u\n"
-
-#: pg_resetxlog.c:579
-#, c-format
-msgid "Latest checkpoint's NextMultiOffset: %u\n"
-msgstr "Senaste kontrollpunktens NextMultiOffset: %u\n"
-
-#: pg_resetxlog.c:581
-#, c-format
-msgid "Latest checkpoint's oldestXID: %u\n"
-msgstr "Senaste kontrollpunktens oldestXID: %u\n"
-
-# FIXME: too wide
-#: pg_resetxlog.c:583
-#, c-format
-msgid "Latest checkpoint's oldestXID's DB: %u\n"
-msgstr "Senaste kontrollpunktens oldestXID:s DB: %u\n"
-
-# FIXME: too wide
-#: pg_resetxlog.c:585
-#, c-format
-msgid "Latest checkpoint's oldestActiveXID: %u\n"
-msgstr "Senaste kontrollpunktens oldestActiveXID: %u\n"
-
-#: pg_resetxlog.c:587
-#, c-format
-msgid "Maximum data alignment: %u\n"
-msgstr "Maximal data-alignment: %u\n"
-
-#: pg_resetxlog.c:590
-#, c-format
-msgid "Database block size: %u\n"
-msgstr "Databasens blockstorlek: %u\n"
-
-#: pg_resetxlog.c:592
-#, c-format
-msgid "Blocks per segment of large relation: %u\n"
-msgstr "Block per segment i stor relation: %u\n"
-
-#: pg_resetxlog.c:594
-#, c-format
-msgid "WAL block size: %u\n"
-msgstr "WAL-blockstorlek: %u\n"
-
-#: pg_resetxlog.c:596
-#, c-format
-msgid "Bytes per WAL segment: %u\n"
-msgstr "Bytes per WAL-segment: %u\n"
-
-#: pg_resetxlog.c:598
-#, c-format
-msgid "Maximum length of identifiers: %u\n"
-msgstr "Maximal l�ngd p� identifierare: %u\n"
-
-#: pg_resetxlog.c:600
-#, c-format
-msgid "Maximum columns in an index: %u\n"
-msgstr "Maximalt antal kolumner i index: %u\n"
-
-#: pg_resetxlog.c:602
-#, c-format
-msgid "Maximum size of a TOAST chunk: %u\n"
-msgstr "Maximal storlek p� TOAST-bit: %u\n"
-
-#: pg_resetxlog.c:604
-#, c-format
-msgid "Date/time type storage: %s\n"
-msgstr "Lagringstyp f�r datum/tid: %s\n"
-
-#: pg_resetxlog.c:605
-msgid "64-bit integers"
-msgstr "64-bits heltal"
-
-#: pg_resetxlog.c:605
-msgid "floating-point numbers"
-msgstr "flyttalsnummer"
-
-#: pg_resetxlog.c:606
-#, fuzzy, c-format
-msgid "Float4 argument passing: %s\n"
-msgstr "Maximal data-alignment: %u\n"
-
-#: pg_resetxlog.c:607 pg_resetxlog.c:609
-msgid "by value"
-msgstr ""
-
-#: pg_resetxlog.c:607 pg_resetxlog.c:609
-msgid "by reference"
-msgstr ""
-
-#: pg_resetxlog.c:608
-#, fuzzy, c-format
-msgid "Float8 argument passing: %s\n"
-msgstr "Maximal data-alignment: %u\n"
-
-#: pg_resetxlog.c:671
-#, c-format
-msgid ""
-"%s: internal error -- sizeof(ControlFileData) is too large ... fix "
-"PG_CONTROL_SIZE\n"
-msgstr ""
-"%s: internt fel -- sizeof(ControlFileData) �r f�r stor ... r�tt till "
-"PG_CONTROL_SIZE\n"
-
-#: pg_resetxlog.c:686
-#, c-format
-msgid "%s: could not create pg_control file: %s\n"
-msgstr "%s: kunde inte skapa pg_control-fil: %s\n"
-
-#: pg_resetxlog.c:697
-#, c-format
-msgid "%s: could not write pg_control file: %s\n"
-msgstr "%s: kunde inte skriva pg_control-fil: %s\n"
-
-#: pg_resetxlog.c:704 pg_resetxlog.c:1011
-#, c-format
-msgid "%s: fsync error: %s\n"
-msgstr "%s: fsync fel: %s\n"
-
-#: pg_resetxlog.c:742 pg_resetxlog.c:817 pg_resetxlog.c:873
-#, c-format
-msgid "%s: could not open directory \"%s\": %s\n"
-msgstr "%s: kunde inte �ppna katalog \"%s\": %s\n"
-
-#: pg_resetxlog.c:786 pg_resetxlog.c:850 pg_resetxlog.c:907
-#, c-format
-msgid "%s: could not read from directory \"%s\": %s\n"
-msgstr "%s: kunde inte l�sa fr�n katalog \"%s\": %s\n"
-
-#: pg_resetxlog.c:831 pg_resetxlog.c:888
-#, c-format
-msgid "%s: could not delete file \"%s\": %s\n"
-msgstr "%s: kunde inte radera filen \"%s\": %s\n"
-
-#: pg_resetxlog.c:978
-#, c-format
-msgid "%s: could not open file \"%s\": %s\n"
-msgstr "%s: kunde inte �ppna fil \"%s\": %s\n"
-
-#: pg_resetxlog.c:989 pg_resetxlog.c:1003
-#, c-format
-msgid "%s: could not write file \"%s\": %s\n"
-msgstr "%s: kunde inte skriva fil \"%s\": %s\n"
-
-#: pg_resetxlog.c:1022
-#, c-format
-msgid ""
-"%s resets the PostgreSQL transaction log.\n"
-"\n"
-msgstr ""
-"%s �terst�ller PostgreSQL transaktionslogg.\n"
-"\n"
-
-#: pg_resetxlog.c:1023
-#, c-format
-msgid ""
-"Usage:\n"
-" %s [OPTION]... DATADIR\n"
-"\n"
-msgstr ""
-"Anv�ndning:\n"
-" %s [FLAGGA]... DATAKATALOG\n"
-"\n"
-
-#: pg_resetxlog.c:1024
-#, c-format
-msgid "Options:\n"
-msgstr "Flaggor:\n"
-
-#: pg_resetxlog.c:1025
-#, c-format
-msgid " -e XIDEPOCH set next transaction ID epoch\n"
-msgstr " -x XIDEPOCH s�tt n�sta transaktions-ID-epoch\n"
-
-#: pg_resetxlog.c:1026
-#, c-format
-msgid " -f force update to be done\n"
-msgstr " -f forcera �terst�llande\n"
-
-#: pg_resetxlog.c:1027
-#, c-format
-msgid ""
-" -l TLI,FILE,SEG force minimum WAL starting location for new transaction "
-"log\n"
-msgstr ""
-" -l TLI,FILID,SEG ange minsta WAL-startposition f�r ny transaktion\n"
-
-#: pg_resetxlog.c:1028
-#, c-format
-msgid " -m XID set next multitransaction ID\n"
-msgstr " -m XID s�tt n�sta multitransaktions-ID\n"
-
-#: pg_resetxlog.c:1029
-#, c-format
-msgid ""
-" -n no update, just show extracted control values (for "
-"testing)\n"
-msgstr ""
-" -n ingen updatering, visa bara kontrollv�rden (f�r testning)\n"
-
-#: pg_resetxlog.c:1030
-#, c-format
-msgid " -o OID set next OID\n"
-msgstr " -o OID s�tt n�sta OID\n"
-
-#: pg_resetxlog.c:1031
-#, c-format
-msgid " -O OFFSET set next multitransaction offset\n"
-msgstr " -O OFFSET s�tt n�sta multitransaktionsoffset\n"
-
-#: pg_resetxlog.c:1032
-#, c-format
-msgid " -x XID set next transaction ID\n"
-msgstr " -x XID s�tt n�sta transaktions-ID\n"
-
-#: pg_resetxlog.c:1033
-#, c-format
-msgid " --help show this help, then exit\n"
-msgstr " --help visa denna hj�lp, avsluta sedan\n"
-
-#: pg_resetxlog.c:1034
-#, c-format
-msgid " --version output version information, then exit\n"
-msgstr " --version visa versionsinformation, avsluta sedan\n"
-
-#: pg_resetxlog.c:1035
-#, c-format
-msgid ""
-"\n"
-"Report bugs to <[email protected]>.\n"
-msgstr ""
-"\n"
-"Reportera fel till <[email protected]>.\n"
diff --git a/src/bin/psql/command.c b/src/bin/psql/command.c
index 72b60e40b0..1cc9553571 100644
--- a/src/bin/psql/command.c
+++ b/src/bin/psql/command.c
@@ -1677,7 +1677,11 @@ connection_warnings(bool in_startup)
/* For version match, only print psql banner on startup. */
else if (in_startup)
#ifdef PGXC
+#ifdef XCP
+ printf("%s (PGXL %s, based on PG %s)\n", pset.progname, PGXC_VERSION, PG_VERSION);
+#else
printf("%s (PGXC %s, based on PG %s)\n", pset.progname, PGXC_VERSION, PG_VERSION);
+#endif
#else
printf("%s (%s)\n", pset.progname, PG_VERSION);
#endif
diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c
index 8a0beca3c9..6d2216b650 100644
--- a/src/bin/psql/startup.c
+++ b/src/bin/psql/startup.c
@@ -652,7 +652,11 @@ static void
showVersion(void)
{
#ifdef PGXC
+#ifdef XCP
+ puts("psql (Postgres-XL) " PGXC_VERSION);
+#else
puts("psql (Postgres-XC) " PGXC_VERSION);
+#endif
puts("(based on PostgreSQL) " PG_VERSION);
#else
puts("psql (PostgreSQL) " PG_VERSION);
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 63052c5f0c..216b3e796a 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -700,7 +700,10 @@ static const pgsql_thing_t words_after_create[] = {
{"TEMP", NULL, NULL, THING_NO_DROP}, /* for CREATE TEMP TABLE ... */
{"TEMPLATE", Query_for_list_of_ts_templates, NULL, THING_NO_SHOW},
{"TEXT SEARCH", NULL, NULL},
+#ifndef PGXC
+ /* PGXCTODO: This should be re-enabled once TRIGGER is supported */
{"TRIGGER", "SELECT pg_catalog.quote_ident(tgname) FROM pg_catalog.pg_trigger WHERE substring(pg_catalog.quote_ident(tgname),1,%d)='%s'"},
+#endif
{"TYPE", NULL, &Query_for_list_of_datatypes},
{"UNIQUE", NULL, NULL, THING_NO_DROP}, /* for CREATE UNIQUE INDEX ... */
{"UNLOGGED", NULL, NULL, THING_NO_DROP}, /* for CREATE UNLOGGED TABLE
@@ -787,7 +790,7 @@ psql_completion(char *text, int start, int end)
static const char *const sql_commands[] = {
#ifdef PGXC
- /*
+ /*
* Added "CLEAN" and "EXECUTE DIRECT"
* Removed LISTEN, NOTIFY, RELEASE, SAVEPOINT and UNLISTEN
*/
@@ -886,13 +889,13 @@ psql_completion(char *text, int start, int end)
/*
* Added: "NODE" (NODE NAME cannot be altered).
* Removed: "FOREIGN DATA WRAPPER", "FOREIGN TABLE", "LARGE OBJECT",
- * "SERVER", "USER MAPPING FOR".
+ * "SERVER", "TRIGGER", "USER MAPPING FOR".
*/
{"AGGREGATE", "COLLATION", "CONVERSION", "DATABASE", "DEFAULT PRIVILEGES", "DOMAIN",
"EXTENSION", "FUNCTION",
"GROUP", "INDEX", "LANGUAGE", "NODE", "NODE GROUP", "OPERATOR",
"ROLE", "SCHEMA", "SEQUENCE", "TABLE",
- "TABLESPACE", "TEXT SEARCH", "TRIGGER", "TYPE",
+ "TABLESPACE", "TEXT SEARCH", "TYPE",
"USER", "VIEW", NULL};
#else
{"AGGREGATE", "COLLATION", "CONVERSION", "DATABASE", "DEFAULT PRIVILEGES", "DOMAIN",
@@ -1261,6 +1264,8 @@ psql_completion(char *text, int start, int end)
COMPLETE_WITH_LIST(list_ALTERVIEW);
}
+#ifndef PGXC
+ /* PGXCTODO: This should be re-enabled once TRIGGER is supported */
/* ALTER TRIGGER <name>, add ON */
else if (pg_strcasecmp(prev3_wd, "ALTER") == 0 &&
pg_strcasecmp(prev2_wd, "TRIGGER") == 0)
@@ -1285,6 +1290,7 @@ psql_completion(char *text, int start, int end)
else if (pg_strcasecmp(prev4_wd, "TRIGGER") == 0 &&
pg_strcasecmp(prev2_wd, "ON") == 0)
COMPLETE_WITH_CONST("RENAME TO");
+#endif
/*
* If we detect ALTER TABLE <name>, suggest sub commands
@@ -2081,6 +2087,8 @@ psql_completion(char *text, int start, int end)
pg_strcasecmp(prev2_wd, "CONFIGURATION") == 0)
COMPLETE_WITH_CONST("(");
+#ifndef PGXC
+ /* PGXCTODO: This should be re-enabled once TRIGGER is supported */
/* CREATE TRIGGER */
/* complete CREATE TRIGGER <name> with BEFORE,AFTER */
else if (pg_strcasecmp(prev3_wd, "CREATE") == 0 &&
@@ -2147,6 +2155,7 @@ psql_completion(char *text, int start, int end)
prev2_wd[0] != '\0')
COMPLETE_WITH_CONST("PROCEDURE");
+#endif
/* CREATE ROLE,USER,GROUP <name> */
else if (pg_strcasecmp(prev3_wd, "CREATE") == 0 &&
!(pg_strcasecmp(prev2_wd, "USER") == 0 && pg_strcasecmp(prev_wd, "MAPPING") == 0) &&
diff --git a/src/gtm/Makefile b/src/gtm/Makefile
index 5059642637..480d1bf49e 100644
--- a/src/gtm/Makefile
+++ b/src/gtm/Makefile
@@ -12,6 +12,37 @@ subdir = src/gtm
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
-SUBDIRS = client common config libpq main path proxy recovery
+WANTED_DIRS=common path libpq client recovery main proxy gtm_ctl
-$(recurse)
+all:
+ @for dir in $(WANTED_DIRS); do \
+ $(MAKE) -C $$dir $@ || exit; \
+ done
+
+clobber:
+ @for dir in $(WANTED_DIRS); do \
+ $(MAKE) -C $$dir $@ || exit; \
+ done
+
+clean:
+ @for dir in $(WANTED_DIRS); do \
+ $(MAKE) -C $$dir $@ || exit; \
+ done
+
+distclean: clean
+
+maintainer-clean: distclean
+
+install: all
+ $(INSTALL_PROGRAM) main/gtm$(X) '$(DESTDIR)$(bindir)/gtm$(X)'
+ $(INSTALL_PROGRAM) gtm_ctl/gtm_ctl$(X) '$(DESTDIR)$(bindir)/gtm_ctl$(X)'
+ $(INSTALL_PROGRAM) proxy/gtm_proxy$(X) '$(DESTDIR)$(bindir)/gtm_proxy$(X)'
+ $(INSTALL_DATA) $(srcdir)/main/gtm.conf.sample '$(DESTDIR)$(datadir)/gtm.conf.sample'
+ $(INSTALL_DATA) $(srcdir)/proxy/gtm_proxy.conf.sample '$(DESTDIR)$(datadir)/gtm_proxy.conf.sample'
+
+uninstall:
+ rm -f $(DESTDIR)$(bindir)/gtm$(X)
+ rm -f $(DESTDIR)$(bindir)/gtm_ctl$(X)
+ rm -f $(DESTDIR)$(bindir)/gtm_proxy$(X)
+ rm -f $(DESTDIR)$(datadir)/gtm.conf.sample
+ rm -f $(DESTDIR)$(datadir)/gtm_proxy.conf.sample
diff --git a/src/gtm/client/Makefile b/src/gtm/client/Makefile
index 56dba648ce..e8204bb4b9 100644
--- a/src/gtm/client/Makefile
+++ b/src/gtm/client/Makefile
@@ -11,20 +11,22 @@ top_builddir=../../..
include $(top_builddir)/src/Makefile.global
subdir=src/gtm/client
-override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS)
-LIBS += $(PTHREAD_LIBS)
+NAME=gtmclient
+SO_MAJOR_VERSION= 1
+SO_MINOR_VERSION= 0
-include $(top_srcdir)/src/backend/common.mk
+OBJS=fe-misc.o fe-connect.o pqexpbuffer.o ip.o strlcpy.o gtm_client.o fe-protocol.o
+LDFLAGS=-L$(top_build_dir)/common -L$(top_build_dir)/libpq
-OBJS = fe-misc.o fe-connect.o gtm_client.o fe-protocol.o ip.o pqexpbuffer.o
+LIBS=-lpthread
-all: libgtmclient.a
+all:all-lib
-libgtmclient.a: $(OBJS)
- $(AR) $(AROPT) $@ $^
+include $(top_srcdir)/src/Makefile.shlib
clean:
- rm -f $(OBJS) libgtmclient.a
+ rm -f $(OBJS)
+ rm -f libgtmclient.a libgtmclient.so libgtmclient.so.1 libgtmclient.so.1.0
distclean: clean
diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c
index bfcb5f4e54..1bcb3e6a17 100644
--- a/src/gtm/client/fe-connect.c
+++ b/src/gtm/client/fe-connect.c
@@ -3,6 +3,11 @@
* fe-connect.c
* functions related to setting up a connection to the backend
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -889,6 +894,39 @@ freeGTM_Conn(GTM_Conn *conn)
free(conn->outBuffer);
termGTMPQExpBuffer(&conn->errorMessage);
termGTMPQExpBuffer(&conn->workBuffer);
+#ifdef XCP
+ if (conn->result)
+ {
+ /* Free last snapshot if defined */
+ if (conn->result->gr_snapshot.sn_xip)
+ free(conn->result->gr_snapshot.sn_xip);
+
+ /* Depending on result type there could be allocated data */
+ switch (conn->result->gr_type)
+ {
+ case SEQUENCE_INIT_RESULT:
+ case SEQUENCE_RESET_RESULT:
+ case SEQUENCE_CLOSE_RESULT:
+ case SEQUENCE_RENAME_RESULT:
+ case SEQUENCE_ALTER_RESULT:
+ case SEQUENCE_SET_VAL_RESULT:
+ if (conn->result->gr_resdata.grd_seqkey.gsk_key)
+ free(conn->result->gr_resdata.grd_seqkey.gsk_key);
+ break;
+
+ case SEQUENCE_GET_NEXT_RESULT:
+ case SEQUENCE_GET_LAST_RESULT:
+ if (conn->result->gr_resdata.grd_seq.seqkey.gsk_key)
+ free(conn->result->gr_resdata.grd_seq.seqkey.gsk_key);
+ break;
+
+ default:
+ break;
+ }
+
+ free(conn->result);
+ }
+#endif
free(conn);
}
diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c
index b4a0e3e2fc..1ebf067ec9 100644
--- a/src/gtm/client/fe-protocol.c
+++ b/src/gtm/client/fe-protocol.c
@@ -3,6 +3,11 @@
* fe-protocol3.c
* functions that are specific to frontend/backend protocol version 3
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -366,6 +371,11 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result)
case END_BACKUP_RESULT:
break;
+#ifdef XCP
+ case REGISTER_SESSION_RESULT:
+ break;
+#endif
+
case TXN_BEGIN_RESULT:
if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txnhandle,
sizeof (GTM_TransactionHandle), conn))
@@ -549,6 +559,7 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result)
result->gr_status = GTM_RESULT_ERROR;
break;
+ case SEQUENCE_GET_CURRENT_RESULT:
case SEQUENCE_GET_NEXT_RESULT:
case SEQUENCE_GET_LAST_RESULT:
if (gtmpqReadSeqKey(&result->gr_resdata.grd_seq.seqkey, conn))
@@ -559,6 +570,12 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result)
if (gtmpqGetnchar((char *)&result->gr_resdata.grd_seq.seqval,
sizeof (GTM_Sequence), conn))
result->gr_status = GTM_RESULT_ERROR;
+#ifdef XCP
+ if (result->gr_type == SEQUENCE_GET_NEXT_RESULT &&
+ gtmpqGetnchar((char *)&result->gr_resdata.grd_seq.rangemax,
+ sizeof (GTM_Sequence), conn))
+ result->gr_status = GTM_RESULT_ERROR;
+#endif
break;
case SEQUENCE_LIST_RESULT:
@@ -570,7 +587,7 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result)
}
result->gr_resdata.grd_seq_list.seq =
- (GTM_SeqInfo **)malloc(sizeof(GTM_SeqInfo *) *
+ (GTM_SeqInfo **)malloc(sizeof(GTM_SeqInfo) *
result->gr_resdata.grd_seq_list.seq_count);
for (i = 0 ; i < result->gr_resdata.grd_seq_list.seq_count; i++)
@@ -593,7 +610,8 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result)
break;
}
- result->gr_resdata.grd_seq_list.seq[i] = gtm_deserialize_sequence(buf, buflen);
+ gtm_deserialize_sequence(result->gr_resdata.grd_seq_list.seq+i,
+ buf, buflen);
free(buf);
}
@@ -733,7 +751,7 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result)
for (i = 0 ; i < result->gr_resdata.grd_node_list.num_node; i++)
{
int size;
- char buf[1024];
+ char buf[8092];
GTM_PGXCNodeInfo *data = (GTM_PGXCNodeInfo *)malloc(sizeof(GTM_PGXCNodeInfo));
if (gtmpqGetInt(&size, sizeof(int32), conn))
@@ -741,19 +759,37 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result)
result->gr_status = GTM_RESULT_ERROR;
break;
}
+ if (size > 8092)
+ {
+ result->gr_status = GTM_RESULT_ERROR;
+ printfGTMPQExpBuffer(&conn->errorMessage, "buffer size not large enough for node list data");
+ result->gr_status = GTM_RESULT_ERROR;
+ }
if (gtmpqGetnchar((char *)&buf, size, conn))
{
result->gr_status = GTM_RESULT_ERROR;
break;
}
- gtm_deserialize_pgxcnodeinfo(data, buf, size);
-
+#ifdef XCP
+ if (!gtm_deserialize_pgxcnodeinfo(data, buf, size, &conn->errorMessage))
+ {
+ result->gr_status = GTM_RESULT_ERROR;
+ break;
+ }
+ else
+ {
+ result->gr_resdata.grd_node_list.nodeinfo[i] = data;
+ }
+#else
+ gtm_deserialize_pgxcnodeinfo(data, buf, size, &conn->errorMessage);
result->gr_resdata.grd_node_list.nodeinfo[i] = data;
+#endif
}
break;
}
+
default:
printfGTMPQExpBuffer(&conn->errorMessage,
"unexpected result type from server; result typr was \"%d\"\n",
@@ -813,6 +849,7 @@ gtmpqFreeResultData(GTM_Result *result, GTM_PGXCNodeType remote_type)
result->gr_resdata.grd_seqkey.gsk_key = NULL;
break;
+ case SEQUENCE_GET_CURRENT_RESULT:
case SEQUENCE_GET_NEXT_RESULT:
case SEQUENCE_GET_LAST_RESULT:
if (result->gr_resdata.grd_seq.seqkey.gsk_key != NULL)
diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c
index 0e396d5eb0..d099ba6729 100644
--- a/src/gtm/client/gtm_client.c
+++ b/src/gtm/client/gtm_client.c
@@ -2,6 +2,11 @@
*
* gtm-client.c
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -51,8 +56,17 @@ static int abort_transaction_multi_internal(GTM_Conn *conn, int txn_count, Globa
static int open_sequence_internal(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increment,
GTM_Sequence minval, GTM_Sequence maxval,
GTM_Sequence startval, bool cycle, bool is_backup);
+#ifdef XCP
+static int get_next_internal(GTM_Conn *conn, GTM_SequenceKey key,
+ char *coord_name, int coord_procid, GTM_Sequence range,
+ GTM_Sequence *result, GTM_Sequence *rangemax, bool is_backup);
+static int set_val_internal(GTM_Conn *conn, GTM_SequenceKey key,
+ char *coord_name, int coord_procid, GTM_Sequence nextval,
+ bool iscalled, bool is_backup);
+#else
static GTM_Sequence get_next_internal(GTM_Conn *conn, GTM_SequenceKey key, bool is_backup);
static int set_val_internal(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence nextval, bool iscalled, bool is_backup);
+#endif
static int reset_sequence_internal(GTM_Conn *conn, GTM_SequenceKey key, bool is_backup);
static int commit_transaction_internal(GTM_Conn *conn, GlobalTransactionId gxid, bool is_backup);
static int close_sequence_internal(GTM_Conn *conn, GTM_SequenceKey key, bool is_backup);
@@ -229,6 +243,11 @@ get_node_list(GTM_Conn *conn, GTM_PGXCNodeInfo *data, size_t maxlen)
num_node = res->gr_resdata.grd_node_list.num_node;
fprintf(stderr, "get_node_list: num_node=%ld\n", num_node);
+ if (num_node > maxlen)
+ {
+ fprintf(stderr, "Error: number of nodes %zu greater than maximum", num_node);
+ goto receive_failed;
+ }
for (i = 0; i < num_node; i++)
{
@@ -348,13 +367,14 @@ send_failed:
* get_sequence_list()
*
* returns a number of sequences on success, -1 on failure.
+ * Returned seq_list is pointing to GTM_Result structure, the data should be
+ * copied before the next call to getResult.
*/
size_t
-get_sequence_list(GTM_Conn *conn, GTM_SeqInfo **seq_list, size_t seq_max)
+get_sequence_list(GTM_Conn *conn, GTM_SeqInfo **seq_list)
{
GTM_Result *res = NULL;
time_t finish_time;
- int i;
/* Start the message. */
if (gtmpqPutMsgStart('C', true, conn) ||
@@ -380,15 +400,9 @@ get_sequence_list(GTM_Conn *conn, GTM_SeqInfo **seq_list, size_t seq_max)
if (res->gr_status == GTM_RESULT_OK)
Assert(res->gr_type == SEQUENCE_LIST_RESULT);
- for (i = 0; i < res->gr_resdata.grd_seq_list.seq_count; i++)
- {
- seq_list[i] = res->gr_resdata.grd_seq_list.seq[i];
+ *seq_list = res->gr_resdata.grd_seq_list.seq;
- if ( i >= seq_max )
- break;
- }
-
- return i;
+ return res->gr_resdata.grd_seq_list.seq_count;
receive_failed:
send_failed:
@@ -1041,16 +1055,16 @@ open_sequence_internal(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence increme
gtmpqPutc(cycle, conn))
goto send_failed;
- if (!is_backup)
- {
- /* Finish the message. */
- if (gtmpqPutMsgEnd(conn))
- goto send_failed;
+ /* Finish the message. */
+ if (gtmpqPutMsgEnd(conn))
+ goto send_failed;
- /* Flush to ensure backend gets it. */
- if (gtmpqFlush(conn))
- goto send_failed;
+ /* Flush to ensure backend gets it. */
+ if (gtmpqFlush(conn))
+ goto send_failed;
+ if (!is_backup)
+ {
finish_time = time(NULL) + CLIENT_GTM_TIMEOUT;
if (gtmpqWaitTimed(true, false, conn, finish_time) ||
gtmpqReadData(conn) < 0)
@@ -1248,6 +1262,111 @@ send_failed:
return -1;
}
+#ifdef XCP
+/*
+ * Request from GTM current value of the specified sequence in the specified
+ * distributed session.
+ * Function returns GTM_RESULT_OK if the current value is defined, it sets
+ * the *result parameter in this case.
+ * Other return value means a problem. Check GTMPQerrorMessage(conn) for details
+ * about the problem.
+ */
+int
+get_current(GTM_Conn *conn, GTM_SequenceKey key,
+ char *coord_name, int coord_procid, GTM_Sequence *result)
+#else
+GTM_Sequence
+get_current(GTM_Conn *conn, GTM_SequenceKey key)
+#endif
+{
+ GTM_Result *res = NULL;
+ time_t finish_time;
+#ifdef XCP
+ int coord_namelen = coord_name ? strlen(coord_name) : 0;
+
+ /* Start the message. */
+ if (gtmpqPutMsgStart('C', true, conn) ||
+ gtmpqPutInt(MSG_SEQUENCE_GET_CURRENT, sizeof (GTM_MessageType), conn) ||
+ gtmpqPutInt(key->gsk_keylen, 4, conn) ||
+ gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn) ||
+ gtmpqPutInt(coord_namelen, 4, conn) ||
+ (coord_namelen > 0 && gtmpqPutnchar(coord_name, coord_namelen, conn)) ||
+ gtmpqPutInt(coord_procid, 4, conn))
+ goto send_failed;
+#else
+ /* Start the message. */
+ if (gtmpqPutMsgStart('C', true, conn) ||
+ gtmpqPutInt(MSG_SEQUENCE_GET_CURRENT, sizeof (GTM_MessageType), conn) ||
+ gtmpqPutInt(key->gsk_keylen, 4, conn) ||
+ gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn))
+ goto send_failed;
+#endif
+
+ /* Finish the message. */
+ if (gtmpqPutMsgEnd(conn))
+ goto send_failed;
+
+ /* Flush to ensure backend gets it. */
+ if (gtmpqFlush(conn))
+ goto send_failed;
+
+ finish_time = time(NULL) + CLIENT_GTM_TIMEOUT;
+ if (gtmpqWaitTimed(true, false, conn, finish_time) ||
+ gtmpqReadData(conn) < 0)
+ goto receive_failed;
+
+ if ((res = GTMPQgetResult(conn)) == NULL)
+ goto receive_failed;
+
+#ifdef XCP
+ if (res->gr_status == GTM_RESULT_OK)
+ *result = res->gr_resdata.grd_seq.seqval;
+
+ return res->gr_status;
+#else
+ if (res->gr_status == GTM_RESULT_OK)
+ return res->gr_resdata.grd_seq.seqval;
+ else
+ return InvalidSequenceValue;
+#endif
+
+receive_failed:
+send_failed:
+ conn->result = makeEmptyResultIfIsNull(conn->result);
+ conn->result->gr_status = GTM_RESULT_COMM_ERROR;
+#ifdef XCP
+ return GTM_RESULT_COMM_ERROR;
+#else
+ return -1;
+#endif
+}
+
+#ifdef XCP
+/*
+ * Submit to GTM new next value of the specified sequence in the specified
+ * distributed session. The nextval parameter is the new value, if is called
+ * is set to false the nextval will be the next value returned from the sequence
+ * by nextval() function, if true the function returns incremented value.
+ * Function returns GTM_RESULT_OK if it succeeds.
+ * Other return value means a problem. Check GTMPQerrorMessage(conn) for details
+ * about the problem.
+ */
+int
+set_val(GTM_Conn *conn, GTM_SequenceKey key, char *coord_name,
+ int coord_procid, GTM_Sequence nextval, bool iscalled)
+{
+ return set_val_internal(conn, key, coord_name, coord_procid, nextval,
+ iscalled, false);
+}
+
+int
+bkup_set_val(GTM_Conn *conn, GTM_SequenceKey key, char *coord_name,
+ int coord_procid, GTM_Sequence nextval, bool iscalled)
+{
+ return set_val_internal(conn, key, coord_name, coord_procid, nextval,
+ iscalled, true);
+}
+#else
int
set_val(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence nextval, bool iscalled)
{
@@ -1259,18 +1378,34 @@ bkup_set_val(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence nextval, bool isc
{
return set_val_internal(conn, key, nextval, iscalled, true);
}
+#endif
+#ifdef XCP
+static int
+set_val_internal(GTM_Conn *conn, GTM_SequenceKey key,
+ char *coord_name, int coord_procid, GTM_Sequence nextval,
+ bool iscalled, bool is_backup)
+#else
static int
set_val_internal(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence nextval, bool iscalled, bool is_backup)
+#endif
{
GTM_Result *res = NULL;
time_t finish_time;
+#ifdef XCP
+ int coord_namelen = coord_name ? strlen(coord_name) : 0;
+#endif
/* Start the message. */
if (gtmpqPutMsgStart('C', true, conn) ||
gtmpqPutInt(is_backup ? MSG_BKUP_SEQUENCE_SET_VAL : MSG_SEQUENCE_SET_VAL, sizeof (GTM_MessageType), conn) ||
gtmpqPutInt(key->gsk_keylen, 4, conn) ||
gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn) ||
+#ifdef XCP
+ gtmpqPutInt(coord_namelen, 4, conn) ||
+ (coord_namelen > 0 && gtmpqPutnchar(coord_name, coord_namelen, conn)) ||
+ gtmpqPutInt(coord_procid, 4, conn) ||
+#endif
gtmpqPutnchar((char *)&nextval, sizeof (GTM_Sequence), conn) ||
gtmpqPutc(iscalled, conn))
goto send_failed;
@@ -1301,9 +1436,39 @@ receive_failed:
send_failed:
conn->result = makeEmptyResultIfIsNull(conn->result);
conn->result->gr_status = GTM_RESULT_COMM_ERROR;
+#ifdef XCP
+ return GTM_RESULT_COMM_ERROR;
+#else
return -1;
+#endif
+}
+
+#ifdef XCP
+/*
+ * Request from GTM next value of the specified sequence.
+ * Function returns GTM_RESULT_OK if it succeeds, it sets the *result parameter
+ * in this case.
+ * Other return value means a problem. Check GTMPQerrorMessage(conn) for details
+ * about the problem.
+ */
+int
+get_next(GTM_Conn *conn, GTM_SequenceKey key,
+ char *coord_name, int coord_procid, GTM_Sequence range,
+ GTM_Sequence *result, GTM_Sequence *rangemax)
+{
+ return get_next_internal(conn, key, coord_name, coord_procid,
+ range, result, rangemax, false);
}
+int
+bkup_get_next(GTM_Conn *conn, GTM_SequenceKey key,
+ char *coord_name, int coord_procid, GTM_Sequence range,
+ GTM_Sequence *result, GTM_Sequence *rangemax)
+{
+ return get_next_internal(conn, key, coord_name, coord_procid,
+ range, result, rangemax, true);
+}
+#else
GTM_Sequence
get_next(GTM_Conn *conn, GTM_SequenceKey key)
{
@@ -1315,19 +1480,41 @@ bkup_get_next(GTM_Conn *conn, GTM_SequenceKey key)
{
return get_next_internal(conn, key, true);
}
+#endif
+#ifdef XCP
+static int
+get_next_internal(GTM_Conn *conn, GTM_SequenceKey key,
+ char *coord_name, int coord_procid, GTM_Sequence range,
+ GTM_Sequence *result, GTM_Sequence *rangemax, bool is_backup)
+#else
static GTM_Sequence
get_next_internal(GTM_Conn *conn, GTM_SequenceKey key, bool is_backup)
+#endif
{
GTM_Result *res = NULL;
time_t finish_time;
+#ifdef XCP
+ int coord_namelen = coord_name ? strlen(coord_name) : 0;
/* Start the message. */
if (gtmpqPutMsgStart('C', true, conn) ||
gtmpqPutInt(is_backup ? MSG_BKUP_SEQUENCE_GET_NEXT : MSG_SEQUENCE_GET_NEXT, sizeof (GTM_MessageType), conn) ||
gtmpqPutInt(key->gsk_keylen, 4, conn) ||
+ gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn) ||
+ gtmpqPutInt(coord_namelen, 4, conn) ||
+ (coord_namelen > 0 && gtmpqPutnchar(coord_name, coord_namelen, conn)) ||
+ gtmpqPutInt(coord_procid, 4, conn) ||
+ gtmpqPutnchar((char *)&range, sizeof (GTM_Sequence), conn))
+ goto send_failed;
+#else
+ /* Start the message. */
+ if (gtmpqPutMsgStart('C', true, conn) ||
+ gtmpqPutInt(is_backup ? MSG_BKUP_SEQUENCE_GET_NEXT : MSG_SEQUENCE_GET_NEXT, sizeof (GTM_MessageType), conn) ||
+ gtmpqPutInt(key->gsk_keylen, 4, conn) ||
gtmpqPutnchar(key->gsk_key, key->gsk_keylen, conn))
goto send_failed;
+#endif
/* Finish the message. */
if (gtmpqPutMsgEnd(conn))
@@ -1347,10 +1534,19 @@ get_next_internal(GTM_Conn *conn, GTM_SequenceKey key, bool is_backup)
if ((res = GTMPQgetResult(conn)) == NULL)
goto receive_failed;
+#ifdef XCP
+ if (res->gr_status == GTM_RESULT_OK)
+ {
+ *result = res->gr_resdata.grd_seq.seqval;
+ *rangemax = res->gr_resdata.grd_seq.rangemax;
+ }
+ return res->gr_status;
+#else
if (res->gr_status == GTM_RESULT_OK)
return res->gr_resdata.grd_seq.seqval;
else
return InvalidSequenceValue;
+#endif
}
return GTM_RESULT_OK;
@@ -1358,7 +1554,11 @@ receive_failed:
send_failed:
conn->result = makeEmptyResultIfIsNull(conn->result);
conn->result->gr_status = GTM_RESULT_COMM_ERROR;
+#ifdef XCP
+ return GTM_RESULT_COMM_ERROR;
+#else
return -1;
+#endif
}
int
@@ -2116,3 +2316,67 @@ send_failed:
conn->result->gr_status = GTM_RESULT_COMM_ERROR;
return -1;
}
+
+
+#ifdef XCP
+/*
+ * Submit to GTM information about started distributed session.
+ * The information is the session identifier consisting of coordinator name and
+ * pid of the master process, and the BackendId of the master process.
+ * The BackendId is used to track session end. BackendIds are the sequential
+ * numbers from 1 to max_connections, and they are unique among active sessions
+ * under the same postmaster. So if another session on the same coordinator with
+ * the same BackendId is registering, that means the previous session is closed
+ * and all resources assigned to it could be released.
+ */
+int
+register_session(GTM_Conn *conn, const char *coord_name, int coord_procid,
+ int coord_backendid)
+{
+ GTM_Result *res = NULL;
+ time_t finish_time;
+ int32 len = strlen(coord_name);
+
+ if (gtmpqPutMsgStart('C', true, conn) ||
+ gtmpqPutInt(MSG_REGISTER_SESSION, sizeof (GTM_MessageType), conn) ||
+ gtmpqPutInt(len, sizeof(len), conn) ||
+ gtmpqPutnchar(coord_name, len, conn) ||
+ gtmpqPutInt(coord_procid, sizeof(coord_procid), conn) ||
+ gtmpqPutInt(coord_backendid, sizeof(coord_backendid), conn))
+ {
+ goto send_failed;
+ }
+
+ /* Finish the message. */
+ if (gtmpqPutMsgEnd(conn))
+ {
+ goto send_failed;
+ }
+
+ /* Flush to ensure backend gets it. */
+ if (gtmpqFlush(conn))
+ {
+ goto send_failed;
+ }
+
+ finish_time = time(NULL) + CLIENT_GTM_TIMEOUT;
+ if (gtmpqWaitTimed(true, false, conn, finish_time) ||
+ gtmpqReadData(conn) < 0)
+ {
+ goto receive_failed;
+ }
+
+ if ((res = GTMPQgetResult(conn)) == NULL)
+ {
+ goto receive_failed;
+ }
+
+ return res->gr_status;
+
+receive_failed:
+send_failed:
+ conn->result = makeEmptyResultIfIsNull(conn->result);
+ conn->result->gr_status = GTM_RESULT_COMM_ERROR;
+ return -1;
+}
+#endif
diff --git a/src/gtm/client/strlcpy.c b/src/gtm/client/strlcpy.c
new file mode 100644
index 0000000000..48cdf5e2c9
--- /dev/null
+++ b/src/gtm/client/strlcpy.c
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * strlcpy.c
+ * strncpy done right
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL: pgsql/src/port/strlcpy.c,v 1.5 2008/01/01 19:46:00 momjian Exp $
+ *
+ * This file was taken from OpenBSD and is used on platforms that don't
+ * provide strlcpy(). The OpenBSD copyright terms follow.
+ *-------------------------------------------------------------------------
+ */
+
+/* $OpenBSD: strlcpy.c,v 1.11 2006/05/05 15:27:38 millert Exp $ */
+
+/*
+ * Copyright (c) 1998 Todd C. Miller <[email protected]>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "gtm/gtm_c.h"
+
+
+/*
+ * Copy src to string dst of size siz. At most siz-1 characters
+ * will be copied. Always NUL terminates (unless siz == 0).
+ * Returns strlen(src); if retval >= siz, truncation occurred.
+ * Function creation history: http://www.gratisoft.us/todd/papers/strlcpy.html
+ */
+size_t
+strlcpy(char *dst, const char *src, size_t siz)
+{
+ char *d = dst;
+ const char *s = src;
+ size_t n = siz;
+
+ /* Copy as many bytes as will fit */
+ if (n != 0)
+ {
+ while (--n != 0)
+ {
+ if ((*d++ = *s++) == '\0')
+ break;
+ }
+ }
+
+ /* Not enough room in dst, add NUL and traverse rest of src */
+ if (n == 0)
+ {
+ if (siz != 0)
+ *d = '\0'; /* NUL-terminate dst */
+ while (*s++)
+ ;
+ }
+
+ return (s - src - 1); /* count does not include NUL */
+}
diff --git a/src/gtm/common/.gitignore b/src/gtm/common/.gitignore
new file mode 100644
index 0000000000..5963e7b19a
--- /dev/null
+++ b/src/gtm/common/.gitignore
@@ -0,0 +1 @@
+/gtm_opt_scanner.c
diff --git a/src/gtm/common/Makefile b/src/gtm/common/Makefile
index c43e000ead..31e0c25ff9 100644
--- a/src/gtm/common/Makefile
+++ b/src/gtm/common/Makefile
@@ -8,24 +8,41 @@
#
#-----------------------------------------------------------------------------
top_builddir=../../..
-include $(top_builddir)/src/Makefile.global
subdir=src/gtm/common
-override CPPFLAGS := -I. -I$(libpq_srcdir) $(CPPFLAGS)
-LIBS += $(PTHREAD_LIBS)
+include $(top_builddir)/src/Makefile.global
+
+override CPPFLAGS := -I. -I$(srcdir) $(CPPFLAGS)
+
+NAME=gtm
+
+SO_MAJOR_VERSION= 1
+SO_MINOR_VERSION= 0
+
+LDFLAGS=-L$(top_builddir)/common -L$(top_builddir)/libpq
+LIBS=-lpthread
+
+OBJS = gtm_opt_handler.o aset.o mcxt.o gtm_utils.o elog.o assert.o stringinfo.o gtm_lock.o \
+ gtm_list.o gtm_serialize.o gtm_serialize_debug.o
-include $(top_srcdir)/src/backend/common.mk
+all:all-lib
-OBJS = gtm_utils.o gtm_lock.o gtm_serialize.o gtm_serialize_debug.o \
- aset.o assert.o elog.o mcxt.o stringinfo.o gtm_list.o
+gtm_opt_handler.o: gtm_opt_scanner.c
-all: libgtmcommon.a
+gtm_opt_scanner.c: gtm_opt_scanner.l
+ifdef FLEX
+ $(FLEX) $(FLEXFLAGS) -o'$@' $<
+else
+ @$(missing) flex $< $@
+endif
-libgtmcommon.a: $(OBJS)
- $(AR) $(AROPT) $@ $^
+# Shared library stuff
+include $(top_srcdir)/src/Makefile.shlib
+# Note that gtm_opt_scanner.c is not deleted by make clean as we want it in distribution tarballs
clean:
- rm -f $(OBJS) libgtmcommon.a
+ rm -f $(OBJS)
+ rm -f libgtm.so libgtm.so.1 libgtm.so.1.0
distclean: clean
diff --git a/src/gtm/common/gtm_opt_handler.c b/src/gtm/common/gtm_opt_handler.c
new file mode 100644
index 0000000000..61c2476599
--- /dev/null
+++ b/src/gtm/common/gtm_opt_handler.c
@@ -0,0 +1,3509 @@
+/* -*-pgsql-c-*- */
+/*
+ * Scanner for the configuration file
+ *
+ * Copyright (c) 2000-2011, PostgreSQL Global Development Group
+ *
+ * src/backend/utils/misc/guc-file.l
+ */
+
+#include "gtm/gtm.h"
+
+#include <ctype.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#include "mb/pg_wchar.h"
+#include "gtm/path.h"
+#include "gtm/assert.h"
+#include "gtm/gtm_opt.h"
+#include "gtm/gtm_opt_tables.h"
+#include "gtm/elog.h"
+#include "gtm_opt_scanner.c"
+
+/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
+#undef fprintf
+#define fprintf(file, fmt, msg) ereport(ERROR, (errmsg_internal("%s", msg)))
+
+static unsigned int ConfigFileLineno;
+
+/* flex fails to supply a prototype for GTMOPT_yylex, so provide one */
+int GTMOPT_GTMOPT_yylex(void);
+
+/* Functions defined in this file */
+static char *GTMOPT_scanstr(const char *s);
+static struct config_generic *find_option(const char *name, bool create_placeholders, int elevel);
+static char *gtm_opt_strdup(int elevel, const char *src);
+static int gtm_opt_name_compare(const char *namea, const char *nameb);
+struct config_generic **get_gtm_opt_variables(void);
+void build_gtm_opt_variables(void);
+static bool gtm_opt_parse_bool(const char *value, bool *result);
+static bool gtm_opt_parse_bool_with_len(const char *value, size_t len, bool *result);
+static void set_config_sourcefile(const char *name, char *sourcefile, int sourceline);
+static int gtm_opt_var_compare(const void *a, const void *b);
+static void InitializeOneGTMOption(struct config_generic * gconf);
+static void ReportGTMOption(struct config_generic * record);
+static char *_ShowOption(struct config_generic * record, bool use_units);
+
+/*
+ * Variables to be fed by specific option definition: gtm_opt.c and gtm_proxy_opt.c
+ */
+extern char *GTMConfigFileName;
+extern char *data_directory;
+extern struct config_generic **gtm_opt_variables;
+extern int num_gtm_opt_variables;
+extern int size_gtm_opt_variables;
+extern bool reporting_enabled; /* TRUE to enable GTMOPT_REPORT */
+extern char *config_filename; /* Default configuration file name */
+extern int GTMOptUpdateCount; /* Indicates when specific option is updated */
+extern bool isStartUp;
+
+/*
+ * Tables of options: to be defined in gtm_opt.c and gtm_proxy_opt.c
+ */
+extern struct config_bool ConfigureNamesBool[];
+extern struct config_int ConfigureNamesInt[];
+extern struct config_real ConfigureNamesReal[];
+extern struct config_string ConfigureNamesString[];
+extern struct config_enum ConfigureNamesEnum[];
+
+/*
+ * Note: MAX_BACKENDS is limited to 2^23-1 because inval.c stores the
+ * backend ID as a 3-byte signed integer. Even if that limitation were
+ * removed, we still could not exceed INT_MAX/4 because some places compute
+ * 4*MaxBackends without any overflow check. This is rechecked in
+ * check_maxconnections, since MaxBackends is computed as MaxConnections
+ * plus autovacuum_max_workers plus one (for the autovacuum launcher).
+ */
+#define MAX_BACKENDS 0x7fffff
+
+#define KB_PER_MB (1024)
+#define KB_PER_GB (1024*1024)
+
+#define MS_PER_S 1000
+#define S_PER_MIN 60
+#define MS_PER_MIN (1000 * 60)
+#define MIN_PER_H 60
+#define S_PER_H (60 * 60)
+#define MS_PER_H (1000 * 60 * 60)
+#define MIN_PER_D (60 * 24)
+#define S_PER_D (60 * 60 * 24)
+#define MS_PER_D (1000 * 60 * 60 * 24)
+
+/*
+ * Exported function to read and process the configuration file. The
+ * parameter indicates in what context the file is being read --- either
+ * postmaster startup (including standalone-backend startup) or SIGHUP.
+ * All options mentioned in the configuration file are set to new values.
+ * If an error occurs, no values will be changed.
+ *
+ * context: GTMC_STARTUP or GTMC_SIGHUP.  On SIGHUP, problems are logged
+ * at DEBUG2 so a bad reload does not take the server down; at startup
+ * they are raised at ERROR.
+ */
+void
+ProcessConfigFile(GtmOptContext context)
+{
+ int elevel;
+ ConfigVariable *item,
+ *head,
+ *tail;
+ char *cvc = NULL;
+ int i;
+
+ Assert((context == GTMC_STARTUP || context == GTMC_SIGHUP));
+
+ if (context == GTMC_SIGHUP)
+ elevel = DEBUG2;
+ else
+ elevel = ERROR;
+
+ /* Parse the file into a list of option names and values */
+ head = tail = NULL;
+
+ if (!ParseConfigFile(GTMConfigFileName, NULL, 0, elevel, &head, &tail))
+ goto cleanup_list;
+
+#if 0
+ /* No custom_variable_classes now */
+ /*
+ * This part of the code remained the same as original guc.c because
+ * we might want to have custom variable class for gtm.conf.
+ */
+ /*
+ * We need the proposed new value of custom_variable_classes to check
+ * custom variables with. ParseConfigFile ensured that if it's in
+ * the file, it's first in the list. But first check to see if we
+ * have an active value from the command line, which should override
+ * the file in any case. (Since there's no relevant env var, the
+ * only possible nondefault sources are the file and ARGV.)
+ */
+ cvc_struct = (struct config_string *)
+ find_option("custom_variable_classes", false, elevel);
+ Assert(cvc_struct);
+ if (cvc_struct->gen.reset_source > GTMC_S_FILE)
+ {
+ cvc = gtm_opt_strdup(elevel, cvc_struct->reset_val);
+ if (cvc == NULL)
+ goto cleanup_list;
+ }
+ else if (head != NULL &&
+ gtm_opt_name_compare(head->name, "custom_variable_classes") == 0)
+ {
+ /*
+ * Need to canonicalize the value by calling the check hook.
+ */
+ void *extra = NULL;
+
+ cvc = gtm_opt_strdup(elevel, head->value);
+ if (cvc == NULL)
+ goto cleanup_list;
+ if (extra)
+ free(extra);
+ }
+#endif
+
+ /*
+ * Mark all extant GUC variables as not present in the config file.
+ * We need this so that we can tell below which ones have been removed
+ * from the file since we last processed it.
+ */
+ for (i = 0; i < num_gtm_opt_variables; i++)
+ {
+ struct config_generic *gconf = gtm_opt_variables[i];
+
+ gconf->status &= ~GTMOPT_IS_IN_FILE;
+ }
+
+ /*
+ * Check if all options are valid. As a side-effect, the GTMOPT_IS_IN_FILE
+ * flag is set on each GUC variable mentioned in the list.
+ */
+ for (item = head; item; item = item->next)
+ {
+ char *sep = strchr(item->name, GTMOPT_QUALIFIER_SEPARATOR);
+
+ if (sep)
+ {
+ /*
+ * There is no GUC entry. If we called set_config_option then
+ * it would make a placeholder, which we don't want to do yet,
+ * since we could still fail further down the list. Do nothing
+ * (assuming that making the placeholder will succeed later).
+ */
+ if (find_option(item->name, false, elevel) == NULL)
+ continue;
+ /*
+ * 3. There is already a GUC entry (either real or placeholder) for
+ * the variable. In this case we should let set_config_option
+ * check it, since the assignment could well fail if it's a real
+ * entry.
+ */
+ }
+
+ if (!set_config_option(item->name, item->value, context,
+ GTMC_S_FILE, false))
+ goto cleanup_list;
+ }
+
+ /*
+ * Check for variables having been removed from the config file, and
+ * revert their reset values (and perhaps also effective values) to the
+ * boot-time defaults. If such a variable can't be changed after startup,
+ * just throw a warning and continue. (This is analogous to the fact that
+ * set_config_option only throws a warning for a new but different value.
+ * If we wanted to make it a hard error, we'd need an extra pass over the
+ * list so that we could throw the error before starting to apply
+ * changes.)
+ */
+ for (i = 0; i < num_gtm_opt_variables; i++)
+ {
+ struct config_generic *gconf = gtm_opt_variables[i];
+ GtmOptStack *stack;
+
+ if (gconf->reset_source != GTMC_S_FILE ||
+ (gconf->status & GTMOPT_IS_IN_FILE))
+ continue;
+ if (gconf->context < GTMC_SIGHUP)
+ {
+ /*
+ * In the original code, errcode() stores specified error code to sqlerrcode, which does not
+ * exist in GTM.
+ */
+ if (isStartUp)
+ {
+ /* trailing \n added for consistency with every other write_stderr message here */
+ write_stderr("parameter \"%s\" cannot be changed without restarting the server\n",
+ gconf->name);
+ }
+ else
+ {
+ ereport(elevel,
+ (0,
+ errmsg("parameter \"%s\" cannot be changed without restarting the server",
+ gconf->name)));
+ }
+ continue;
+ }
+
+ /*
+ * Reset any "file" sources to "default", else set_config_option
+ * will not override those settings.
+ */
+ if (gconf->reset_source == GTMC_S_FILE)
+ gconf->reset_source = GTMC_S_DEFAULT;
+ if (gconf->source == GTMC_S_FILE)
+ gconf->source = GTMC_S_DEFAULT;
+ for (stack = gconf->stack; stack; stack = stack->prev)
+ {
+ if (stack->source == GTMC_S_FILE)
+ stack->source = GTMC_S_DEFAULT;
+ }
+
+ /* Now we can re-apply the wired-in default (i.e., the boot_val) */
+ set_config_option(gconf->name, NULL, context, GTMC_S_DEFAULT,
+ true);
+ if (context == GTMC_SIGHUP)
+ {
+ if (isStartUp)
+ {
+ write_stderr("parameter \"%s\" removed from configuration file, reset to default\n",
+ gconf->name);
+ }
+ else
+ {
+ ereport(elevel,
+ (errmsg("parameter \"%s\" removed from configuration file, reset to default",
+ gconf->name)));
+ }
+ }
+ }
+
+ /*
+ * Restore any variables determined by environment variables or
+ * dynamically-computed defaults. This is a no-op except in the case
+ * where one of these had been in the config file and is now removed.
+ *
+ * In particular, we *must not* do this during the postmaster's
+ * initial loading of the file, since the timezone functions in
+ * particular should be run only after initialization is complete.
+ *
+ * XXX this is an unmaintainable crock, because we have to know how
+ * to set (or at least what to call to set) every variable that could
+ * potentially have GTMC_S_DYNAMIC_DEFAULT or GTMC_S_ENV_VAR source.
+ * However, there's no time to redesign it for 9.1.
+ */
+
+ /* If we got here all the options checked out okay, so apply them. */
+ for (item = head; item; item = item->next)
+ {
+ /*
+ * NOTE(review): pre_value is never assigned a non-NULL value in this
+ * port, so the "changed to" report below is currently dead code
+ * (leftover from guc.c, where pre_value captured the prior setting).
+ */
+ char *pre_value = NULL;
+
+ if (set_config_option(item->name, item->value, context,
+ GTMC_S_FILE, true))
+ {
+ set_config_sourcefile(item->name, item->filename,
+ item->sourceline);
+
+ if (pre_value)
+ {
+ const char *post_value = GetConfigOption(item->name, false);
+
+ if (!post_value)
+ post_value = "";
+ if (strcmp(pre_value, post_value) != 0)
+ {
+ if (isStartUp)
+ {
+ write_stderr("parameter \"%s\" changed to \"%s\"\n",
+ item->name, item->value);
+ }
+ else
+ {
+ ereport(elevel,
+ (errmsg("parameter \"%s\" changed to \"%s\"",
+ item->name, item->value)));
+ }
+ }
+ }
+ }
+
+ if (pre_value)
+ free(pre_value);
+ }
+
+ /* PGXCTODO: configuration file reload time update */
+
+cleanup_list:
+ FreeConfigVariables(head);
+ if (cvc)
+ free(cvc);
+}
+
+/*
+ * ParseConfigFile: open and parse one configuration file by name.
+ *
+ * Thin wrapper around ParseConfigFp (see below for the full contract):
+ * it resolves a relative config_file path (against the directory of
+ * calling_file, or against data_directory when calling_file is NULL),
+ * opens the file with fopen, and hands the stream to ParseConfigFp.
+ * Returns false, after reporting at elevel, if the file cannot be
+ * opened or the include nesting depth exceeds 10.
+ */
+bool
+ParseConfigFile(const char *config_file, const char *calling_file,
+ int depth, int elevel,
+ ConfigVariable **head_p,
+ ConfigVariable **tail_p)
+{
+ bool OK = true;
+ FILE *fp;
+ char abs_path[MAXPGPATH];
+
+ /*
+ * Reject too-deep include nesting depth. This is just a safety check
+ * to avoid dumping core due to stack overflow if an include file loops
+ * back to itself. The maximum nesting depth is pretty arbitrary.
+ */
+ if (depth > 10)
+ {
+ if (isStartUp)
+ {
+ write_stderr("could not open configuration file \"%s\": maximum nesting depth exceeded\n",
+ config_file);
+ }
+ else
+ {
+ ereport(elevel,
+ (0,
+ errmsg("could not open configuration file \"%s\": maximum nesting depth exceeded",
+ config_file)));
+ }
+ return false;
+ }
+
+ /*
+ * If config_file is a relative path, convert to absolute. We consider
+ * it to be relative to the directory holding the calling file.
+ */
+ if (!is_absolute_path(config_file))
+ {
+ if (calling_file != NULL)
+ {
+ strlcpy(abs_path, calling_file, sizeof(abs_path));
+ get_parent_directory(abs_path);
+ join_path_components(abs_path, abs_path, config_file);
+ canonicalize_path(abs_path);
+ config_file = abs_path;
+ }
+ else
+ {
+ /*
+ * calling_file is NULL, we make an absolute path from $PGDATA
+ */
+ join_path_components(abs_path, data_directory, config_file);
+ canonicalize_path(abs_path);
+ config_file = abs_path;
+ }
+ }
+
+ fp = fopen(config_file, "r");
+ if (!fp)
+ {
+ if (isStartUp)
+ {
+ write_stderr("could not open configuration file \"%s\": %m\n",
+ config_file);
+ }
+ else
+ {
+ ereport(elevel,
+ (0,
+ errmsg("could not open configuration file \"%s\": %m",
+ config_file)));
+ }
+ return false;
+ }
+
+ OK = ParseConfigFp(fp, config_file, depth, elevel, head_p, tail_p);
+
+ fclose(fp);
+
+ return OK;
+}
+
+/*
+ * Read and parse a single configuration file. This function recurses
+ * to handle "include" directives.
+ *
+ * Input parameters:
+ * fp: file pointer (from fopen, via ParseConfigFile) for the file to parse
+ * config_file: absolute or relative path of file to read
+ * depth: recursion depth (used only to prevent infinite recursion)
+ * elevel: error logging level determined by ProcessConfigFile()
+ * Output parameters:
+ * head_p, tail_p: head and tail of linked list of name/value pairs
+ *
+ * *head_p and *tail_p must be initialized to NULL before calling the outer
+ * recursion level. On exit, they contain a list of name-value pairs read
+ * from the input file(s).
+ *
+ * Returns TRUE if successful, FALSE if an error occurred. The error has
+ * already been ereport'd, it is only necessary for the caller to clean up
+ * its own state and release the name/value pairs list.
+ *
+ * Note: if elevel >= ERROR then an error will not return control to the
+ * caller, and internal state such as open files will not be cleaned up.
+ * This case occurs only during postmaster or standalone-backend startup,
+ * where an error will lead to immediate process exit anyway; so there is
+ * no point in contorting the code so it can clean up nicely.
+ *
+ * NOTE(review): strdup/malloc results below are not checked for NULL, so
+ * an out-of-memory condition would dereference NULL; also, opt_name and
+ * opt_value leak on the parse_error path. Both inherited from the port.
+ */
+bool
+ParseConfigFp(FILE *fp, const char *config_file, int depth, int elevel,
+ ConfigVariable **head_p, ConfigVariable **tail_p)
+{
+ bool OK = true;
+ YY_BUFFER_STATE lex_buffer;
+ int token;
+
+ /*
+ * Parse
+ */
+ lex_buffer = GTMOPT_yy_create_buffer(fp, YY_BUF_SIZE);
+ GTMOPT_yy_switch_to_buffer(lex_buffer);
+
+ ConfigFileLineno = 1;
+
+ /* This loop iterates once per logical line */
+ while ((token = GTMOPT_yylex()))
+ {
+ char *opt_name, *opt_value;
+ ConfigVariable *item;
+
+ if (token == GTMOPT_EOL) /* empty or comment line */
+ continue;
+
+ /* first token on line is option name */
+ if (token != GTMOPT_ID && token != GTMOPT_QUALIFIED_ID)
+ goto parse_error;
+ opt_name = strdup(GTMOPT_yytext);
+
+ /* next we have an optional equal sign; discard if present */
+ token = GTMOPT_yylex();
+ if (token == GTMOPT_EQUALS)
+ token = GTMOPT_yylex();
+
+ /* now we must have the option value */
+ if (token != GTMOPT_ID &&
+ token != GTMOPT_STRING &&
+ token != GTMOPT_INTEGER &&
+ token != GTMOPT_REAL &&
+ token != GTMOPT_UNQUOTED_STRING)
+ goto parse_error;
+ if (token == GTMOPT_STRING) /* strip quotes and escapes */
+ opt_value = GTMOPT_scanstr(GTMOPT_yytext);
+ else
+ opt_value = strdup(GTMOPT_yytext);
+
+ /* now we'd like an end of line, or possibly EOF */
+ token = GTMOPT_yylex();
+ if (token != GTMOPT_EOL)
+ {
+ if (token != 0)
+ goto parse_error;
+ /* treat EOF like \n for line numbering purposes, cf bug 4752 */
+ ConfigFileLineno++;
+ }
+
+ /* OK, process the option name and value */
+ if (gtm_opt_name_compare(opt_name, "include") == 0)
+ {
+ /*
+ * An include directive isn't a variable and should be processed
+ * immediately.
+ */
+ unsigned int save_ConfigFileLineno = ConfigFileLineno;
+
+ if (!ParseConfigFile(opt_value, config_file,
+ depth + 1, elevel,
+ head_p, tail_p))
+ {
+ free(opt_name);
+ free(opt_value);
+ OK = false;
+ goto cleanup_exit;
+ }
+ /* the recursive call switched the lexer to the included file; restore */
+ GTMOPT_yy_switch_to_buffer(lex_buffer);
+ ConfigFileLineno = save_ConfigFileLineno;
+ free(opt_name);
+ free(opt_value);
+ }
+ else if (gtm_opt_name_compare(opt_name, "custom_variable_classes") == 0)
+ {
+ /*
+ * This variable must be processed first as it controls
+ * the validity of other variables; so it goes at the head
+ * of the result list. If we already found a value for it,
+ * replace with this one.
+ */
+ item = *head_p;
+ if (item != NULL &&
+ gtm_opt_name_compare(item->name, "custom_variable_classes") == 0)
+ {
+ /* replace existing head item */
+ free(item->name);
+ free(item->value);
+ item->name = opt_name;
+ item->value = opt_value;
+ item->filename = strdup(config_file);
+ item->sourceline = ConfigFileLineno-1;
+ }
+ else
+ {
+ /* prepend to list */
+ item = malloc(sizeof *item);
+ item->name = opt_name;
+ item->value = opt_value;
+ item->filename = strdup(config_file);
+ item->sourceline = ConfigFileLineno-1;
+ item->next = *head_p;
+ *head_p = item;
+ if (*tail_p == NULL)
+ *tail_p = item;
+ }
+ }
+ else
+ {
+ /* ordinary variable, append to list */
+ item = malloc(sizeof *item);
+ item->name = opt_name;
+ item->value = opt_value;
+ item->filename = strdup(config_file);
+ item->sourceline = ConfigFileLineno-1;
+ item->next = NULL;
+ if (*head_p == NULL)
+ *head_p = item;
+ else
+ (*tail_p)->next = item;
+ *tail_p = item;
+ }
+
+ /* break out of loop if read EOF, else loop for next line */
+ if (token == 0)
+ break;
+ }
+
+ /* successful completion of parsing */
+ goto cleanup_exit;
+
+ parse_error:
+ if (token == GTMOPT_EOL || token == 0)
+ {
+ if (isStartUp)
+ {
+ write_stderr("syntax error in file \"%s\" line %u, near end of line\n",
+ config_file, ConfigFileLineno - 1);
+ }
+ else
+ {
+ ereport(elevel,
+ (0,
+ errmsg("syntax error in file \"%s\" line %u, near end of line",
+ config_file, ConfigFileLineno - 1)));
+ }
+ }
+ else
+ {
+ if (isStartUp)
+ {
+ write_stderr("syntax error in file \"%s\" line %u, near token \"%s\"\n",
+ config_file, ConfigFileLineno, GTMOPT_yytext);
+ }
+ else
+ {
+ ereport(elevel,
+ (0,
+ errmsg("syntax error in file \"%s\" line %u, near token \"%s\"",
+ config_file, ConfigFileLineno, GTMOPT_yytext)));
+ }
+ }
+ OK = false;
+
+cleanup_exit:
+ GTMOPT_yy_delete_buffer(lex_buffer);
+ return OK;
+}
+
+
+/*
+ * Free a list of ConfigVariables, including the names and the values
+ *
+ * Safe to call with list == NULL (the loop simply does not run), which
+ * is how ProcessConfigFile's cleanup path uses it.
+ */
+void
+FreeConfigVariables(ConfigVariable *list)
+{
+ ConfigVariable *item;
+
+ item = list;
+ while (item)
+ {
+ /* grab the successor before freeing the node itself */
+ ConfigVariable *next = item->next;
+
+ free(item->name);
+ free(item->value);
+ free(item->filename);
+ free(item);
+ item = next;
+ }
+}
+
+
+/*
+ * scanstr
+ *
+ * Strip the quotes surrounding the given string, and collapse any embedded
+ * '' sequences and backslash escapes.
+ *
+ * the string returned is malloc'd and should eventually be free'd by the
+ * caller.
+ *
+ * Precondition (asserted): s is a single-quoted token of length >= 2,
+ * i.e. it begins and ends with a ' character, as produced by the lexer.
+ *
+ * NOTE(review): the malloc result is not checked for NULL, so an
+ * out-of-memory condition would crash here rather than report cleanly.
+ */
+static char *
+GTMOPT_scanstr(const char *s)
+{
+ char *newStr;
+ int len,
+ i,
+ j;
+
+ Assert(s != NULL && s[0] == '\'');
+ len = strlen(s);
+ Assert(len >= 2);
+ Assert(s[len-1] == '\'');
+
+ /* Skip the leading quote; we'll handle the trailing quote below */
+ s++, len--;
+
+ /* Since len still includes trailing quote, this is enough space */
+ newStr = malloc(len);
+
+ for (i = 0, j = 0; i < len; i++)
+ {
+ if (s[i] == '\\')
+ {
+ i++;
+ switch (s[i])
+ {
+ case 'b':
+ newStr[j] = '\b';
+ break;
+ case 'f':
+ newStr[j] = '\f';
+ break;
+ case 'n':
+ newStr[j] = '\n';
+ break;
+ case 'r':
+ newStr[j] = '\r';
+ break;
+ case 't':
+ newStr[j] = '\t';
+ break;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ {
+ /* octal escape: consume up to three octal digits */
+ int k;
+ long octVal = 0;
+
+ for (k = 0;
+ s[i + k] >= '0' && s[i + k] <= '7' && k < 3;
+ k++)
+ octVal = (octVal << 3) + (s[i + k] - '0');
+ i += k - 1;
+ newStr[j] = ((char) octVal);
+ }
+ break;
+ default:
+ /* unrecognized escape: keep the escaped character literally */
+ newStr[j] = s[i];
+ break;
+ } /* switch */
+ }
+ else if (s[i] == '\'' && s[i+1] == '\'')
+ {
+ /* doubled quote becomes just one quote */
+ newStr[j] = s[++i];
+ }
+ else
+ newStr[j] = s[i];
+ j++;
+ }
+
+ /* We copied the ending quote to newStr, so replace with \0 */
+ Assert(j > 0 && j <= len);
+ newStr[--j] = '\0';
+
+ return newStr;
+}
+
+/*
+ * The following code includes most of the code ported from guc.c.
+ * Because they should be shared by gtm_opt.c and gtm_proxy_opt.c, they are placed here.
+ */
+
+/*
+ * Some infrastructure for checking malloc/strdup/realloc calls
+ */
+/*
+ * malloc wrapper: on failure reports "out of memory" at elevel (or to
+ * stderr during startup) and returns NULL; callers must still check the
+ * result unless elevel is FATAL/ERROR (which does not return).
+ */
+static void *
+gtm_opt_malloc(int elevel, size_t size)
+{
+ void *data;
+
+ data = malloc(size);
+ if (data == NULL)
+ {
+ if (isStartUp)
+ {
+ write_stderr("out of memory\n");
+ }
+ else
+ {
+ ereport(elevel,
+ (0,
+ errmsg("out of memory")));
+ }
+ }
+ return data;
+}
+
+#if 0
+/* PGXCTODO: this will be used for future extensions */
+/*
+ * realloc wrapper, same error-reporting convention as gtm_opt_malloc;
+ * compiled out until placeholder-variable support is enabled.
+ */
+static void *
+gtm_opt_realloc(int elevel, void *old, size_t size)
+{
+ void *data;
+
+ data = realloc(old, size);
+ if (data == NULL)
+ {
+ if (isStartUp)
+ {
+ write_stderr("out of memory\n");
+ }
+ else
+ {
+ ereport(elevel,
+ (0,
+ errmsg("out of memory")));
+ }
+ }
+ return data;
+}
+#endif
+
+/*
+ * strdup wrapper, same error-reporting convention as gtm_opt_malloc:
+ * reports "out of memory" at elevel and returns NULL on failure.
+ */
+static char *
+gtm_opt_strdup(int elevel, const char *src)
+{
+ char *data;
+
+ data = strdup(src);
+ if (data == NULL)
+ {
+ if (isStartUp)
+ {
+ write_stderr("out of memory\n");
+ }
+ else
+ {
+ ereport(elevel,
+ (0,
+ errmsg("out of memory")));
+ }
+ }
+ return data;
+}
+
+/*
+ * Detect whether strval is referenced anywhere in a GTM string item
+ *
+ * Compares by pointer identity (not string contents): the same malloc'd
+ * buffer may be shared between the current value, reset/boot values, and
+ * stacked states, and must not be freed while any of them points at it.
+ */
+static bool
+string_field_used(struct config_string * conf, char *strval)
+{
+ GtmOptStack *stack;
+
+ if (strval == *(conf->variable) ||
+ strval == conf->reset_val ||
+ strval == conf->boot_val)
+ return true;
+ for (stack = conf->gen.stack; stack; stack = stack->prev)
+ {
+ if (strval == stack->prior.val.stringval ||
+ strval == stack->masked.val.stringval)
+ return true;
+ }
+ return false;
+}
+
+
+/*
+ * Support for assigning to a field of a string GTM item. Free the prior
+ * value if it's not referenced anywhere else in the item (including stacked
+ * states).
+ *
+ * newval may alias a buffer already referenced elsewhere in conf; the
+ * string_field_used check keeps such shared buffers alive.
+ */
+static void
+set_string_field(struct config_string * conf, char **field, char *newval)
+{
+ char *oldval = *field;
+
+ /* Do the assignment */
+ *field = newval;
+
+ /* Free old value if it's not NULL and isn't referenced anymore */
+ if (oldval && !string_field_used(conf, oldval))
+ free(oldval);
+}
+
+
+/*
+ * Detect whether an "extra" struct is referenced anywhere in a GTM item
+ *
+ * As with string_field_used, comparison is by pointer identity: the same
+ * "extra" allocation may be shared by the active value, the reset value,
+ * and stacked states, and must not be freed while referenced.
+ */
+static bool
+extra_field_used(struct config_generic * gconf, void *extra)
+{
+ GtmOptStack *stack;
+
+ if (extra == gconf->extra)
+ return true;
+ /* reset_extra lives in the vartype-specific struct, so switch on type */
+ switch (gconf->vartype)
+ {
+ case GTMC_BOOL:
+ if (extra == ((struct config_bool *) gconf)->reset_extra)
+ return true;
+ break;
+ case GTMC_INT:
+ if (extra == ((struct config_int *) gconf)->reset_extra)
+ return true;
+ break;
+ case GTMC_REAL:
+ if (extra == ((struct config_real *) gconf)->reset_extra)
+ return true;
+ break;
+ case GTMC_STRING:
+ if (extra == ((struct config_string *) gconf)->reset_extra)
+ return true;
+ break;
+ case GTMC_ENUM:
+ if (extra == ((struct config_enum *) gconf)->reset_extra)
+ return true;
+ break;
+ }
+ for (stack = gconf->stack; stack; stack = stack->prev)
+ {
+ if (extra == stack->prior.extra ||
+ extra == stack->masked.extra)
+ return true;
+ }
+
+ return false;
+}
+
+
+/*
+ * Support for assigning to an "extra" field of a GTM item. Free the prior
+ * value if it's not referenced anywhere else in the item (including stacked
+ * states).
+ *
+ * Mirrors set_string_field, but for the opaque per-value "extra" struct.
+ */
+static void
+set_extra_field(struct config_generic * gconf, void **field, void *newval)
+{
+ void *oldval = *field;
+
+ /* Do the assignment */
+ *field = newval;
+
+ /* Free old value if it's not NULL and isn't referenced anymore */
+ if (oldval && !extra_field_used(gconf, oldval))
+ free(oldval);
+}
+
+
+/*
+ * Support for copying a variable's active value into a stack entry.
+ * The "extra" field associated with the active value is copied, too.
+ *
+ * NB: be sure stringval and extra fields of a new stack entry are
+ * initialized to NULL before this is used, else we'll try to free() them.
+ */
+static void
+set_stack_value(struct config_generic * gconf, config_var_value *val)
+{
+ switch (gconf->vartype)
+ {
+ case GTMC_BOOL:
+ val->val.boolval =
+ *((struct config_bool *) gconf)->variable;
+ break;
+ case GTMC_INT:
+ val->val.intval =
+ *((struct config_int *) gconf)->variable;
+ break;
+ case GTMC_REAL:
+ val->val.realval =
+ *((struct config_real *) gconf)->variable;
+ break;
+ case GTMC_STRING:
+ /* strings go through set_string_field so shared buffers refcount correctly */
+ set_string_field((struct config_string *) gconf,
+ &(val->val.stringval),
+ *((struct config_string *) gconf)->variable);
+ break;
+ case GTMC_ENUM:
+ val->val.enumval =
+ *((struct config_enum *) gconf)->variable;
+ break;
+ }
+ set_extra_field(gconf, &(val->extra), gconf->extra);
+}
+
+#if 0
+/* PGXCTODO: This is left for future extension support */
+/*
+ * Support for discarding a no-longer-needed value in a stack entry.
+ * The "extra" field associated with the stack entry is cleared, too.
+ */
+static void
+discard_stack_value(struct config_generic * gconf, config_var_value *val)
+{
+ switch (gconf->vartype)
+ {
+ case GTMC_BOOL:
+ case GTMC_INT:
+ case GTMC_REAL:
+ case GTMC_ENUM:
+ /* no need to do anything */
+ break;
+ case GTMC_STRING:
+ set_string_field((struct config_string *) gconf,
+ &(val->val.stringval),
+ NULL);
+ break;
+ }
+ set_extra_field(gconf, &(val->extra), NULL);
+}
+#endif
+
+/*
+ * Fetch the sorted array pointer (exported for help_config.c's use ONLY)
+ *
+ * Returns the live array built by build_gtm_opt_variables; callers must
+ * not free or reorder it.
+ */
+struct config_generic **
+get_gtm_opt_variables(void)
+{
+ return gtm_opt_variables;
+}
+
+/*
+ * Build the sorted array. This is split out so that it could be
+ * re-executed after startup (eg, we could allow loadable modules to
+ * add vars, and then we'd need to re-sort).
+ *
+ * Walks the per-type option tables (ConfigureNamesBool/Int/Real/String/
+ * Enum, each terminated by a NULL gen.name entry), stamps each entry's
+ * vartype, and collects pointers to all of them into gtm_opt_variables,
+ * sorted by name for bsearch in find_option().
+ */
+void
+build_gtm_opt_variables(void)
+{
+ int size_vars;
+ int num_vars = 0;
+ struct config_generic **gtm_opt_vars;
+ int i;
+
+ /* First pass: stamp vartypes and count entries in every table */
+ for (i = 0; ConfigureNamesBool[i].gen.name; i++)
+ {
+ struct config_bool *conf = &ConfigureNamesBool[i];
+
+ /* Rather than requiring vartype to be filled in by hand, do this: */
+ conf->gen.vartype = GTMC_BOOL;
+ num_vars++;
+ }
+
+ for (i = 0; ConfigureNamesInt[i].gen.name; i++)
+ {
+ struct config_int *conf = &ConfigureNamesInt[i];
+
+ conf->gen.vartype = GTMC_INT;
+ num_vars++;
+ }
+
+ for (i = 0; ConfigureNamesReal[i].gen.name; i++)
+ {
+ struct config_real *conf = &ConfigureNamesReal[i];
+
+ conf->gen.vartype = GTMC_REAL;
+ num_vars++;
+ }
+
+ for (i = 0; ConfigureNamesString[i].gen.name; i++)
+ {
+ struct config_string *conf = &ConfigureNamesString[i];
+
+ conf->gen.vartype = GTMC_STRING;
+ num_vars++;
+ }
+
+ for (i = 0; ConfigureNamesEnum[i].gen.name; i++)
+ {
+ struct config_enum *conf = &ConfigureNamesEnum[i];
+
+ conf->gen.vartype = GTMC_ENUM;
+ num_vars++;
+ }
+
+ /*
+ * Create table with 20% slack
+ */
+ size_vars = num_vars + num_vars / 4;
+
+ /* FATAL here: if we can't build the option table we can't run at all */
+ gtm_opt_vars = (struct config_generic **)
+ gtm_opt_malloc(FATAL, size_vars * sizeof(struct config_generic *));
+
+ num_vars = 0;
+
+ /* Second pass: collect pointers to every entry */
+ for (i = 0; ConfigureNamesBool[i].gen.name; i++)
+ gtm_opt_vars[num_vars++] = &ConfigureNamesBool[i].gen;
+
+ for (i = 0; ConfigureNamesInt[i].gen.name; i++)
+ gtm_opt_vars[num_vars++] = &ConfigureNamesInt[i].gen;
+
+ for (i = 0; ConfigureNamesReal[i].gen.name; i++)
+ gtm_opt_vars[num_vars++] = &ConfigureNamesReal[i].gen;
+
+ for (i = 0; ConfigureNamesString[i].gen.name; i++)
+ gtm_opt_vars[num_vars++] = &ConfigureNamesString[i].gen;
+
+ for (i = 0; ConfigureNamesEnum[i].gen.name; i++)
+ gtm_opt_vars[num_vars++] = &ConfigureNamesEnum[i].gen;
+
+ /* Replace any previous array, then sort by name for bsearch */
+ if (gtm_opt_variables)
+ free(gtm_opt_variables);
+ gtm_opt_variables = gtm_opt_vars;
+ num_gtm_opt_variables = num_vars;
+ size_gtm_opt_variables = size_vars;
+ qsort((void *) gtm_opt_variables, num_gtm_opt_variables,
+ sizeof(struct config_generic *), gtm_opt_var_compare);
+}
+
+
+#if 0
+/* PGXCTODO: This is left for future extension support */
+/*
+ * Add a new GTM variable to the list of known variables. The
+ * list is expanded if needed.
+ */
+static bool
+add_gtm_opt_variable(struct config_generic * var, int elevel)
+{
+ if (num_gtm_opt_variables + 1 >= size_gtm_opt_variables)
+ {
+ /*
+ * Increase the vector by 25%
+ */
+ int size_vars = size_gtm_opt_variables + size_gtm_opt_variables / 4;
+ struct config_generic **gtm_opt_vars;
+
+ if (size_vars == 0)
+ {
+ size_vars = 100;
+ gtm_opt_vars = (struct config_generic **)
+ gtm_opt_malloc(elevel, size_vars * sizeof(struct config_generic *));
+ }
+ else
+ {
+ gtm_opt_vars = (struct config_generic **)
+ gtm_opt_realloc(elevel, gtm_opt_variables, size_vars * sizeof(struct config_generic *));
+ }
+
+ if (gtm_opt_vars == NULL)
+ return false; /* out of memory */
+
+ gtm_opt_variables = gtm_opt_vars;
+ size_gtm_opt_variables = size_vars;
+ }
+ gtm_opt_variables[num_gtm_opt_variables++] = var;
+ qsort((void *) gtm_opt_variables, num_gtm_opt_variables,
+ sizeof(struct config_generic *), gtm_opt_var_compare);
+ return true;
+}
+
+
+/*
+ * Create and add a placeholder variable. It's presumed to belong
+ * to a valid custom variable class at this point.
+ */
+static struct config_generic *
+add_placeholder_variable(const char *name, int elevel)
+{
+ size_t sz = sizeof(struct config_string) + sizeof(char *);
+ struct config_string *var;
+ struct config_generic *gen;
+
+ var = (struct config_string *) gtm_opt_malloc(elevel, sz);
+ if (var == NULL)
+ return NULL;
+ memset(var, 0, sz);
+ gen = &var->gen;
+
+ gen->name = gtm_opt_strdup(elevel, name);
+ if (gen->name == NULL)
+ {
+ free(var);
+ return NULL;
+ }
+
+ gen->context = GTMC_USERSET;
+ gen->short_desc = "GTM placeholder variable";
+ gen->flags = GTMOPT_NO_SHOW_ALL | GTMOPT_NOT_IN_SAMPLE | GTMOPT_CUSTOM_PLACEHOLDER;
+ gen->vartype = GTMC_STRING;
+
+ /*
+ * The char* is allocated at the end of the struct since we have no
+ * 'static' place to point to. Note that the current value, as well as
+ * the boot and reset values, start out NULL.
+ */
+ var->variable = (char **) (var + 1);
+
+ if (!add_gtm_opt_variable((struct config_generic *) var, elevel))
+ {
+ free((void *) gen->name);
+ free(var);
+ return NULL;
+ }
+
+ return gen;
+}
+#endif
+
+/*
+ * Look up option NAME. If it exists, return a pointer to its record,
+ * else return NULL. If create_placeholders is TRUE, we'll create a
+ * placeholder record for a valid-looking custom variable name.
+ *
+ * NOTE(review): create_placeholders is currently ignored — the
+ * placeholder machinery (add_placeholder_variable) is compiled out
+ * under #if 0 above, so this only ever does the bsearch.  elevel is
+ * likewise unused at present.
+ */
+static struct config_generic *
+find_option(const char *name, bool create_placeholders, int elevel)
+{
+ const char **key = &name;
+ struct config_generic **res;
+
+ Assert(name);
+
+ /*
+ * By equating const char ** with struct config_generic *, we are assuming
+ * the name field is first in config_generic.
+ */
+ res = (struct config_generic **) bsearch((void *) &key,
+ (void *) gtm_opt_variables,
+ num_gtm_opt_variables,
+ sizeof(struct config_generic *),
+ gtm_opt_var_compare);
+ if (res)
+ return *res;
+
+ /* Unknown name */
+ return NULL;
+}
+
+
+/*
+ * comparator for qsorting and bsearching gtm_opt_variables array
+ *
+ * a and b point to elements of the array, i.e. to config_generic
+ * pointers; delegates to the case-insensitive name comparison below.
+ */
+static int
+gtm_opt_var_compare(const void *a, const void *b)
+{
+ struct config_generic *confa = *(struct config_generic **) a;
+ struct config_generic *confb = *(struct config_generic **) b;
+
+ return gtm_opt_name_compare(confa->name, confb->name);
+}
+
+
+/*
+ * the bare comparison function for GTM names
+ *
+ * Returns <0, 0, or >0 in strcmp fashion, comparing ASCII-case-folded
+ * names; a shorter string that is a prefix of the other sorts first.
+ */
+static int
+gtm_opt_name_compare(const char *namea, const char *nameb)
+{
+ /*
+ * The temptation to use strcasecmp() here must be resisted, because the
+ * array ordering has to remain stable across setlocale() calls. So, build
+ * our own with a simple ASCII-only downcasing.
+ */
+ while (*namea && *nameb)
+ {
+ char cha = *namea++;
+ char chb = *nameb++;
+
+ if (cha >= 'A' && cha <= 'Z')
+ cha += 'a' - 'A';
+ if (chb >= 'A' && chb <= 'Z')
+ chb += 'a' - 'A';
+ if (cha != chb)
+ return cha - chb;
+ }
+ if (*namea)
+ return 1; /* a is longer */
+ if (*nameb)
+ return -1; /* b is longer */
+ return 0;
+}
+
+
+/*
+ * Initialize GTM options during program startup.
+ *
+ * Note that we cannot read the config file yet, since we have not yet
+ * processed command-line switches.
+ *
+ * Builds the sorted option array, loads every option's compiled-in
+ * default, and disables option reporting until a later phase turns it on.
+ */
+void
+InitializeGTMOptions(void)
+{
+ int i;
+
+ /*
+ * Build sorted array of all GTM variables.
+ */
+ build_gtm_opt_variables();
+
+ /*
+ * Load all variables with their compiled-in defaults, and initialize
+ * status fields as needed.
+ */
+ for (i = 0; i < num_gtm_opt_variables; i++)
+ {
+ InitializeOneGTMOption(gtm_opt_variables[i]);
+ }
+
+ reporting_enabled = false;
+
+}
+
+
+/*
+ * Initialize one GTM option variable to its compiled-in default.
+ *
+ * Note: the reason for calling check_hooks is not that we think the boot_val
+ * might fail, but that the hooks might wish to compute an "extra" struct.
+ *
+ * Resets all generic bookkeeping (source, stack, extra, sourcefile) and
+ * copies boot_val into both the live variable and reset_val, switching on
+ * vartype for the type-specific layout.  Int/real defaults are asserted
+ * to lie within the declared min/max range.
+ */
+static void
+InitializeOneGTMOption(struct config_generic * gconf)
+{
+ gconf->status = 0;
+ gconf->reset_source = GTMC_S_DEFAULT;
+ gconf->source = GTMC_S_DEFAULT;
+ gconf->stack = NULL;
+ gconf->extra = NULL;
+ gconf->sourcefile = NULL;
+ gconf->sourceline = 0;
+ gconf->context = GTMC_DEFAULT;
+
+ switch (gconf->vartype)
+ {
+ case GTMC_BOOL:
+ {
+ struct config_bool *conf = (struct config_bool *) gconf;
+ bool newval = conf->boot_val;
+ void *extra = NULL;
+
+ *conf->variable = conf->reset_val = newval;
+ conf->gen.extra = conf->reset_extra = extra;
+ break;
+ }
+ case GTMC_INT:
+ {
+ struct config_int *conf = (struct config_int *) gconf;
+ int newval = conf->boot_val;
+ void *extra = NULL;
+
+ Assert(newval >= conf->min);
+ Assert(newval <= conf->max);
+ *conf->variable = conf->reset_val = newval;
+ conf->gen.extra = conf->reset_extra = extra;
+ break;
+ }
+ case GTMC_REAL:
+ {
+ struct config_real *conf = (struct config_real *) gconf;
+ double newval = conf->boot_val;
+ void *extra = NULL;
+
+ Assert(newval >= conf->min);
+ Assert(newval <= conf->max);
+ *conf->variable = conf->reset_val = newval;
+ conf->gen.extra = conf->reset_extra = extra;
+ break;
+ }
+ case GTMC_STRING:
+ {
+ struct config_string *conf = (struct config_string *) gconf;
+ char *newval;
+ void *extra = NULL;
+
+ /* non-NULL boot_val must always get strdup'd */
+ if (conf->boot_val != NULL)
+ newval = gtm_opt_strdup(FATAL, conf->boot_val);
+ else
+ newval = NULL;
+
+ *conf->variable = conf->reset_val = newval;
+ conf->gen.extra = conf->reset_extra = extra;
+ break;
+ }
+ case GTMC_ENUM:
+ {
+ struct config_enum *conf = (struct config_enum *) gconf;
+ int newval = conf->boot_val;
+ void *extra = NULL;
+
+ *conf->variable = conf->reset_val = newval;
+ conf->gen.extra = conf->reset_extra = extra;
+ break;
+ }
+ }
+}
+
+
+/*
+ * Select the configuration file and data directory to be used, and
+ * do the initial read of the GTM configuration file.
+ *
+ * This is called after processing command-line switches.
+ *		userDoption is the -D switch value if any (NULL if unspecified).
+ *		progname is just for use in error messages.
+ *
+ * Returns true on success; on failure, prints a suitable error message
+ * to stderr and returns false.
+ */
+bool
+SelectConfigFiles(const char *userDoption, const char *progname)
+{
+	char	   *configdir;
+	char	   *fname;
+	struct stat stat_buf;
+
+	/* configdir is -D option, or $PGDATA if no -D */
+	if (userDoption)
+		configdir = make_absolute_path(userDoption);
+	else
+		configdir = NULL;
+
+	/*
+	 * Find the configuration file: if config_file was specified on the
+	 * command line, use it, else use configdir/config_filename. In any case
+	 * ensure the result is an absolute path, so that it will be interpreted
+	 * the same way by future backends.
+	 */
+	if (GTMConfigFileName)
+	{
+		if (GTMConfigFileName[0] == '/')
+			fname = make_absolute_path(GTMConfigFileName);
+		else
+		{
+			if (configdir)
+			{
+				/* +2: one for the '/' separator, one for the NUL */
+				fname = gtm_opt_malloc(FATAL,
+									   strlen(configdir) + strlen(GTMConfigFileName) + 2);
+				sprintf(fname, "%s/%s", configdir, GTMConfigFileName);
+			}
+			else
+				fname = make_absolute_path(GTMConfigFileName);
+		}
+	}
+	else if (configdir)
+	{
+		fname = gtm_opt_malloc(FATAL,
+							   strlen(configdir) + strlen(config_filename) + 2);
+		sprintf(fname, "%s/%s", configdir, config_filename);
+	}
+	else
+	{
+		/* configdir is necessarily NULL here, so nothing to free */
+		write_stderr("%s does not know where to find the server configuration file.\n"
+					 "You must specify the --config-file or -D invocation "
+					 "option or set the PGDATA environment variable.\n",
+					 progname);
+		return false;
+	}
+
+	/*
+	 * Set the config_file GTM variable to its final value, ensuring that it
+	 * can't be overridden later.  After this call GTMConfigFileName points
+	 * at the option's own copy, so our fname can be freed.
+	 */
+	SetConfigOption("config_file", fname, GTMC_STARTUP, GTMC_S_OVERRIDE);
+	free(fname);
+
+	/*
+	 * Now read the config file for the first time.
+	 */
+	if (stat(GTMConfigFileName, &stat_buf) != 0)
+	{
+		write_stderr("%s cannot access the server configuration file \"%s\": %s\n",
+					 progname, GTMConfigFileName, strerror(errno));
+		free(configdir);		/* was previously leaked on this error path */
+		return false;
+	}
+
+	ProcessConfigFile(GTMC_STARTUP);
+
+	free(configdir);
+
+	return true;
+}
+
+/*
+ * Reset all options to their saved default values (implements RESET ALL).
+ *
+ * Options flagged GTMOPT_NO_RESET_ALL are skipped, as are options that
+ * were never set beyond override level (source <= GTMC_S_OVERRIDE).
+ * For each remaining option the reset_val/reset_extra pair is copied back
+ * into the live variable, and the source reverts to reset_source.
+ */
+void
+ResetAllOptions(void)
+{
+	int			i;
+
+	for (i = 0; i < num_gtm_opt_variables; i++)
+	{
+		struct config_generic *gconf = gtm_opt_variables[i];
+
+		/* Don't reset if special exclusion from RESET ALL */
+		if (gconf->flags & GTMOPT_NO_RESET_ALL)
+			continue;
+		/* No need to reset if wasn't SET */
+		if (gconf->source <= GTMC_S_OVERRIDE)
+			continue;
+
+		/* Copy the saved default back into the live variable. */
+		switch (gconf->vartype)
+		{
+			case GTMC_BOOL:
+				{
+					struct config_bool *conf = (struct config_bool *) gconf;
+
+					*conf->variable = conf->reset_val;
+					set_extra_field(&conf->gen, &conf->gen.extra,
+									conf->reset_extra);
+					break;
+				}
+			case GTMC_INT:
+				{
+					struct config_int *conf = (struct config_int *) gconf;
+
+					*conf->variable = conf->reset_val;
+					set_extra_field(&conf->gen, &conf->gen.extra,
+									conf->reset_extra);
+					break;
+				}
+			case GTMC_REAL:
+				{
+					struct config_real *conf = (struct config_real *) gconf;
+
+					*conf->variable = conf->reset_val;
+					set_extra_field(&conf->gen, &conf->gen.extra,
+									conf->reset_extra);
+					break;
+				}
+			case GTMC_STRING:
+				{
+					struct config_string *conf = (struct config_string *) gconf;
+
+					/* set_string_field manages freeing of the old string */
+					set_string_field(conf, conf->variable, conf->reset_val);
+					set_extra_field(&conf->gen, &conf->gen.extra,
+									conf->reset_extra);
+					break;
+				}
+			case GTMC_ENUM:
+				{
+					struct config_enum *conf = (struct config_enum *) gconf;
+
+					*conf->variable = conf->reset_val;
+					set_extra_field(&conf->gen, &conf->gen.extra,
+									conf->reset_extra);
+					break;
+				}
+		}
+
+		gconf->source = gconf->reset_source;
+
+		/* Notify interested parties if this option is reportable. */
+		if (gconf->flags & GTMOPT_REPORT)
+			ReportGTMOption(gconf);
+	}
+}
+
+
+
+/*
+ * push_old_value
+ *		Push previous state during transactional assignment to a GTM variable.
+ *
+ * Records the variable's current value and source on its undo stack so the
+ * assignment can be rolled back when the current nest level exits.  At most
+ * one stack entry is made per variable per nest level.
+ */
+static void
+push_old_value(struct config_generic * gconf)
+{
+	GtmOptStack *stack;
+
+	/* If we're not inside a nest level, do nothing */
+	if (GTMOptUpdateCount == 0)
+		return;
+
+	/* Do we already have a stack entry of the current nest level? */
+	stack = gconf->stack;
+	if (stack && stack->nest_level >= GTMOptUpdateCount)
+		return;
+
+	/*
+	 * Push a new stack entry.
+	 *
+	 * NB: entries are allocated in TopMemoryContext (GTM has no
+	 * per-transaction context here), so they live until explicitly popped
+	 * and freed.
+	 */
+	stack = (GtmOptStack *) MemoryContextAllocZero(TopMemoryContext,
+												   sizeof(GtmOptStack));
+
+	stack->prev = gconf->stack;
+	stack->nest_level = GTMOptUpdateCount;
+	stack->source = gconf->source;
+	/* Capture the current value into stack->prior for possible restore. */
+	set_stack_value(gconf, &stack->prior);
+
+	gconf->stack = stack;
+}
+
+
+
+/*
+ * Enter a new nesting level for GTM values.  Called at subtransaction start
+ * and when entering a function that has proconfig settings.  NOTE that we
+ * must not risk error here, else subtransaction start will be unhappy.
+ *
+ * Returns the new (current) nest level.
+ */
+int
+NewGTMNestLevel(void)
+{
+	GTMOptUpdateCount++;
+	return GTMOptUpdateCount;
+}
+
+/*
+ * Try to parse value as an integer. The accepted formats are the
+ * usual decimal, octal, or hexadecimal formats, optionally followed by
+ * a unit name if "flags" indicates a unit is allowed.
+ *
+ * The unit suffix (kB/MB/GB or ms/s/min/h/d) is converted into the
+ * option's native unit, which is encoded in "flags" (GTMOPT_UNIT_*).
+ * Whitespace is permitted both before and after the unit.
+ *
+ * If the string parses okay, return true, else false.
+ * If okay and result is not NULL, return the value in *result.
+ * If not okay and hintmsg is not NULL, *hintmsg is set to a suitable
+ * HINT message, or NULL if no hint provided.
+ */
+bool
+parse_int(const char *value, int *result, int flags, const char **hintmsg)
+{
+	int64		val;
+	char	   *endptr;
+
+	/* To suppress compiler warnings, always set output params */
+	if (result)
+		*result = 0;
+	if (hintmsg)
+		*hintmsg = NULL;
+
+	/* We assume here that int64 is at least as wide as long */
+	errno = 0;
+	val = strtol(value, &endptr, 0);
+
+	if (endptr == value)
+		return false;			/* no HINT for integer syntax error */
+
+	/* Reject values that overflow long or don't fit in int32. */
+	if (errno == ERANGE || val != (int64) ((int32) val))
+	{
+		if (hintmsg)
+			*hintmsg = gettext_noop("Value exceeds integer range.");
+		return false;
+	}
+
+	/* allow whitespace between integer and unit */
+	while (isspace((unsigned char) *endptr))
+		endptr++;
+
+	/* Handle possible unit */
+	if (*endptr != '\0')
+	{
+		/*
+		 * Note: the multiple-switch coding technique here is a bit tedious,
+		 * but seems necessary to avoid intermediate-value overflows.
+		 */
+		if (flags & GTMOPT_UNIT_MEMORY)
+		{
+			/* Set hint for use if no match or trailing garbage */
+			if (hintmsg)
+				*hintmsg = gettext_noop("Valid units for this parameter are \"kB\", \"MB\", and \"GB\".");
+
+#if BLCKSZ < 1024 || BLCKSZ > (1024*1024)
+#error BLCKSZ must be between 1KB and 1MB
+#endif
+#if XLOG_BLCKSZ < 1024 || XLOG_BLCKSZ > (1024*1024)
+#error XLOG_BLCKSZ must be between 1KB and 1MB
+#endif
+
+			/*
+			 * Each unit is converted into the option's native memory unit.
+			 * A missing case means "no conversion needed" (e.g. kB input
+			 * for a kB-unit option).
+			 */
+			if (strncmp(endptr, "kB", 2) == 0)
+			{
+				endptr += 2;
+				switch (flags & GTMOPT_UNIT_MEMORY)
+				{
+					case GTMOPT_UNIT_BLOCKS:
+						val /= (BLCKSZ / 1024);
+						break;
+					case GTMOPT_UNIT_XBLOCKS:
+						val /= (XLOG_BLCKSZ / 1024);
+						break;
+				}
+			}
+			else if (strncmp(endptr, "MB", 2) == 0)
+			{
+				endptr += 2;
+				switch (flags & GTMOPT_UNIT_MEMORY)
+				{
+					case GTMOPT_UNIT_KB:
+						val *= KB_PER_MB;
+						break;
+					case GTMOPT_UNIT_BLOCKS:
+						val *= KB_PER_MB / (BLCKSZ / 1024);
+						break;
+					case GTMOPT_UNIT_XBLOCKS:
+						val *= KB_PER_MB / (XLOG_BLCKSZ / 1024);
+						break;
+				}
+			}
+			else if (strncmp(endptr, "GB", 2) == 0)
+			{
+				endptr += 2;
+				switch (flags & GTMOPT_UNIT_MEMORY)
+				{
+					case GTMOPT_UNIT_KB:
+						val *= KB_PER_GB;
+						break;
+					case GTMOPT_UNIT_BLOCKS:
+						val *= KB_PER_GB / (BLCKSZ / 1024);
+						break;
+					case GTMOPT_UNIT_XBLOCKS:
+						val *= KB_PER_GB / (XLOG_BLCKSZ / 1024);
+						break;
+				}
+			}
+		}
+		else if (flags & GTMOPT_UNIT_TIME)
+		{
+			/* Set hint for use if no match or trailing garbage */
+			if (hintmsg)
+				*hintmsg = gettext_noop("Valid units for this parameter are \"ms\", \"s\", \"min\", \"h\", and \"d\".");
+
+			/*
+			 * "ms" must be tested before "s", since strncmp with length 1
+			 * would otherwise match the 's' prefix of "ms" input too early.
+			 */
+			if (strncmp(endptr, "ms", 2) == 0)
+			{
+				endptr += 2;
+				switch (flags & GTMOPT_UNIT_TIME)
+				{
+					case GTMOPT_UNIT_S:
+						val /= MS_PER_S;
+						break;
+					case GTMOPT_UNIT_MIN:
+						val /= MS_PER_MIN;
+						break;
+				}
+			}
+			else if (strncmp(endptr, "s", 1) == 0)
+			{
+				endptr += 1;
+				switch (flags & GTMOPT_UNIT_TIME)
+				{
+					case GTMOPT_UNIT_MS:
+						val *= MS_PER_S;
+						break;
+					case GTMOPT_UNIT_MIN:
+						val /= S_PER_MIN;
+						break;
+				}
+			}
+			else if (strncmp(endptr, "min", 3) == 0)
+			{
+				endptr += 3;
+				switch (flags & GTMOPT_UNIT_TIME)
+				{
+					case GTMOPT_UNIT_MS:
+						val *= MS_PER_MIN;
+						break;
+					case GTMOPT_UNIT_S:
+						val *= S_PER_MIN;
+						break;
+				}
+			}
+			else if (strncmp(endptr, "h", 1) == 0)
+			{
+				endptr += 1;
+				switch (flags & GTMOPT_UNIT_TIME)
+				{
+					case GTMOPT_UNIT_MS:
+						val *= MS_PER_H;
+						break;
+					case GTMOPT_UNIT_S:
+						val *= S_PER_H;
+						break;
+					case GTMOPT_UNIT_MIN:
+						val *= MIN_PER_H;
+						break;
+				}
+			}
+			else if (strncmp(endptr, "d", 1) == 0)
+			{
+				endptr += 1;
+				switch (flags & GTMOPT_UNIT_TIME)
+				{
+					case GTMOPT_UNIT_MS:
+						val *= MS_PER_D;
+						break;
+					case GTMOPT_UNIT_S:
+						val *= S_PER_D;
+						break;
+					case GTMOPT_UNIT_MIN:
+						val *= MIN_PER_D;
+						break;
+				}
+			}
+		}
+
+		/* allow whitespace after unit */
+		while (isspace((unsigned char) *endptr))
+			endptr++;
+
+		if (*endptr != '\0')
+			return false;		/* appropriate hint, if any, already set */
+
+		/* Check for overflow due to units conversion */
+		if (val != (int64) ((int32) val))
+		{
+			if (hintmsg)
+				*hintmsg = gettext_noop("Value exceeds integer range.");
+			return false;
+		}
+	}
+
+	if (result)
+		*result = (int) val;
+	return true;
+}
+
+
+
+/*
+ * Try to parse value as a floating point number in the usual format.
+ *
+ * Trailing whitespace after the number is accepted; any other trailing
+ * character, an empty/unparsable string, or an out-of-range value makes
+ * the parse fail.
+ *
+ * If the string parses okay, return true, else false.
+ * If okay and result is not NULL, return the value in *result.
+ */
+bool
+parse_real(const char *value, double *result)
+{
+	char	   *after;
+	double		parsed;
+
+	if (result)
+		*result = 0;			/* suppress compiler warning */
+
+	errno = 0;
+	parsed = strtod(value, &after);
+	if (after == value || errno == ERANGE)
+		return false;
+
+	/* Skip trailing whitespace; anything left over is garbage. */
+	for (; isspace((unsigned char) *after); after++)
+		;
+	if (*after != '\0')
+		return false;
+
+	if (result)
+		*result = parsed;
+	return true;
+}
+
+
+
+/*
+ * Lookup the value for an enum option with the selected name
+ * (case-insensitive).
+ *
+ * On a match, stores the entry's value in *retval and returns TRUE;
+ * otherwise *retval is set to 0 and FALSE is returned.
+ */
+bool
+config_enum_lookup_by_name(struct config_enum * record, const char *value,
+						   int *retval)
+{
+	const struct config_enum_entry *opt;
+
+	*retval = 0;
+	for (opt = record->options; opt && opt->name; opt++)
+	{
+		if (pg_strcasecmp(value, opt->name) != 0)
+			continue;
+		*retval = opt->val;
+		return TRUE;
+	}
+	return FALSE;
+}
+
+
+
+/*
+ * Build a list of all available options for an enum, excluding hidden
+ * ones, separated by the given separator.
+ *
+ * If prefix is non-NULL, it is added before the first enum value.
+ * If suffix is non-NULL, it is added to the end of the string.
+ * The result is palloc'd StringInfo storage owned by the caller.
+ */
+static char *
+config_enum_get_options(struct config_enum * record, const char *prefix,
+						const char *suffix, const char *separator)
+{
+	const struct config_enum_entry *opt;
+	StringInfoData buf;
+	int			seplen = strlen(separator);
+
+	initStringInfo(&buf);
+	appendStringInfoString(&buf, prefix);
+
+	/* Emit "name<sep>" for every visible entry; trim the last sep below. */
+	for (opt = record->options; opt && opt->name; opt++)
+	{
+		if (opt->hidden)
+			continue;
+		appendStringInfoString(&buf, opt->name);
+		appendBinaryStringInfo(&buf, separator, seplen);
+	}
+
+	/*
+	 * Every entry may have been hidden, leaving the string empty if no
+	 * prefix was given.  That indicates a broken GTM setup (an enum with no
+	 * usable values), so we only guard against writing before the start of
+	 * the buffer rather than trying to do anything smarter.
+	 */
+	if (buf.len >= seplen)
+	{
+		/* Chop off the final separator. */
+		buf.len -= seplen;
+		buf.data[buf.len] = '\0';
+	}
+
+	appendStringInfoString(&buf, suffix);
+
+	return buf.data;
+}
+
+
+/*
+ * Sets option `name' to given value. The value should be a string
+ * which is going to be parsed and converted to the appropriate data
+ * type. The context and source parameters indicate in which context this
+ * function is being called so it can apply the access restrictions
+ * properly.
+ *
+ * If value is NULL, set the option to its default value (normally the
+ * reset_val, but if source == GTMC_S_DEFAULT we instead use the boot_val).
+ *
+ * action indicates whether to set the value globally in the session, locally
+ * to the current top transaction, or just for the duration of a function call.
+ *
+ * If changeVal is false then don't really set the option but do all
+ * the checks to see if it would work.
+ *
+ * If there is an error (non-existing option, invalid value) then an
+ * ereport(ERROR) is thrown *unless* this is called in a context where we
+ * don't want to ereport (currently, startup or SIGHUP config file reread).
+ * In that case we write a suitable error message via ereport(LOG) (or to
+ * stderr when isStartUp) and return false. This is working around the
+ * deficiencies in the ereport mechanism, so don't blame me. In all other
+ * cases, the function returns true, including cases where the input is
+ * valid but we chose not to apply it because of context or source-priority
+ * considerations.
+ *
+ * See also SetConfigOption for an external interface.
+ */
+bool
+set_config_option(const char *name, const char *value,
+				  GtmOptContext context, GtmOptSource source,
+				  bool changeVal)
+{
+	struct config_generic *record;
+	int			elevel;
+	bool		prohibitValueChange = false;
+	bool		makeDefault;
+
+	/* Choose how loudly to complain about problems, based on caller. */
+	if (context == GTMC_SIGHUP || source == GTMC_S_DEFAULT)
+	{
+		/*
+		 * To avoid cluttering the log, only the postmaster bleats loudly
+		 * about problems with the config file.
+		 */
+		elevel = DEBUG3;
+	}
+	else if (source == GTMC_S_DATABASE || source == GTMC_S_USER ||
+			 source == GTMC_S_DATABASE_USER)
+		elevel = WARNING;
+	else
+		elevel = ERROR;
+
+	record = find_option(name, true, elevel);
+	if (record == NULL)
+	{
+		if (isStartUp)
+		{
+			write_stderr("unrecognized configuration parameter \"%s\"\n", name);
+		}
+		else
+		{
+			ereport(elevel,
+					(0,
+					 errmsg("unrecognized configuration parameter \"%s\"", name)));
+		}
+		return false;
+	}
+
+	/*
+	 * If source is the config file, mark the found record with
+	 * GTMOPT_IS_IN_FILE. This is for the convenience of ProcessConfigFile. Note
+	 * that we do it even if changeVal is false, since ProcessConfigFile wants
+	 * the marking to occur during its testing pass.
+	 */
+	if (source == GTMC_S_FILE)
+		record->status |= GTMOPT_IS_IN_FILE;
+
+	/*
+	 * Check if the option can be set at this time. See guc.h for the precise
+	 * rules.
+	 */
+	switch (record->context)
+	{
+		case GTMC_DEFAULT:
+		case GTMC_STARTUP:
+			if (context == GTMC_SIGHUP)
+			{
+				/*
+				 * We are re-reading a startup-only variable from the config
+				 * file. We can't change the setting, so we should
+				 * give a warning if the DBA tries to change it. However,
+				 * because of variant formats, canonicalization by check
+				 * hooks, etc, we can't just compare the given string directly
+				 * to what's stored. Set a flag to check below after we have
+				 * the final storable value.
+				 *
+				 * During the "checking" pass we just do nothing, to avoid
+				 * printing the warning twice.
+				 */
+				if (!changeVal)
+					return true;
+
+				prohibitValueChange = true;
+			}
+			else if (context != GTMC_STARTUP)
+			{
+				if (isStartUp)
+				{
+					write_stderr("parameter \"%s\" cannot be changed without restarting the server\n",
+								 name);
+				}
+				else
+				{
+					ereport(elevel,
+							(0,
+							 errmsg("parameter \"%s\" cannot be changed without restarting the server",
+									name)));
+				}
+				return false;
+			}
+			break;
+		case GTMC_SIGHUP:
+			if (context != GTMC_SIGHUP && context != GTMC_STARTUP)
+			{
+				if (isStartUp)
+				{
+					write_stderr("parameter \"%s\" cannot be changed now\n",
+								 name);
+				}
+				else
+				{
+					ereport(elevel,
+							(0,
+							 errmsg("parameter \"%s\" cannot be changed now",
+									name)));
+				}
+				return false;
+			}
+
+			/*
+			 * Hmm, the idea of the SIGHUP context is "ought to be global, but
+			 * can be changed after postmaster start". But there's nothing
+			 * that prevents a crafty administrator from sending SIGHUP
+			 * signals to individual backends only.
+			 */
+			break;
+		default:
+			/* Shouldn't happen: the record carries an unknown context. */
+			if (isStartUp)
+			{
+				write_stderr("GtmOptContext invalid (%d)\n",
+							 context);
+			}
+			else
+			{
+				ereport(elevel,
+						(0,
+						 errmsg("GtmOptContext invalid (%d)",
+								context)));
+			}
+			return false;
+	}
+
+	/*
+	 * Should we set reset/stacked values? (If so, the behavior is not
+	 * transactional.) This is done either when we get a default value from
+	 * the database's/user's/client's default settings or when we reset a
+	 * value to its default.
+	 */
+	makeDefault = changeVal && (source <= GTMC_S_OVERRIDE) &&
+		((value != NULL) || source == GTMC_S_DEFAULT);
+
+	/*
+	 * Ignore attempted set if overridden by previously processed setting.
+	 * However, if changeVal is false then plow ahead anyway since we are
+	 * trying to find out if the value is potentially good, not actually use
+	 * it. Also keep going if makeDefault is true, since we may want to set
+	 * the reset/stacked values even if we can't set the variable itself.
+	 */
+	if (record->source > source)
+	{
+		if (changeVal && !makeDefault)
+		{
+			if (isStartUp)
+			{
+				write_stderr("\"%s\": setting ignored because previous source is higher priority\n",
+							 name);
+			}
+			else
+			{
+				elog(DEBUG3, "\"%s\": setting ignored because previous source is higher priority",
+					 name);
+			}
+			return true;
+		}
+		changeVal = false;
+	}
+
+	/*
+	 * Evaluate value and set variable.  All five cases follow the same
+	 * pattern: parse/validate the new value, honor prohibitValueChange,
+	 * install the value (with push_old_value for transactional undo),
+	 * and propagate it into reset/stacked defaults when makeDefault.
+	 */
+	switch (record->vartype)
+	{
+		case GTMC_BOOL:
+			{
+				struct config_bool *conf = (struct config_bool *) record;
+				bool		newval;
+				void	   *newextra = NULL;
+
+				if (value)
+				{
+					if (!gtm_opt_parse_bool(value, &newval))
+					{
+						if (isStartUp)
+						{
+							write_stderr("parameter \"%s\" requires a Boolean value\n",
+										 name);
+						}
+						else
+						{
+							ereport(elevel,
+									(0,
+									 errmsg("parameter \"%s\" requires a Boolean value",
+											name)));
+						}
+						return false;
+					}
+				}
+				else if (source == GTMC_S_DEFAULT)
+				{
+					newval = conf->boot_val;
+				}
+				else
+				{
+					/* value == NULL means RESET: revert to reset_val */
+					newval = conf->reset_val;
+					newextra = conf->reset_extra;
+					source = conf->gen.reset_source;
+				}
+
+				if (prohibitValueChange)
+				{
+					/* Only complain if the file actually tries to change it */
+					if (*conf->variable != newval)
+					{
+						if (isStartUp)
+						{
+							write_stderr("parameter \"%s\" cannot be changed without restarting the server\n",
+										 name);
+						}
+						else
+						{
+							ereport(elevel,
+									(0,
+									 errmsg("parameter \"%s\" cannot be changed without restarting the server",
+											name)));
+						}
+					}
+					return false;
+				}
+
+				if (changeVal)
+				{
+					/* Save old value to support transaction abort */
+					if (!makeDefault)
+						push_old_value(&conf->gen);
+
+					*conf->variable = newval;
+					set_extra_field(&conf->gen, &conf->gen.extra,
+									newextra);
+					conf->gen.source = source;
+				}
+				if (makeDefault)
+				{
+					GtmOptStack *stack;
+
+					if (conf->gen.reset_source <= source)
+					{
+						conf->reset_val = newval;
+						set_extra_field(&conf->gen, &conf->reset_extra,
+										newextra);
+						conf->gen.reset_source = source;
+					}
+					/* Also update any stacked prior values of lower priority */
+					for (stack = conf->gen.stack; stack; stack = stack->prev)
+					{
+						if (stack->source <= source)
+						{
+							stack->prior.val.boolval = newval;
+							set_extra_field(&conf->gen, &stack->prior.extra,
+											newextra);
+							stack->source = source;
+						}
+					}
+				}
+
+				/* Perhaps we didn't install newextra anywhere */
+				if (newextra && !extra_field_used(&conf->gen, newextra))
+					free(newextra);
+				break;
+			}
+
+		case GTMC_INT:
+			{
+				struct config_int *conf = (struct config_int *) record;
+				int			newval;
+				void	   *newextra = NULL;
+
+				if (value)
+				{
+					const char *hintmsg;
+
+					if (!parse_int(value, &newval, conf->gen.flags, &hintmsg))
+					{
+						if (isStartUp)
+						{
+							write_stderr("invalid value for parameter \"%s\": \"%s\"\n",
+										 name, value);
+						}
+						else
+						{
+							ereport(elevel,
+									(0,
+									 errmsg("invalid value for parameter \"%s\": \"%s\"",
+											name, value),
+									 hintmsg ? errhint("%s", _(hintmsg)) : 0));
+						}
+						return false;
+					}
+					if (newval < conf->min || newval > conf->max)
+					{
+						if (isStartUp)
+						{
+							write_stderr("%d is outside the valid range for parameter \"%s\" (%d .. %d)\n",
+										 newval, name, conf->min, conf->max);
+						}
+						else
+						{
+							ereport(elevel,
+									(0,
+									 errmsg("%d is outside the valid range for parameter \"%s\" (%d .. %d)",
+											newval, name, conf->min, conf->max)));
+						}
+						return false;
+					}
+				}
+				else if (source == GTMC_S_DEFAULT)
+				{
+					newval = conf->boot_val;
+				}
+				else
+				{
+					newval = conf->reset_val;
+					newextra = conf->reset_extra;
+					source = conf->gen.reset_source;
+				}
+
+				if (prohibitValueChange)
+				{
+					if (*conf->variable != newval)
+					{
+						if (isStartUp)
+						{
+							write_stderr("parameter \"%s\" cannot be changed without restarting the server\n",
+										 name);
+						}
+						else
+						{
+							ereport(elevel,
+									(0,
+									 errmsg("parameter \"%s\" cannot be changed without restarting the server",
+											name)));
+						}
+					}
+					return false;
+				}
+
+				if (changeVal)
+				{
+					/* Save old value to support transaction abort */
+					if (!makeDefault)
+						push_old_value(&conf->gen);
+
+					*conf->variable = newval;
+					set_extra_field(&conf->gen, &conf->gen.extra,
+									newextra);
+					conf->gen.source = source;
+				}
+				if (makeDefault)
+				{
+					GtmOptStack *stack;
+
+					if (conf->gen.reset_source <= source)
+					{
+						conf->reset_val = newval;
+						set_extra_field(&conf->gen, &conf->reset_extra,
+										newextra);
+						conf->gen.reset_source = source;
+					}
+					for (stack = conf->gen.stack; stack; stack = stack->prev)
+					{
+						if (stack->source <= source)
+						{
+							stack->prior.val.intval = newval;
+							set_extra_field(&conf->gen, &stack->prior.extra,
+											newextra);
+							stack->source = source;
+						}
+					}
+				}
+
+				/* Perhaps we didn't install newextra anywhere */
+				if (newextra && !extra_field_used(&conf->gen, newextra))
+					free(newextra);
+				break;
+			}
+
+		case GTMC_REAL:
+			{
+				struct config_real *conf = (struct config_real *) record;
+				double		newval;
+				void	   *newextra = NULL;
+
+				if (value)
+				{
+					if (!parse_real(value, &newval))
+					{
+						if (isStartUp)
+						{
+							write_stderr("parameter \"%s\" requires a numeric value\n",
+										 name);
+						}
+						else
+						{
+							ereport(elevel,
+									(0,
+									 errmsg("parameter \"%s\" requires a numeric value",
+											name)));
+						}
+						return false;
+					}
+					if (newval < conf->min || newval > conf->max)
+					{
+						if (isStartUp)
+						{
+							write_stderr("%g is outside the valid range for parameter \"%s\" (%g .. %g)\n",
+										 newval, name, conf->min, conf->max);
+						}
+						else
+						{
+							ereport(elevel,
+									(0,
+									 errmsg("%g is outside the valid range for parameter \"%s\" (%g .. %g)",
+											newval, name, conf->min, conf->max)));
+						}
+						return false;
+					}
+				}
+				else if (source == GTMC_S_DEFAULT)
+				{
+					newval = conf->boot_val;
+				}
+				else
+				{
+					newval = conf->reset_val;
+					newextra = conf->reset_extra;
+					source = conf->gen.reset_source;
+				}
+
+				if (prohibitValueChange)
+				{
+					if (*conf->variable != newval)
+					{
+						if (isStartUp)
+						{
+							write_stderr("parameter \"%s\" cannot be changed without restarting the server\n",
+										 name);
+						}
+						else
+						{
+							ereport(elevel,
+									(0,
+									 errmsg("parameter \"%s\" cannot be changed without restarting the server",
+											name)));
+						}
+					}
+					return false;
+				}
+
+				if (changeVal)
+				{
+					/* Save old value to support transaction abort */
+					if (!makeDefault)
+						push_old_value(&conf->gen);
+
+					*conf->variable = newval;
+					set_extra_field(&conf->gen, &conf->gen.extra,
+									newextra);
+					conf->gen.source = source;
+				}
+				if (makeDefault)
+				{
+					GtmOptStack *stack;
+
+					if (conf->gen.reset_source <= source)
+					{
+						conf->reset_val = newval;
+						set_extra_field(&conf->gen, &conf->reset_extra,
+										newextra);
+						conf->gen.reset_source = source;
+					}
+					for (stack = conf->gen.stack; stack; stack = stack->prev)
+					{
+						if (stack->source <= source)
+						{
+							stack->prior.val.realval = newval;
+							set_extra_field(&conf->gen, &stack->prior.extra,
+											newextra);
+							stack->source = source;
+						}
+					}
+				}
+
+				/* Perhaps we didn't install newextra anywhere */
+				if (newextra && !extra_field_used(&conf->gen, newextra))
+					free(newextra);
+				break;
+			}
+
+		case GTMC_STRING:
+			{
+				struct config_string *conf = (struct config_string *) record;
+				char	   *newval;
+				void	   *newextra = NULL;
+
+				if (value)
+				{
+					/*
+					 * The value passed by the caller could be transient, so
+					 * we always strdup it.
+					 */
+					newval = gtm_opt_strdup(elevel, value);
+					if (newval == NULL)
+						return false;
+				}
+				else if (source == GTMC_S_DEFAULT)
+				{
+					/* non-NULL boot_val must always get strdup'd */
+					if (conf->boot_val != NULL)
+					{
+						newval = gtm_opt_strdup(elevel, conf->boot_val);
+						if (newval == NULL)
+							return false;
+					}
+					else
+						newval = NULL;
+
+				}
+				else
+				{
+					/*
+					 * strdup not needed, since reset_val is already under
+					 * this module's control
+					 */
+					newval = conf->reset_val;
+					newextra = conf->reset_extra;
+					source = conf->gen.reset_source;
+				}
+
+				if (prohibitValueChange)
+				{
+					/* newval shouldn't be NULL, so we're a bit sloppy here */
+					if (*conf->variable == NULL || newval == NULL ||
+						strcmp(*conf->variable, newval) != 0)
+					{
+						if (isStartUp)
+						{
+							write_stderr("parameter \"%s\" cannot be changed without restarting the server\n",
+										 name);
+						}
+						else
+						{
+							ereport(elevel,
+									(0,
+									 errmsg("parameter \"%s\" cannot be changed without restarting the server",
+											name)));
+						}
+					}
+					return false;
+				}
+
+				if (changeVal)
+				{
+					/* Save old value to support transaction abort */
+					if (!makeDefault)
+						push_old_value(&conf->gen);
+
+					/* set_string_field frees the old string if unreferenced */
+					set_string_field(conf, conf->variable, newval);
+					set_extra_field(&conf->gen, &conf->gen.extra,
+									newextra);
+					conf->gen.source = source;
+				}
+
+				if (makeDefault)
+				{
+					GtmOptStack *stack;
+
+					if (conf->gen.reset_source <= source)
+					{
+						set_string_field(conf, &conf->reset_val, newval);
+						set_extra_field(&conf->gen, &conf->reset_extra,
+										newextra);
+						conf->gen.reset_source = source;
+					}
+					for (stack = conf->gen.stack; stack; stack = stack->prev)
+					{
+						if (stack->source <= source)
+						{
+							set_string_field(conf, &stack->prior.val.stringval,
+											 newval);
+							set_extra_field(&conf->gen, &stack->prior.extra,
+											newextra);
+							stack->source = source;
+						}
+					}
+				}
+
+				/* Perhaps we didn't install newval anywhere */
+				if (newval && !string_field_used(conf, newval))
+					free(newval);
+				/* Perhaps we didn't install newextra anywhere */
+				if (newextra && !extra_field_used(&conf->gen, newextra))
+					free(newextra);
+				break;
+			}
+
+		case GTMC_ENUM:
+			{
+				struct config_enum *conf = (struct config_enum *) record;
+				int			newval;
+				void	   *newextra = NULL;
+
+				if (value)
+				{
+					if (!config_enum_lookup_by_name(conf, value, &newval))
+					{
+						char	   *hintmsg;
+
+						hintmsg = config_enum_get_options(conf,
+														  "Available values: ",
+														  ".", ", ");
+
+						if (isStartUp)
+						{
+							write_stderr("invalid value for parameter \"%s\": \"%s\". %s\n",
+										 name, value, hintmsg);
+						}
+						else
+						{
+							ereport(elevel,
+									(0,
+									 errmsg("invalid value for parameter \"%s\": \"%s\"",
+											name, value),
+									 hintmsg ? errhint("%s", _(hintmsg)) : 0));
+						}
+
+						if (hintmsg)
+							free(hintmsg);
+						return false;
+					}
+				}
+				else if (source == GTMC_S_DEFAULT)
+				{
+					newval = conf->boot_val;
+				}
+				else
+				{
+					newval = conf->reset_val;
+					newextra = conf->reset_extra;
+					source = conf->gen.reset_source;
+				}
+
+				if (prohibitValueChange)
+				{
+					if (*conf->variable != newval)
+					{
+						if (isStartUp)
+						{
+							write_stderr("parameter \"%s\" cannot be changed without restarting the server\n",
+										 name);
+						}
+						else
+						{
+							ereport(elevel,
+									(0,
+									 errmsg("parameter \"%s\" cannot be changed without restarting the server",
+											name)));
+						}
+					}
+					return false;
+				}
+
+				if (changeVal)
+				{
+					/* Save old value to support transaction abort */
+					if (!makeDefault)
+						push_old_value(&conf->gen);
+
+					*conf->variable = newval;
+					set_extra_field(&conf->gen, &conf->gen.extra,
+									newextra);
+					conf->gen.source = source;
+				}
+				if (makeDefault)
+				{
+					GtmOptStack *stack;
+
+					if (conf->gen.reset_source <= source)
+					{
+						conf->reset_val = newval;
+						set_extra_field(&conf->gen, &conf->reset_extra,
+										newextra);
+						conf->gen.reset_source = source;
+					}
+					for (stack = conf->gen.stack; stack; stack = stack->prev)
+					{
+						if (stack->source <= source)
+						{
+							stack->prior.val.enumval = newval;
+							set_extra_field(&conf->gen, &stack->prior.extra,
+											newextra);
+							stack->source = source;
+						}
+					}
+				}
+
+				/* Perhaps we didn't install newextra anywhere */
+				if (newextra && !extra_field_used(&conf->gen, newextra))
+					free(newextra);
+				break;
+			}
+	}
+
+	/* Announce the new value to interested parties if applicable. */
+	if (changeVal && (record->flags & GTMOPT_REPORT))
+		ReportGTMOption(record);
+
+	return true;
+}
+
+
+
+
+/*
+ * Record the source file and line number a setting came from.
+ *
+ * name is the option name; sourcefile/sourceline identify the config-file
+ * location that assigned it.  sourcefile is copied with gtm_opt_strdup, so
+ * the caller keeps ownership of its argument.
+ */
+static void
+set_config_sourcefile(const char *name, char *sourcefile, int sourceline)
+{
+	struct config_generic *record;
+	int			elevel;
+
+	/*
+	 * To avoid cluttering the log, only the postmaster bleats loudly about
+	 * problems with the config file.
+	 */
+	elevel = DEBUG3;
+
+	record = find_option(name, true, elevel);
+	/* should not happen */
+	if (record == NULL)
+	{
+		if (isStartUp)
+		{
+			write_stderr("unrecognized configuration parameter \"%s\"\n", name);
+
+			/*
+			 * Bail out: previously this path fell through and dereferenced
+			 * the NULL record below, crashing at startup.
+			 */
+			return;
+		}
+		else
+			elog(ERROR, "unrecognized configuration parameter \"%s\"", name);
+	}
+
+	/* Replace any previously recorded origin with the new one. */
+	sourcefile = gtm_opt_strdup(elevel, sourcefile);
+	if (record->sourcefile)
+		free(record->sourcefile);
+	record->sourcefile = sourcefile;
+	record->sourceline = sourceline;
+}
+
+
+/*
+ * Set a config option to the given value.  This is the wrapper to be called
+ * from outside GTM; see set_config_option for the full semantics.  NB: this
+ * is used only for non-transactional operations.
+ *
+ * Note: there is no support here for setting source file/line, as it
+ * is currently not needed.
+ */
+void
+SetConfigOption(const char *name, const char *value,
+				GtmOptContext context, GtmOptSource source)
+{
+	/* Result deliberately ignored; errors are reported by the callee. */
+	set_config_option(name, value, context, source, true);
+}
+
+
+
+
+/*
+ * Fetch the current value of the option `name', formatted as a string.
+ *
+ * If the option doesn't exist: during startup (isStartUp) a message is
+ * written to stderr and NULL is returned; otherwise an ereport(ERROR) is
+ * thrown and this does not return.
+ *
+ * restrict_superuser is kept for interface compatibility; it is currently
+ * unused here (GTM does not enforce GTMOPT_SUPERUSER_ONLY visibility).
+ *
+ * The string is *not* allocated for modification and is really only
+ * valid until the next call to configuration related functions
+ * (numeric values share one static buffer).
+ */
+const char *
+GetConfigOption(const char *name, bool restrict_superuser)
+{
+	struct config_generic *record;
+	static char buffer[256];
+
+	record = find_option(name, false, ERROR);
+	if (record == NULL)
+	{
+		if (isStartUp)
+		{
+			write_stderr("unrecognized configuration parameter \"%s\"\n", name);
+
+			/*
+			 * Bail out: previously this path fell through into the switch
+			 * below and dereferenced the NULL record.
+			 */
+			return NULL;
+		}
+		else
+			ereport(ERROR,
+					(0,
+					 errmsg("unrecognized configuration parameter \"%s\"", name)));
+	}
+	switch (record->vartype)
+	{
+		case GTMC_BOOL:
+			return *((struct config_bool *) record)->variable ? "on" : "off";
+
+		case GTMC_INT:
+			snprintf(buffer, sizeof(buffer), "%d",
+					 *((struct config_int *) record)->variable);
+			return buffer;
+
+		case GTMC_REAL:
+			snprintf(buffer, sizeof(buffer), "%g",
+					 *((struct config_real *) record)->variable);
+			return buffer;
+
+		case GTMC_STRING:
+			return *((struct config_string *) record)->variable;
+
+		case GTMC_ENUM:
+			return config_enum_lookup_by_value((struct config_enum *) record,
+											   *((struct config_enum *) record)->variable);
+	}
+	return NULL;
+}
+
+
+/*
+ * Get the RESET value associated with the given option.
+ *
+ * Note: this is not re-entrant, due to use of static result buffer;
+ * not to mention that a string variable could have its reset_val changed.
+ * Beware of assuming the result value is good for very long.
+ */
+const char *
+GetConfigOptionResetString(const char *name)
+{
+ struct config_generic *record;
+ static char buffer[256];
+
+ record = find_option(name, false, ERROR);
+ if (record == NULL)
+ {
+ if (isStartUp)
+ {
+ write_stderr("unrecognized configuration parameter \"%s\"\n", name);
+ /* record is NULL: falling into the switch below would crash */
+ return NULL;
+ }
+ else
+ ereport(ERROR,
+ (0,
+ errmsg("unrecognized configuration parameter \"%s\"", name)));
+ }
+
+ switch (record->vartype)
+ {
+ case GTMC_BOOL:
+ return ((struct config_bool *) record)->reset_val ? "on" : "off";
+
+ case GTMC_INT:
+ snprintf(buffer, sizeof(buffer), "%d",
+ ((struct config_int *) record)->reset_val);
+ return buffer;
+
+ case GTMC_REAL:
+ snprintf(buffer, sizeof(buffer), "%g",
+ ((struct config_real *) record)->reset_val);
+ return buffer;
+
+ case GTMC_STRING:
+ return ((struct config_string *) record)->reset_val;
+
+ case GTMC_ENUM:
+ return config_enum_lookup_by_value((struct config_enum *) record,
+ ((struct config_enum *) record)->reset_val);
+ }
+ return NULL;
+}
+
+
+/*
+ * Warn about every custom placeholder variable ("class.name") belonging to
+ * the given class prefix that was never claimed by a real definition.
+ * Scans the whole gtm_opt_variables array; read-only, emits diagnostics only.
+ */
+void
+EmitWarningsOnPlaceholders(const char *className)
+{
+ int classLen = strlen(className);
+ int i;
+
+ for (i = 0; i < num_gtm_opt_variables; i++)
+ {
+ struct config_generic *var = gtm_opt_variables[i];
+
+ /* match "<className><separator>..." on placeholder entries only */
+ if ((var->flags & GTMOPT_CUSTOM_PLACEHOLDER) != 0 &&
+ strncmp(className, var->name, classLen) == 0 &&
+ var->name[classLen] == GTMOPT_QUALIFIER_SEPARATOR)
+ {
+ if (isStartUp)
+ write_stderr("unrecognized configuration parameter \"%s\"\n",
+ var->name);
+ else
+ ereport(WARNING,
+ (0,
+ errmsg("unrecognized configuration parameter \"%s\"",
+ var->name)));
+ }
+ }
+}
+
+
+/*
+ * Return GTM variable value by name; optionally return canonical
+ * form of name. Return value is malloc'd (via _ShowOption/strdup);
+ * the caller is responsible for freeing it.
+ */
+char *
+GetConfigOptionByName(const char *name, const char **varname)
+{
+ struct config_generic *record;
+
+ record = find_option(name, false, ERROR);
+ if (record == NULL)
+ {
+ if (isStartUp)
+ {
+ write_stderr("unrecognized configuration parameter \"%s\"\n", name);
+ /* record is NULL: _ShowOption(record, ...) below would crash */
+ return NULL;
+ }
+ else
+ ereport(ERROR,
+ (0,
+ errmsg("unrecognized configuration parameter \"%s\"", name)));
+ }
+ if (varname)
+ *varname = record->name;
+
+ return _ShowOption(record, true);
+}
+
+/*
+ * Return GTM variable value by variable number; optionally return canonical
+ * form of name. Return value is malloc'd.
+ */
+/*
+ * NOTE(review): "values" must point to an array of at least 16 slots.
+ * Slots 9-13 and 15 are strdup'd here (presumably freed by the caller);
+ * slots 0, 4-8 and 14 alias per-variable/static storage and must not be
+ * freed. values[3] (group) is never written by this function (see the
+ * PGXCTODO below) -- callers must pre-initialize it. Confirm against the
+ * callers before relying on any slot's lifetime.
+ */
+void
+GetConfigOptionByNum(int varnum, const char **values, bool *noshow)
+{
+ char buffer[256];
+ struct config_generic *conf;
+
+ /* check requested variable number valid */
+ Assert((varnum >= 0) && (varnum < num_gtm_opt_variables));
+
+ conf = gtm_opt_variables[varnum];
+
+ if (noshow)
+ {
+ if (conf->flags & GTMOPT_NO_SHOW_ALL)
+ *noshow = true;
+ else
+ *noshow = false;
+ }
+
+ /* first get the generic attributes */
+
+ /* name */
+ values[0] = conf->name;
+
+ /* setting : use _ShowOption in order to avoid duplicating the logic */
+ values[1] = _ShowOption(conf, false);
+
+ /* unit */
+ if (conf->vartype == GTMC_INT)
+ {
+ static char buf[8];
+
+ switch (conf->flags & (GTMOPT_UNIT_MEMORY | GTMOPT_UNIT_TIME))
+ {
+ case GTMOPT_UNIT_KB:
+ values[2] = "kB";
+ break;
+ case GTMOPT_UNIT_BLOCKS:
+ snprintf(buf, sizeof(buf), "%dkB", BLCKSZ / 1024);
+ values[2] = buf;
+ break;
+ case GTMOPT_UNIT_XBLOCKS:
+ snprintf(buf, sizeof(buf), "%dkB", XLOG_BLCKSZ / 1024);
+ values[2] = buf;
+ break;
+ case GTMOPT_UNIT_MS:
+ values[2] = "ms";
+ break;
+ case GTMOPT_UNIT_S:
+ values[2] = "s";
+ break;
+ case GTMOPT_UNIT_MIN:
+ values[2] = "min";
+ break;
+ default:
+ values[2] = "";
+ break;
+ }
+ }
+ else
+ values[2] = NULL;
+
+#if 0
+ /* PGXCTODO: Group parameters are not used yet */
+ /* group */
+ values[3] = config_group_names[conf->group];
+#endif
+
+ /* short_desc */
+ values[4] = conf->short_desc;
+
+ /* extra_desc */
+ values[5] = conf->long_desc;
+
+ /* context */
+ values[6] = GtmOptContext_Names[conf->context];
+
+ /* vartype */
+ values[7] = config_type_names[conf->vartype];
+
+ /* source */
+ values[8] = GtmOptSource_Names[conf->source];
+
+ /* now get the type specifc attributes */
+ switch (conf->vartype)
+ {
+ case GTMC_BOOL:
+ {
+ struct config_bool *lconf = (struct config_bool *) conf;
+
+ /* min_val */
+ values[9] = NULL;
+
+ /* max_val */
+ values[10] = NULL;
+
+ /* enumvals */
+ values[11] = NULL;
+
+ /* boot_val */
+ values[12] = strdup(lconf->boot_val ? "on" : "off");
+
+ /* reset_val */
+ values[13] = strdup(lconf->reset_val ? "on" : "off");
+ }
+ break;
+
+ case GTMC_INT:
+ {
+ struct config_int *lconf = (struct config_int *) conf;
+
+ /* min_val */
+ snprintf(buffer, sizeof(buffer), "%d", lconf->min);
+ values[9] = strdup(buffer);
+
+ /* max_val */
+ snprintf(buffer, sizeof(buffer), "%d", lconf->max);
+ values[10] = strdup(buffer);
+
+ /* enumvals */
+ values[11] = NULL;
+
+ /* boot_val */
+ snprintf(buffer, sizeof(buffer), "%d", lconf->boot_val);
+ values[12] = strdup(buffer);
+
+ /* reset_val */
+ snprintf(buffer, sizeof(buffer), "%d", lconf->reset_val);
+ values[13] = strdup(buffer);
+ }
+ break;
+
+ case GTMC_REAL:
+ {
+ struct config_real *lconf = (struct config_real *) conf;
+
+ /* min_val */
+ snprintf(buffer, sizeof(buffer), "%g", lconf->min);
+ values[9] = strdup(buffer);
+
+ /* max_val */
+ snprintf(buffer, sizeof(buffer), "%g", lconf->max);
+ values[10] = strdup(buffer);
+
+ /* enumvals */
+ values[11] = NULL;
+
+ /* boot_val */
+ snprintf(buffer, sizeof(buffer), "%g", lconf->boot_val);
+ values[12] = strdup(buffer);
+
+ /* reset_val */
+ snprintf(buffer, sizeof(buffer), "%g", lconf->reset_val);
+ values[13] = strdup(buffer);
+ }
+ break;
+
+ case GTMC_STRING:
+ {
+ struct config_string *lconf = (struct config_string *) conf;
+
+ /* min_val */
+ values[9] = NULL;
+
+ /* max_val */
+ values[10] = NULL;
+
+ /* enumvals */
+ values[11] = NULL;
+
+ /* boot_val */
+ if (lconf->boot_val == NULL)
+ values[12] = NULL;
+ else
+ values[12] = strdup(lconf->boot_val);
+
+ /* reset_val */
+ if (lconf->reset_val == NULL)
+ values[13] = NULL;
+ else
+ values[13] = strdup(lconf->reset_val);
+ }
+ break;
+
+ case GTMC_ENUM:
+ {
+ struct config_enum *lconf = (struct config_enum *) conf;
+
+ /* min_val */
+ values[9] = NULL;
+
+ /* max_val */
+ values[10] = NULL;
+
+ /* enumvals */
+
+ /*
+ * NOTE! enumvals with double quotes in them are not
+ * supported!
+ */
+ values[11] = config_enum_get_options((struct config_enum *) conf,
+ "{\"", "\"}", "\",\"");
+
+ /* boot_val */
+ values[12] = strdup(config_enum_lookup_by_value(lconf,
+ lconf->boot_val));
+
+ /* reset_val */
+ values[13] = strdup(config_enum_lookup_by_value(lconf,
+ lconf->reset_val));
+ }
+ break;
+
+ default:
+ {
+ /*
+ * should never get here, but in case we do, set 'em to NULL
+ */
+
+ /* min_val */
+ values[9] = NULL;
+
+ /* max_val */
+ values[10] = NULL;
+
+ /* enumvals */
+ values[11] = NULL;
+
+ /* boot_val */
+ values[12] = NULL;
+
+ /* reset_val */
+ values[13] = NULL;
+ }
+ break;
+ }
+
+ /*
+ * If the setting came from a config file, set the source location. For
+ * security reasons, we don't show source file/line number for
+ * non-superusers.
+ */
+ /*
+ * NOTE(review): despite the comment above, no superuser/privilege check
+ * is performed here -- every caller sees sourcefile/sourceline.
+ * Confirm whether that is intended.
+ */
+ if (conf->source == GTMC_S_FILE)
+ {
+ values[14] = conf->sourcefile;
+ snprintf(buffer, sizeof(buffer), "%d", conf->sourceline);
+ values[15] = strdup(buffer);
+ }
+ else
+ {
+ values[14] = NULL;
+ values[15] = NULL;
+ }
+}
+
+/*
+ * Return the total number of GTM variables
+ * (i.e. the current length of the gtm_opt_variables array).
+ */
+int
+GetNumConfigOptions(void)
+{
+ return num_gtm_opt_variables;
+}
+
+
+/*
+ * _ShowOption: render the current value of a variable as a malloc'd string.
+ *
+ * If use_units is true, positive integer values flagged with
+ * GTMOPT_UNIT_MEMORY or GTMOPT_UNIT_TIME are scaled to the largest unit
+ * that divides them evenly (e.g. 2048 kB -> "2MB", 60000 ms -> "1min").
+ * The caller owns (and should free) the returned string.
+ */
+static char *
+_ShowOption(struct config_generic * record, bool use_units)
+{
+ char buffer[256];
+ const char *val;
+
+ switch (record->vartype)
+ {
+ case GTMC_BOOL:
+ {
+ struct config_bool *conf = (struct config_bool *) record;
+
+ val = *conf->variable ? "on" : "off";
+ }
+ break;
+
+ case GTMC_INT:
+ {
+ struct config_int *conf = (struct config_int *) record;
+
+ /*
+ * Use int64 arithmetic to avoid overflows in units
+ * conversion.
+ */
+ int64 result = *conf->variable;
+ const char *unit;
+
+ if (use_units && result > 0 &&
+ (record->flags & GTMOPT_UNIT_MEMORY))
+ {
+ /* normalize block-sized values to kB first */
+ switch (record->flags & GTMOPT_UNIT_MEMORY)
+ {
+ case GTMOPT_UNIT_BLOCKS:
+ result *= BLCKSZ / 1024;
+ break;
+ case GTMOPT_UNIT_XBLOCKS:
+ result *= XLOG_BLCKSZ / 1024;
+ break;
+ }
+
+ /* then pick the largest unit that divides evenly */
+ if (result % KB_PER_GB == 0)
+ {
+ result /= KB_PER_GB;
+ unit = "GB";
+ }
+ else if (result % KB_PER_MB == 0)
+ {
+ result /= KB_PER_MB;
+ unit = "MB";
+ }
+ else
+ {
+ unit = "kB";
+ }
+ }
+ else if (use_units && result > 0 &&
+ (record->flags & GTMOPT_UNIT_TIME))
+ {
+ /* normalize to milliseconds first */
+ switch (record->flags & GTMOPT_UNIT_TIME)
+ {
+ case GTMOPT_UNIT_S:
+ result *= MS_PER_S;
+ break;
+ case GTMOPT_UNIT_MIN:
+ result *= MS_PER_MIN;
+ break;
+ }
+
+ /* then pick the largest time unit that divides evenly */
+ if (result % MS_PER_D == 0)
+ {
+ result /= MS_PER_D;
+ unit = "d";
+ }
+ else if (result % MS_PER_H == 0)
+ {
+ result /= MS_PER_H;
+ unit = "h";
+ }
+ else if (result % MS_PER_MIN == 0)
+ {
+ result /= MS_PER_MIN;
+ unit = "min";
+ }
+ else if (result % MS_PER_S == 0)
+ {
+ result /= MS_PER_S;
+ unit = "s";
+ }
+ else
+ {
+ unit = "ms";
+ }
+ }
+ else
+ unit = "";
+
+ snprintf(buffer, sizeof(buffer), INT64_FORMAT "%s",
+ result, unit);
+ val = buffer;
+
+ }
+ break;
+
+ case GTMC_REAL:
+ {
+ struct config_real *conf = (struct config_real *) record;
+
+ snprintf(buffer, sizeof(buffer), "%g",
+ *conf->variable);
+ val = buffer;
+ }
+ break;
+
+ case GTMC_STRING:
+ {
+ struct config_string *conf = (struct config_string *) record;
+
+ if (*conf->variable && **conf->variable)
+ val = *conf->variable;
+ else
+ val = "";
+ }
+ break;
+
+ case GTMC_ENUM:
+ {
+ struct config_enum *conf = (struct config_enum *) record;
+
+ val = config_enum_lookup_by_value(conf, *conf->variable);
+ }
+ break;
+
+ default:
+ /* just to keep compiler quiet */
+ val = "???";
+ break;
+ }
+
+ /* always heap-copy: "val" may point into the stack-local buffer */
+ return strdup(val);
+}
+
+
+
+/*
+ * A little "long argument" simulation, although not quite GNU
+ * compliant. Takes a string of the form "some-option=some value" and
+ * returns name = "some_option" and value = "some value" in malloc'ed
+ * storage. Note that '-' is converted to '_' in the option name. If
+ * there is no '=' in the input string then value will be NULL.
+ */
+void
+ParseLongOption(const char *string, char **name, char **value)
+{
+ size_t equal_pos;
+ char *cp;
+
+ AssertArg(string);
+ AssertArg(name);
+ AssertArg(value);
+
+ /* position of the first '=', or strlen(string) if there is none */
+ equal_pos = strcspn(string, "=");
+
+ if (string[equal_pos] == '=')
+ {
+ *name = gtm_opt_malloc(FATAL, equal_pos + 1);
+ strlcpy(*name, string, equal_pos + 1);
+
+ *value = gtm_opt_strdup(FATAL, &string[equal_pos + 1]);
+ }
+ else
+ {
+ /* no equal sign in string */
+ *name = gtm_opt_strdup(FATAL, string);
+ *value = NULL;
+ }
+
+ /* GNU-style long options use '-', config names use '_' */
+ for (cp = *name; *cp; cp++)
+ if (*cp == '-')
+ *cp = '_';
+}
+
+#if 0
+/*
+ * keep-alive related APIs will be used in future extensions.
+ *
+ * NOTE: this whole region is compiled out (#if 0). The functions forward
+ * TCP keepalive get/set requests to the pqcomm layer and are retained as
+ * reference for when the corresponding GTM options are wired up.
+ */
+void
+gtm_assign_tcp_keepalives_idle(int newval, void *extra)
+{
+ /*
+ * The kernel API provides no way to test a value without setting it; and
+ * once we set it we might fail to unset it. So there seems little point
+ * in fully implementing the check-then-assign GTM API for these
+ * variables. Instead we just do the assignment on demand. pqcomm.c
+ * reports any problems via elog(LOG).
+ *
+ * This approach means that the GTM value might have little to do with the
+ * actual kernel value, so we use a show_hook that retrieves the kernel
+ * value rather than trusting GTM's copy.
+ */
+#if 0
+ (void) pq_setkeepalivesidle(newval, MyProcPort);
+#else
+ (void) pq_setkeepalivesidle_all(newval);
+#endif
+}
+
+const char *
+gtm_show_tcp_keepalives_idle(void)
+{
+ /* See comments in assign_tcp_keepalives_idle */
+ static char nbuf[16];
+
+#if 0
+ snprintf(nbuf, sizeof(nbuf), "%d", pq_getkeepalivesidle(MyProcPort));
+#else
+ snprintf(nbuf, sizeof(nbuf), "%d", pq_getkeepalivesidle_all());
+#endif
+ return nbuf;
+}
+
+void
+gtm_assign_tcp_keepalives_interval(int newval, void *extra)
+{
+ /* See comments in assign_tcp_keepalives_idle */
+#if 0
+ (void) pq_setkeepalivesinterval(newval, MyProcPort);
+#else
+ (void) pq_setkeepalivesinterval_all(newval);
+#endif
+}
+
+const char *
+gtm_show_tcp_keepalives_interval(void)
+{
+ /* See comments in assign_tcp_keepalives_idle */
+ static char nbuf[16];
+
+#if 0
+ snprintf(nbuf, sizeof(nbuf), "%d", pq_getkeepalivesinterval(MyProcPort));
+#else
+ snprintf(nbuf, sizeof(nbuf), "%d", pq_getkeepalivesinterval_all());
+#endif
+ return nbuf;
+}
+
+void
+gtm_assign_tcp_keepalives_count(int newval, void *extra)
+{
+ /* See comments in assign_tcp_keepalives_idle */
+#if 0
+ (void) pq_setkeepalivescount(newval, MyProcPort);
+#else
+ (void) pq_setkeepalivescount_all(newval);
+#endif
+}
+
+const char *
+gtm_show_tcp_keepalives_count(void)
+{
+ /* See comments in assign_tcp_keepalives_idle */
+ static char nbuf[16];
+
+#if 0
+ snprintf(nbuf, sizeof(nbuf), "%d", pq_getkeepalivescount(MyProcPort));
+#else
+ snprintf(nbuf, sizeof(nbuf), "%d", pq_getkeepalivescount_all());
+#endif
+ return nbuf;
+}
+#endif
+
+/*
+ * Try to interpret value as boolean value. Valid values are: true,
+ * false, yes, no, on, off, 1, 0; as well as unique prefixes thereof.
+ * If the string parses okay, return true, else false.
+ * If okay and result is not NULL, return the value in *result.
+ */
+static bool
+gtm_opt_parse_bool(const char *value, bool *result)
+{
+ /* delegate to the length-aware variant using the full string */
+ return gtm_opt_parse_bool_with_len(value, strlen(value), result);
+}
+
+/*
+ * Parse the first "len" bytes of "value" as a boolean word (or unique
+ * prefix thereof). On success returns true and, if result is non-NULL,
+ * stores the parsed value in *result. On failure returns false and
+ * forces *result to false purely to silence compiler warnings.
+ */
+static bool
+gtm_opt_parse_bool_with_len(const char *value, size_t len, bool *result)
+{
+ /* dispatch on the first character, then prefix-match the keyword */
+ switch (*value)
+ {
+ case 't':
+ case 'T':
+ if (pg_strncasecmp(value, "true", len) == 0)
+ {
+ if (result)
+ *result = true;
+ return true;
+ }
+ break;
+ case 'f':
+ case 'F':
+ if (pg_strncasecmp(value, "false", len) == 0)
+ {
+ if (result)
+ *result = false;
+ return true;
+ }
+ break;
+ case 'y':
+ case 'Y':
+ if (pg_strncasecmp(value, "yes", len) == 0)
+ {
+ if (result)
+ *result = true;
+ return true;
+ }
+ break;
+ case 'n':
+ case 'N':
+ if (pg_strncasecmp(value, "no", len) == 0)
+ {
+ if (result)
+ *result = false;
+ return true;
+ }
+ break;
+ case 'o':
+ case 'O':
+ /* 'o' is not unique enough */
+ if (pg_strncasecmp(value, "on", (len > 2 ? len : 2)) == 0)
+ {
+ if (result)
+ *result = true;
+ return true;
+ }
+ else if (pg_strncasecmp(value, "off", (len > 2 ? len : 2)) == 0)
+ {
+ if (result)
+ *result = false;
+ return true;
+ }
+ break;
+ case '1':
+ if (len == 1)
+ {
+ if (result)
+ *result = true;
+ return true;
+ }
+ break;
+ case '0':
+ if (len == 1)
+ {
+ if (result)
+ *result = false;
+ return true;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (result)
+ *result = false; /* suppress compiler warning */
+ return false;
+}
+
+/*
+ * ReportGTMOption: if appropriate, transmit option value to frontend.
+ *
+ * Stub kept for parity with the backend's ReportGUCOption; it is invoked
+ * from set_config_option when a variable carries the GTMOPT_REPORT flag,
+ * but GTM currently has nothing to report.
+ */
+static void
+ReportGTMOption(struct config_generic * record)
+{
+ /* So far, it is empty. */
+}
+
+/*
+ * Lookup the name for an enum option with the selected value.
+ * Should only ever be called with known-valid values, so throws
+ * an elog(ERROR) if the enum option is not found.
+ *
+ * The returned string is a pointer to static data and not
+ * allocated for modification.
+ */
+const char *
+config_enum_lookup_by_value(struct config_enum * record, int val)
+{
+ /* options array is terminated by an entry with a NULL name */
+ for (entry = record->options; entry && entry->name; entry++)
+ {
+ if (entry->val == val)
+ return entry->name;
+ }
+
+ /*
+ * Not found: at startup we only log and return NULL (callers must cope
+ * with that); at runtime elog(ERROR) does not return.
+ */
+ if (isStartUp)
+ write_stderr("could not find enum option %d for %s\n",
+ val, record->gen.name);
+ else
+ elog(ERROR, "could not find enum option %d for %s",
+ val, record->gen.name);
+ return NULL; /* silence compiler */
+}
diff --git a/src/gtm/common/gtm_opt_scanner.l b/src/gtm/common/gtm_opt_scanner.l
new file mode 100644
index 0000000000..f9be2cbfbe
--- /dev/null
+++ b/src/gtm/common/gtm_opt_scanner.l
@@ -0,0 +1,92 @@
+/* -*-pgsql-c-*- */
+/*
+ * Scanner for the configuration file
+ *
+ * Copyright (c) 2000-2011, PostgreSQL Global Development Group
+ *
+ * src/backend/utils/misc/guc-file.l
+ */
+
+%{
+
+#include "gtm/gtm.h"
+
+#include <ctype.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#include "mb/pg_wchar.h"
+#include "gtm/assert.h"
+#include "gtm/gtm_opt.h"
+#include "gtm/elog.h"
+
+
+/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
+#undef fprintf
+#define fprintf(file, fmt, msg) ereport(ERROR, (errmsg_internal("%s", msg)))
+
+/*
+ * Token codes returned by GTMOPT_yylex(); these mirror the symbols used
+ * by the backend's guc-file.l and must stay in sync with the GTM
+ * option-file parser that consumes them.
+ */
+enum {
+ GTMOPT_ID = 1,
+ GTMOPT_STRING = 2,
+ GTMOPT_INTEGER = 3,
+ GTMOPT_REAL = 4,
+ GTMOPT_EQUALS = 5,
+ GTMOPT_UNQUOTED_STRING = 6,
+ GTMOPT_QUALIFIED_ID = 7,
+ GTMOPT_EOL = 99,
+ GTMOPT_ERROR = 100
+};
+
+/* incremented on every newline consumed (see the \n rule below) */
+static unsigned int ConfigFileLineno;
+
+/* flex fails to supply a prototype for yylex, so provide one */
+int GTMOPT_yylex(void);
+
+%}
+
+%option 8bit
+%option never-interactive
+%option nodefault
+%option noinput
+%option nounput
+%option noyywrap
+%option prefix="GTMOPT_yy"
+
+
+SIGN ("-"|"+")
+DIGIT [0-9]
+HEXDIGIT [0-9a-fA-F]
+
+UNIT_LETTER [a-zA-Z]
+
+INTEGER {SIGN}?({DIGIT}+|0x{HEXDIGIT}+){UNIT_LETTER}*
+
+EXPONENT [Ee]{SIGN}?{DIGIT}+
+REAL {SIGN}?{DIGIT}*"."{DIGIT}*{EXPONENT}?
+
+LETTER [A-Za-z_\200-\377]
+LETTER_OR_DIGIT [A-Za-z_0-9\200-\377]
+
+ID {LETTER}{LETTER_OR_DIGIT}*
+QUALIFIED_ID {ID}"."{ID}
+
+UNQUOTED_STRING {LETTER}({LETTER_OR_DIGIT}|[-._:/])*
+STRING \'([^'\\\n]|\\.|\'\')*\'
+
+%%
+
+\n ConfigFileLineno++; return GTMOPT_EOL;
+[ \t\r]+ /* eat whitespace */
+#.* /* eat comment (.* matches anything until newline) */
+
+{ID} return GTMOPT_ID;
+{QUALIFIED_ID} return GTMOPT_QUALIFIED_ID;
+{STRING} return GTMOPT_STRING;
+{UNQUOTED_STRING} return GTMOPT_UNQUOTED_STRING;
+{INTEGER} return GTMOPT_INTEGER;
+{REAL} return GTMOPT_REAL;
+= return GTMOPT_EQUALS;
+
+. return GTMOPT_ERROR;
+
+%%
diff --git a/src/gtm/common/gtm_serialize.c b/src/gtm/common/gtm_serialize.c
index bb8e368d7c..9b870957b9 100644
--- a/src/gtm/common/gtm_serialize.c
+++ b/src/gtm/common/gtm_serialize.c
@@ -3,6 +3,11 @@
* gtm_serialize.c
* Serialization management of GTM data
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -701,6 +706,13 @@ gtm_get_pgxcnodeinfo_size(GTM_PGXCNodeInfo *data)
len += sizeof(GTM_PGXCNodeStatus); /* status */
+#ifdef XCP
+ len += sizeof(uint32); /* max_sessions */
+ len += sizeof(uint32); /* num_sessions */
+ if (data->num_sessions > 0) /* sessions */
+ len += (data->num_sessions * sizeof(GTM_PGXCSession));
+#endif
+
return len;
}
@@ -787,6 +799,21 @@ gtm_serialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, char *buf, size_t buflen)
memcpy(buf + len, &(data->status), sizeof(GTM_PGXCNodeStatus));
len += sizeof(GTM_PGXCNodeStatus);
+#ifdef XCP
+ /* GTM_PGXCNodeInfo.sessions */
+ len_wk = data->max_sessions;
+ memcpy(buf + len, &len_wk, sizeof(uint32));
+ len += sizeof(uint32);
+ len_wk = data->num_sessions;
+ memcpy(buf + len, &len_wk, sizeof(uint32));
+ len += sizeof(uint32);
+ if (len_wk > 0)
+ {
+ memcpy(buf + len, data->sessions, len_wk * sizeof(GTM_PGXCSession));
+ len += len_wk * sizeof(GTM_PGXCSession);
+ }
+#endif
+
/* NOTE: nothing to be done for node_lock */
return len;
}
@@ -795,25 +822,46 @@ gtm_serialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, char *buf, size_t buflen)
/*
* Return a deserialize number of PGXC node information
*/
+#ifdef XCP
+size_t
+gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buflen, PQExpBuffer *errorbuf)
+#else
size_t
gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buflen)
+#endif
{
size_t len = 0;
uint32 len_wk;
/* GTM_PGXCNodeInfo.type */
+#ifdef XCP
+ if (len + sizeof(GTM_PGXCNodeType) > buflen)
+ {
+ printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of node info. buflen = %d", (int) buflen);
+ return (size_t) 0;
+ }
+#endif
memcpy(&(data->type), buf + len, sizeof(GTM_PGXCNodeType));
len += sizeof(GTM_PGXCNodeType);
/* GTM_PGXCNodeInfo.nodename*/
memcpy(&len_wk, buf + len, sizeof(uint32));
len += sizeof(uint32);
+
if (len_wk == 0)
{
data->nodename = NULL;
}
else
{
+#ifdef XCP
+ if (len + len_wk > buflen)
+ {
+ printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of node name");
+ return (size_t) 0;
+ }
+#endif
+
/* PGXCTODO: free memory */
data->nodename = (char *)genAlloc(len_wk + 1);
memcpy(data->nodename, buf + len, (size_t)len_wk);
@@ -821,6 +869,7 @@ gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buf
len += len_wk;
}
+
/* GTM_PGXCNodeInfo.proxyname*/
memcpy(&len_wk, buf + len, sizeof(uint32));
len += sizeof(uint32);
@@ -830,6 +879,13 @@ gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buf
}
else
{
+#ifdef XCP
+ if (len + len_wk > buflen)
+ {
+ printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of node info after proxy name");
+ return (size_t) 0;
+ }
+#endif
/* PGXCTODO: free memory */
data->proxyname = (char *)genAlloc(len_wk + 1);
memcpy(data->proxyname, buf + len, (size_t)len_wk);
@@ -838,6 +894,13 @@ gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buf
}
/* GTM_PGXCNodeInfo.port */
+#ifdef XCP
+ if (len + sizeof(GTM_PGXCNodePort) > buflen)
+ {
+ printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of node port");
+ return (size_t) 0;
+ }
+#endif
memcpy(&(data->port), buf + len, sizeof(GTM_PGXCNodePort));
len += sizeof(GTM_PGXCNodePort);
@@ -850,6 +913,13 @@ gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buf
}
else
{
+#ifdef XCP
+ if (len + len_wk > buflen)
+ {
+ printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of ipaddress");
+ return (size_t) 0;
+ }
+#endif
data->ipaddress = (char *)genAlloc(len_wk + 1);
memcpy(data->ipaddress, buf + len, (size_t)len_wk);
data->ipaddress[len_wk] = 0; /* null_terminate */
@@ -865,6 +935,13 @@ gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buf
}
else
{
+#ifdef XCP
+ if (len + len_wk > buflen)
+ {
+ printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of node info after data folder");
+ return (size_t) 0;
+ }
+#endif
data->datafolder = (char *)genAlloc(len_wk + 1);
memcpy(data->datafolder, buf + len, (size_t)len_wk);
data->datafolder[len_wk] = 0; /* null_terminate */
@@ -872,9 +949,39 @@ gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, const char *buf, size_t buf
}
/* GTM_PGXCNodeInfo.status */
+#ifdef XCP
+ if (len + sizeof(GTM_PGXCNodeStatus) > buflen)
+ {
+ printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of node info after status");
+ return (size_t) 0;
+ }
+#endif
memcpy(&(data->status), buf + len, sizeof(GTM_PGXCNodeStatus));
len += sizeof(GTM_PGXCNodeStatus);
+#ifdef XCP
+ /* GTM_PGXCNodeInfo.sessions */
+ memcpy(&len_wk, buf + len, sizeof(uint32));
+ len += sizeof(uint32);
+ data->max_sessions = len_wk;
+ if (len_wk > 0)
+ data->sessions = (GTM_PGXCSession *)
+ genAlloc(len_wk * sizeof(GTM_PGXCSession));
+ memcpy(&len_wk, buf + len, sizeof(uint32));
+ len += sizeof(uint32);
+ data->num_sessions = len_wk;
+ if (len_wk > 0)
+ {
+ if (len + (data->num_sessions * sizeof(GTM_PGXCSession)) > buflen)
+ {
+ printfGTMPQExpBuffer(errorbuf, "Buffer length error in deserialization of session info");
+ return (size_t) 0;
+ }
+ memcpy(data->sessions, buf + len, len_wk * sizeof(GTM_PGXCSession));
+ len += len_wk * sizeof(GTM_PGXCSession);
+ }
+#endif
+
/* NOTE: nothing to be done for node_lock */
return len;
@@ -894,7 +1001,13 @@ gtm_get_sequence_size(GTM_SeqInfo *seq)
len += sizeof(GTM_SequenceKeyType); /* gs_key.gsk_type */
len += sizeof(GTM_Sequence); /* gs_value */
len += sizeof(GTM_Sequence); /* gs_init_value */
+#ifdef XCP
+ len += sizeof(uint32); /* gs_max_lastvals */
+ len += sizeof(uint32); /* gs_lastval_count */
+ len += seq->gs_lastval_count * sizeof(GTM_SeqLastVal); /* gs_last_values */
+#else
len += sizeof(GTM_Sequence); /* gs_last_value */
+#endif
len += sizeof(GTM_Sequence); /* gs_increment_by */
len += sizeof(GTM_Sequence); /* gs_min_value */
len += sizeof(GTM_Sequence); /* gs_max_value */
@@ -935,8 +1048,18 @@ gtm_serialize_sequence(GTM_SeqInfo *s, char *buf, size_t buflen)
memcpy(buf + len, &s->gs_init_value, sizeof(GTM_Sequence));
len += sizeof(GTM_Sequence); /* gs_init_value */
+#ifdef XCP
+ memcpy(buf + len, &s->gs_max_lastvals, sizeof(uint32));
+ len += sizeof(uint32); /* gs_max_lastvals */
+ memcpy(buf + len, &s->gs_lastval_count, sizeof(uint32));
+ len += sizeof(uint32); /* gs_lastval_count */
+ memcpy(buf + len, s->gs_last_values,
+ s->gs_lastval_count * sizeof(GTM_SeqLastVal));
+ len += s->gs_lastval_count * sizeof(GTM_SeqLastVal); /* gs_last_values */
+#else
memcpy(buf + len, &s->gs_last_value, sizeof(GTM_Sequence));
len += sizeof(GTM_Sequence); /* gs_last_value */
+#endif
memcpy(buf + len, &s->gs_increment_by, sizeof(GTM_Sequence));
len += sizeof(GTM_Sequence); /* gs_increment_by */
@@ -965,13 +1088,11 @@ gtm_serialize_sequence(GTM_SeqInfo *s, char *buf, size_t buflen)
/*
* Return number of deserialized sequence information
*/
-GTM_SeqInfo *
-gtm_deserialize_sequence(const char *buf, size_t buflen)
+size_t
+gtm_deserialize_sequence(GTM_SeqInfo *seq, const char *buf, size_t buflen)
{
size_t len = 0;
- GTM_SeqInfo *seq;
- seq = (GTM_SeqInfo *)genAlloc0(sizeof(GTM_SeqInfo));
seq->gs_key = (GTM_SequenceKeyData *)genAlloc0(sizeof(GTM_SequenceKeyData));
memcpy(&seq->gs_key->gsk_keylen, buf + len, sizeof(uint32));
@@ -990,8 +1111,24 @@ gtm_deserialize_sequence(const char *buf, size_t buflen)
memcpy(&seq->gs_init_value, buf + len, sizeof(GTM_Sequence));
len += sizeof(GTM_Sequence); /* gs_init_value */
+#ifdef XCP
+ memcpy(&seq->gs_max_lastvals, buf + len, sizeof(uint32));
+ len += sizeof(uint32); /* gs_max_lastvals */
+ if (seq->gs_max_lastvals > 0)
+ seq->gs_last_values = (GTM_SeqLastVal *)
+ genAlloc(seq->gs_max_lastvals * sizeof(GTM_SeqLastVal));
+ memcpy(&seq->gs_lastval_count, buf + len, sizeof(uint32));
+ len += sizeof(uint32); /* gs_lastval_count */
+ if (seq->gs_lastval_count > 0)
+ {
+ memcpy(seq->gs_last_values, buf + len,
+ seq->gs_lastval_count * sizeof(GTM_SeqLastVal));
+ len += seq->gs_lastval_count * sizeof(GTM_SeqLastVal); /* gs_last_values */
+ }
+#else
memcpy(&seq->gs_last_value, buf + len, sizeof(GTM_Sequence));
len += sizeof(GTM_Sequence); /* gs_last_value */
+#endif
memcpy(&seq->gs_increment_by, buf + len, sizeof(GTM_Sequence));
len += sizeof(GTM_Sequence); /* gs_increment_by */
@@ -1014,5 +1151,5 @@ gtm_deserialize_sequence(const char *buf, size_t buflen)
memcpy(&seq->gs_state, buf + len, sizeof(uint32));
len += sizeof(uint32);
- return seq;
+ return len;
}
diff --git a/src/gtm/common/gtm_utils.c b/src/gtm/common/gtm_utils.c
index ea6988640d..081660ec57 100644
--- a/src/gtm/common/gtm_utils.c
+++ b/src/gtm/common/gtm_utils.c
@@ -71,6 +71,7 @@ static struct enum_name message_name_tab[] =
{MSG_SNAPSHOT_GXID_GET, "MSG_SNAPSHOT_GXID_GET"},
{MSG_SEQUENCE_INIT, "MSG_SEQUENCE_INIT"},
{MSG_BKUP_SEQUENCE_INIT, "MSG_BKUP_SEQUENCE_INIT"},
+ {MSG_SEQUENCE_GET_CURRENT, "MSG_SEQUENCE_GET_CURRENT"},
{MSG_SEQUENCE_GET_NEXT, "MSG_SEQUENCE_GET_NEXT"},
{MSG_BKUP_SEQUENCE_GET_NEXT, "MSG_BKUP_SEQUENCE_GET_NEXT"},
{MSG_SEQUENCE_GET_LAST, "MSG_SEQUENCE_GET_LAST"},
@@ -124,6 +125,7 @@ static struct enum_name result_name_tab[] =
{SNAPSHOT_GET_MULTI_RESULT, "SNAPSHOT_GET_MULTI_RESULT"},
{SNAPSHOT_GXID_GET_RESULT, "SNAPSHOT_GXID_GET_RESULT"},
{SEQUENCE_INIT_RESULT, "SEQUENCE_INIT_RESULT"},
+ {SEQUENCE_GET_CURRENT_RESULT, "SEQUENCE_GET_CURRENT_RESULT"},
{SEQUENCE_GET_NEXT_RESULT, "SEQUENCE_GET_NEXT_RESULT"},
{SEQUENCE_GET_LAST_RESULT, "SEQUENCE_GET_LAST_RESULT"},
{SEQUENCE_SET_VAL_RESULT, "SEQUENCE_SET_VAL_RESULT"},
diff --git a/src/gtm/gtm_ctl/.gitignore b/src/gtm/gtm_ctl/.gitignore
new file mode 100644
index 0000000000..ffe90d63fc
--- /dev/null
+++ b/src/gtm/gtm_ctl/.gitignore
@@ -0,0 +1 @@
+/gtm_ctl
diff --git a/src/gtm/gtm_ctl/Makefile b/src/gtm/gtm_ctl/Makefile
new file mode 100644
index 0000000000..6b079b7832
--- /dev/null
+++ b/src/gtm/gtm_ctl/Makefile
@@ -0,0 +1,34 @@
+#----------------------------------------------------------------------------
+#
+# Postgres-XC GTM gtm_ctl makefile
+#
+# Copyright(c) 2010-2012 Postgres-XC Development Group
+#
+# src/gtm/gtm_ctl/Makefile
+#
+#-----------------------------------------------------------------------------
+top_builddir=../../..
+include $(top_builddir)/src/Makefile.global
+subdir=src/gtm/gtm_ctl
+
+OBJS=gtm_ctl.o
+
+# GTM static libraries the binary links against (built in sibling dirs)
+OTHERS=../common/libgtm.a ../libpq/libpqcomm.a ../client/libgtmclient.a ../path/libgtmpath.a
+
+LDFLAGS=-L$(top_builddir)/common -L$(top_builddir)/libpq
+
+
+LIBS=-lpthread
+
+# NOTE(review): $(LIBS) is placed before the object files ($^) on the link
+# line; linkers that resolve symbols left-to-right may need -lpthread to
+# follow the objects instead. Confirm against the supported toolchains.
+gtm_ctl:$(OBJS)
+ $(CC) $(CFLAGS) $(LDFLAGS) $(LIBS) $^ $(OTHERS) -o gtm_ctl
+
+all:gtm_ctl
+
+clean:
+ rm -f $(OBJS)
+ rm -f gtm_ctl
+
+distclean: clean
+
+maintainer-clean: distclean
diff --git a/src/gtm/gtm_ctl/gtm_ctl.c b/src/gtm/gtm_ctl/gtm_ctl.c
new file mode 100644
index 0000000000..29c78c8d00
--- /dev/null
+++ b/src/gtm/gtm_ctl/gtm_ctl.c
@@ -0,0 +1,1317 @@
+/*-------------------------------------------------------------------------
+ *
+ * gtm_ctl --- start/stops/restarts the GTM server/proxy
+ *
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "gtm/gtm_c.h"
+#include "gtm/libpq-fe.h"
+
+#include <locale.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#ifdef HAVE_SYS_RESOURCE_H
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
+#include "libpq/pqsignal.h"
+
+/* PID can be negative for standalone backend */
+typedef long pgpid_t;
+
+typedef enum
+{
+ SMART_MODE,
+ FAST_MODE,
+ IMMEDIATE_MODE
+} ShutdownMode;
+
+
+typedef enum
+{
+ NO_COMMAND = 0,
+ START_COMMAND,
+ STOP_COMMAND,
+ PROMOTE_COMMAND,
+ RESTART_COMMAND,
+ STATUS_COMMAND,
+ RECONNECT_COMMAND
+} CtlCommand;
+
+#define DEFAULT_WAIT 60
+
+static bool do_wait = false;
+static bool wait_set = false;
+static int wait_seconds = DEFAULT_WAIT;
+static bool silent_mode = false;
+static ShutdownMode shutdown_mode = SMART_MODE;
+static int sig = SIGTERM; /* default */
+static CtlCommand ctl_command = NO_COMMAND;
+static char *gtm_data = NULL;
+static char *gtmdata_opt = NULL;
+static char *gtm_opts = NULL;
+static const char *progname;
+static char *log_file = NULL;
+static char *gtm_path = NULL;
+static char *gtm_app = NULL;
+static char *argv0 = NULL;
+
+static void
+write_stderr(const char *fmt,...)
+/* This extension allows gcc to check the format string for consistency with
+ the supplied arguments. */
+__attribute__((format(printf, 1, 2)));
+static void *pg_malloc(size_t size);
+static char *xstrdup(const char *s);
+static void do_advice(void);
+static void do_help(void);
+static void set_mode(char *modeopt);
+static void do_start(void);
+static void do_stop(void);
+static void do_restart(void);
+static void do_reconnect(void);
+static void print_msg(const char *msg);
+
+static pgpid_t get_pgpid(void);
+static char **readfile(const char *path);
+static int start_gtm(void);
+static void read_gtm_opts(void);
+
+static bool test_gtm_connection();
+static bool gtm_is_alive(pid_t pid);
+
+static void *pg_realloc(void *ptr, size_t size);
+static int RunAsDaemon(char *cmd);
+
+static char gtmopts_file[MAXPGPATH];
+static char pid_file[MAXPGPATH];
+static char conf_file[MAXPGPATH];
+
+/*
+ * Write errors to stderr (or by gtm_equal means when stderr is
+ * not available).
+ */
+static void
+write_stderr(const char *fmt,...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+
+ /* On Unix, we just fprintf to stderr */
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * routines to check memory allocations and fail noisily.
+ */
+
+static void *
+pg_malloc(size_t size)
+{
+ void *result;
+
+ result = malloc(size);
+ if (!result)
+ {
+ write_stderr(_("%s: out of memory\n"), progname);
+ exit(1);
+ }
+ return result;
+}
+
+
+static char *
+xstrdup(const char *s)
+{
+ char *result;
+
+ result = strdup(s);
+ if (!result)
+ {
+ write_stderr(_("%s: out of memory\n"), progname);
+ exit(1);
+ }
+ return result;
+}
+
+/*
+ * Given an already-localized string, print it to stdout unless the
+ * user has specified that no messages should be printed.
+ */
+static void
+print_msg(const char *msg)
+{
+ if (!silent_mode)
+ {
+ fputs(msg, stdout);
+ fflush(stdout);
+ }
+}
+
+static pgpid_t
+get_pgpid(void)
+{
+ FILE *pidf;
+ long pid;
+
+ pidf = fopen(pid_file, "r");
+ if (pidf == NULL)
+ {
+ /* No pid file, not an error on startup */
+ if (errno == ENOENT)
+ return 0;
+ else
+ {
+ write_stderr(_("%s: could not open PID file \"%s\": %s\n"),
+ progname, pid_file, strerror(errno));
+ exit(1);
+ }
+ }
+ if (fscanf(pidf, "%ld", &pid) != 1)
+ {
+ write_stderr(_("%s: invalid data in PID file \"%s\"\n"),
+ progname, pid_file);
+ exit(1);
+ }
+ fclose(pidf);
+ return (pgpid_t) pid;
+}
+
+
+/*
+ * get the lines from a text file - return NULL if file can't be opened
+ */
+static char **
+readfile(const char *path)
+{
+ FILE *infile;
+ int maxlength = 0,
+ linelen = 0;
+ int nlines = 0;
+ char **result;
+ char *buffer;
+ int c;
+
+ if ((infile = fopen(path, "r")) == NULL)
+ return NULL;
+
+ /* pass over the file twice - the first time to size the result */
+
+ while ((c = fgetc(infile)) != EOF)
+ {
+ linelen++;
+ if (c == '\n')
+ {
+ nlines++;
+ if (linelen > maxlength)
+ maxlength = linelen;
+ linelen = 0;
+ }
+ }
+
+ /* handle last line without a terminating newline (yuck) */
+ if (linelen)
+ nlines++;
+ if (linelen > maxlength)
+ maxlength = linelen;
+
+ /* set up the result and the line buffer */
+ result = (char **) pg_malloc((nlines + 1) * sizeof(char *));
+ buffer = (char *) pg_malloc(maxlength + 1);
+
+ /* now reprocess the file and store the lines */
+ rewind(infile);
+ nlines = 0;
+ while (fgets(buffer, maxlength + 1, infile) != NULL)
+ result[nlines++] = xstrdup(buffer);
+
+ fclose(infile);
+ free(buffer);
+ result[nlines] = NULL;
+
+ return result;
+}
+
+
+
+/*
+ * start/test/stop routines
+ */
+
+static int
+start_gtm(void)
+{
+ char cmd[MAXPGPATH];
+ char gtm_app_path[MAXPGPATH];
+ int len;
+
+ /*
+ * Since there might be quotes to handle here, it is easier simply to pass
+ * everything to a shell to process them.
+ */
+
+ memset(gtm_app_path, 0, MAXPGPATH);
+ memset(cmd, 0, MAXPGPATH);
+
+ /*
+ * Build gtm binary path. We should leave one byte at the end for '\0'
+ */
+ len = 0;
+ if (gtm_path != NULL)
+ {
+ strncpy(gtm_app_path, gtm_path, MAXPGPATH - len - 1);
+
+ len = strlen(gtm_app_path);
+ strncat(gtm_app_path, "/", MAXPGPATH - len - 1);
+
+ len = strlen(gtm_app_path);
+ }
+
+ if (strlen(gtm_app) >= (MAXPGPATH - len - 1))
+ {
+ write_stderr("gtm command exceeds max size");
+ exit(1);
+ }
+
+ strncat(gtm_app_path, gtm_app, MAXPGPATH - len - 1);
+
+ if (log_file != NULL)
+ len = snprintf(cmd, MAXPGPATH - 1, SYSTEMQUOTE "\"%s\" %s%s -l %s &" SYSTEMQUOTE,
+ gtm_app_path, gtmdata_opt, gtm_opts, log_file);
+ else
+ len = snprintf(cmd, MAXPGPATH - 1, SYSTEMQUOTE "\"%s\" %s%s < \"%s\" 2>&1 &" SYSTEMQUOTE,
+ gtm_app_path, gtmdata_opt, gtm_opts, DEVNULL);
+
+ if (len >= MAXPGPATH - 1)
+ {
+ write_stderr("gtm command exceeds max size");
+ exit(1);
+ }
+
+ if (log_file)
+ return (RunAsDaemon(cmd));
+ else
+ return system(cmd);
+}
+
+/*
+ * Run specified command as a daemon.
+ * Assume that *cmd includes '&' to run
+ * the command at background so that we need fork()
+ * only once.
+ */
+static int RunAsDaemon(char *cmd)
+{
+ switch (fork())
+ {
+ int status;
+
+ case 0:
+ /*
+ * Using fileno(xxx) may encounter trivial error because xxx may
+ * have been closed at somewhere else and fileno() may fail.
+ * Its safer to use literal file descriptor here.
+ */
+ close(0);
+ close(1);
+ close(2);
+ if ((status = system(cmd)) == -1)
+ /*
+ * Same behavior as /bin/sh could not be
+ * executed.
+ */
+ exit(127);
+ else
+ exit(WEXITSTATUS(status));
+ break;
+ case -1:
+ return -1;
+ default:
+ return 0;
+ break;
+ }
+}
+
+
+/*
+ * Find the gtm port and try a connection
+ */
+static bool
+test_gtm_connection()
+{
+ GTM_Conn *conn;
+ bool success = false;
+ int i;
+ char portstr[32];
+ char *p;
+ char *q;
+ char connstr[128]; /* Should be way more than enough! */
+
+ *portstr = '\0';
+
+ /*
+ * Look in gtm_opts for a -p switch.
+ *
+ * This parsing code is not amazingly bright; it could for instance
+ * get fooled if ' -p' occurs within a quoted argument value. Given
+ * that few people pass complicated settings in gtm_opts, it's
+ * probably good enough.
+ */
+ for (p = gtm_opts; *p;)
+ {
+ /* advance past whitespace */
+ while (isspace((unsigned char) *p))
+ p++;
+
+ if (strncmp(p, "-p", 2) == 0)
+ {
+ p += 2;
+ /* advance past any whitespace/quoting */
+ while (isspace((unsigned char) *p) || *p == '\'' || *p == '"')
+ p++;
+ /* find end of value (not including any ending quote!) */
+ q = p;
+ while (*q &&
+ !(isspace((unsigned char) *q) || *q == '\'' || *q == '"'))
+ q++;
+ /* and save the argument value */
+ strlcpy(portstr, p, Min((q - p) + 1, sizeof(portstr)));
+ /* keep looking, maybe there is another -p */
+ p = q;
+ }
+ /* Advance to next whitespace */
+ while (*p && !isspace((unsigned char) *p))
+ p++;
+ }
+
+ /*
+ * Search config file for a 'port' option.
+ *
+ * This parsing code isn't amazingly bright either, but it should be okay
+ * for valid port settings.
+ */
+ if (!*portstr)
+ {
+ char **optlines;
+
+ optlines = readfile(conf_file);
+ if (optlines != NULL)
+ {
+ for (; *optlines != NULL; optlines++)
+ {
+ p = *optlines;
+
+ while (isspace((unsigned char) *p))
+ p++;
+ if (strncmp(p, "port", 4) != 0)
+ continue;
+ p += 4;
+ while (isspace((unsigned char) *p))
+ p++;
+ if (*p != '=')
+ continue;
+ p++;
+ /* advance past any whitespace/quoting */
+ while (isspace((unsigned char) *p) || *p == '\'' || *p == '"')
+ p++;
+ /* find end of value (not including any ending quote/comment!) */
+ q = p;
+ while (*q &&
+ !(isspace((unsigned char) *q) ||
+ *q == '\'' || *q == '"' || *q == '#'))
+ q++;
+ /* and save the argument value */
+ strlcpy(portstr, p, Min((q - p) + 1, sizeof(portstr)));
+ /* keep looking, maybe there is another */
+ }
+ }
+ }
+
+ /* Still not found? Use compiled-in default */
+#define GTM_DEFAULT_PORT 6666
+ if (!*portstr)
+ snprintf(portstr, sizeof(portstr), "%d", GTM_DEFAULT_PORT);
+
+ /*
+ * We need to set a connect timeout otherwise on Windows the SCM will
+ * probably timeout first
+ * a PGXC node ID has to be set for GTM connection protocol,
+ * so its value doesn't really matter here.
+ */
+ snprintf(connstr, sizeof(connstr),
+ "host=localhost port=%s connect_timeout=5 node_name=one", portstr);
+
+ for (i = 0; i < wait_seconds; i++)
+ {
+ if ((conn = PQconnectGTM(connstr)) != NULL &&
+ (GTMPQstatus(conn) == CONNECTION_OK))
+ {
+ GTMPQfinish(conn);
+ success = true;
+ break;
+ }
+ else
+ {
+ GTMPQfinish(conn);
+ print_msg(".");
+ sleep(1); /* 1 sec */
+ }
+ }
+
+ return success;
+}
+
+static void
+read_gtm_opts(void)
+{
+ if (gtm_opts == NULL)
+ {
+ gtm_opts = ""; /* default */
+ if (ctl_command == RESTART_COMMAND)
+ {
+ char **optlines;
+
+ optlines = readfile(gtmopts_file);
+ if (optlines == NULL)
+ {
+ write_stderr(_("%s: could not read file \"%s\"\n"), progname, gtmopts_file);
+ exit(1);
+ }
+ else if (optlines[0] == NULL || optlines[1] != NULL)
+ {
+ write_stderr(_("%s: option file \"%s\" must have exactly one line\n"),
+ progname, gtmopts_file);
+ exit(1);
+ }
+ else
+ {
+ int len;
+ char *optline;
+
+ optline = optlines[0];
+ /* trim off line endings */
+ len = strcspn(optline, "\r\n");
+ optline[len] = '\0';
+
+ gtm_opts = optline;
+ }
+ }
+ }
+}
+
+static void
+do_start(void)
+{
+ pgpid_t pid;
+ pgpid_t old_pid = 0;
+ int exitcode;
+
+ if (ctl_command != RESTART_COMMAND)
+ {
+ old_pid = get_pgpid();
+ if (old_pid != 0)
+ write_stderr(_("%s: another server might be running; "
+ "trying to start server anyway\n"),
+ progname);
+ }
+
+ read_gtm_opts();
+
+ exitcode = start_gtm();
+ if (exitcode != 0)
+ {
+ write_stderr(_("%s: could not start server: exit code was %d\n"),
+ progname, exitcode);
+ exit(1);
+ }
+
+ if (old_pid != 0)
+ {
+ sleep(1);
+ pid = get_pgpid();
+ if (pid == old_pid)
+ {
+ write_stderr(_("%s: could not start server\n"
+ "Examine the log output.\n"),
+ progname);
+ exit(1);
+ }
+ }
+
+ if (do_wait)
+ {
+ print_msg(_("waiting for server to start..."));
+
+ if (test_gtm_connection() == false)
+ {
+ printf(_("could not start server\n"));
+ exit(1);
+ }
+ else
+ {
+ print_msg(_(" done\n"));
+ print_msg(_("server started\n"));
+ }
+ }
+ else
+ print_msg(_("server starting\n"));
+}
+
+
+static void
+do_stop(void)
+{
+ int cnt;
+ pgpid_t pid;
+
+ pid = get_pgpid();
+
+ if (pid == 0) /* no pid file */
+ {
+ write_stderr(_("%s: PID file \"%s\" does not exist\n"), progname, pid_file);
+ write_stderr(_("Is server running?\n"));
+ exit(1);
+ }
+ else if (pid < 0) /* standalone backend, not gtm */
+ {
+ pid = -pid;
+ write_stderr(_("%s: cannot stop server; "
+ "single-user server is running (PID: %ld)\n"),
+ progname, pid);
+ exit(1);
+ }
+
+ if (kill((pid_t) pid, sig) != 0)
+ {
+ write_stderr(_("%s: could not send stop signal (PID: %ld): %s\n"), progname, pid,
+ strerror(errno));
+ exit(1);
+ }
+
+ if (!do_wait)
+ {
+ print_msg(_("server shutting down\n"));
+ return;
+ }
+ else
+ {
+ print_msg(_("waiting for server to shut down..."));
+
+ for (cnt = 0; cnt < wait_seconds; cnt++)
+ {
+ if ((pid = get_pgpid()) != 0)
+ {
+ print_msg(".");
+ sleep(1); /* 1 sec */
+ }
+ else
+ break;
+ }
+
+ if (pid != 0) /* pid file still exists */
+ {
+ print_msg(_(" failed\n"));
+
+ write_stderr(_("%s: server does not shut down\n"), progname);
+ exit(1);
+ }
+ print_msg(_(" done\n"));
+
+ printf(_("server stopped\n"));
+ }
+}
+
+static void
+do_promote(void)
+{
+ pgpid_t pid;
+
+ pid = get_pgpid();
+
+ if (pid == 0) /* no pid file */
+ {
+ write_stderr(_("%s: PID file \"%s\" does not exist\n"), progname, pid_file);
+ write_stderr(_("Is server running?\n"));
+ exit(1);
+ }
+ else if (pid < 0) /* standalone backend, not gtm */
+ {
+ pid = -pid;
+ write_stderr(_("%s: cannot promote server; "
+ "single-user server is running (PID: %ld)\n"),
+ progname, pid);
+ exit(1);
+ }
+
+ if (kill((pid_t) pid, SIGUSR1) != 0)
+ {
+ write_stderr(_("%s: could not send promote signal (PID: %ld): %s\n"), progname, pid,
+ strerror(errno));
+ exit(1);
+ }
+}
+
+/*
+ * At least we expect the following argument
+ *
+ * 1) -D datadir
+ * 2) -o options: we expect that -t and -s options are specified here.
+ * Check will be done in GTM-Proxy. If there's an error, it will be
+ * logged. In this case, GTM-Proxy won't terminate. It will continue
+ * to read/write with old GTM.
+ *
+ * Because they are not passed to gtm directly, they should appear in
+ * gtm_ctl argument, not in -o options. They're specific to gtm_ctl
+ * reconnect.
+ */
+static void
+do_reconnect(void)
+{
+ pgpid_t pid;
+ char *reconnect_point_file_nam;
+ FILE *reconnect_point_file;
+
+#ifdef GTM_SBY_DEBUG
+ write_stderr("Reconnecting to new GTM ... DEBUG MODE.");
+#endif
+
+ /*
+ * Target must be "gtm_proxy"
+ */
+ if (strcmp(gtm_app, "gtm_proxy") != 0)
+ {
+ write_stderr(_("%s: only gtm_proxy can accept reconnect command\n"), progname);
+ exit(1);
+ }
+ pid = get_pgpid();
+
+ if (pid == 0) /* no pid file */
+ {
+ write_stderr(_("%s: PID file \"%s\" does not exist\n"), progname, pid_file);
+ write_stderr(_("Is server running?\n"));
+ exit(1);
+ }
+ else if (pid < 0) /* standalone backend, not gtm */
+ {
+ pid = -pid;
+ write_stderr(_("%s: cannot promote server; "
+ "single-user server is running (PID: %ld)\n"),
+ progname, pid);
+ exit(1);
+ }
+ read_gtm_opts();
+ /*
+ * Pass reconnect info to GTM-Proxy.
+ *
+ * Option arguments are written to new gtm file under -D directory.
+ */
+ reconnect_point_file_nam = malloc(strlen(gtm_data) + 9);
+ if (reconnect_point_file_nam == NULL)
+ {
+ write_stderr(_("%s: No memory available.\n"), progname);
+ exit(1);
+ }
+
+ snprintf(reconnect_point_file_nam, strlen(gtm_data) + 8, "%s/newgtm", gtm_data);
+ reconnect_point_file = fopen(reconnect_point_file_nam, "w");
+
+ if (reconnect_point_file == NULL)
+ {
+ write_stderr(_("%s: Cannot open reconnect point file %s\n"), progname, reconnect_point_file_nam);
+ exit(1);
+ }
+
+ fprintf(reconnect_point_file, "%s\n", gtm_opts);
+ fclose(reconnect_point_file);
+ free(reconnect_point_file_nam);
+
+ if (kill((pid_t) pid, SIGUSR1) != 0)
+ {
+ write_stderr(_("%s: could not send promote signal (PID: %ld): %s\n"), progname, pid,
+ strerror(errno));
+ exit(1);
+ }
+}
+
+
+/*
+ * restart/reload routines
+ */
+
+static void
+do_restart(void)
+{
+ int cnt;
+ pgpid_t pid;
+
+ pid = get_pgpid();
+
+ if (pid == 0) /* no pid file */
+ {
+ write_stderr(_("%s: PID file \"%s\" does not exist\n"),
+ progname, pid_file);
+ write_stderr(_("Is server running?\n"));
+ write_stderr(_("starting server anyway\n"));
+ do_start();
+ return;
+ }
+ else if (pid < 0) /* standalone backend, not gtm */
+ {
+ pid = -pid;
+ if (gtm_is_alive((pid_t) pid))
+ {
+ write_stderr(_("%s: cannot restart server; "
+ "single-user server is running (PID: %ld)\n"),
+ progname, pid);
+ write_stderr(_("Please terminate the single-user server and try again.\n"));
+ exit(1);
+ }
+ }
+
+ if (gtm_is_alive((pid_t) pid))
+ {
+ if (kill((pid_t) pid, sig) != 0)
+ {
+ write_stderr(_("%s: could not send stop signal (PID: %ld): %s\n"), progname, pid,
+ strerror(errno));
+ exit(1);
+ }
+
+ print_msg(_("waiting for server to shut down..."));
+
+ /* always wait for restart */
+
+ for (cnt = 0; cnt < wait_seconds; cnt++)
+ {
+ if ((pid = get_pgpid()) != 0)
+ {
+ print_msg(".");
+ sleep(1); /* 1 sec */
+ }
+ else
+ break;
+ }
+
+ if (pid != 0) /* pid file still exists */
+ {
+ print_msg(_(" failed\n"));
+
+ write_stderr(_("%s: server does not shut down\n"), progname);
+ exit(1);
+ }
+
+ print_msg(_(" done\n"));
+ printf(_("server stopped\n"));
+ }
+ else
+ {
+ write_stderr(_("%s: old server process (PID: %ld) seems to be gone\n"),
+ progname, pid);
+ write_stderr(_("starting server anyway\n"));
+ }
+
+ do_start();
+}
+
+
+static void
+do_status(void)
+{
+ pgpid_t pid;
+ char datpath[MAXPGPATH];
+ int mode;
+ FILE *pidf;
+
+ /*
+ * Read a PID file to get GTM server status instead of attaching shared memory.
+ */
+ pidf = fopen(pid_file, "r");
+ if (pidf == NULL)
+ {
+ write_stderr(_("%s: could not open PID file \"%s\": %s\n"),
+ progname, pid_file, strerror(errno));
+ exit(1);
+ }
+
+ if (fscanf(pidf, "%ld", &pid) != 1)
+ {
+ write_stderr(_("%s: invalid data in PID file \"%s\"\n"),
+ progname, pid_file);
+ exit(1);
+ }
+
+ if (fscanf(pidf, "%s", datpath) != 1)
+ {
+ write_stderr(_("%s: invalid data in PID file \"%s\"\n"),
+ progname, pid_file);
+ exit(1);
+ }
+
+ if (fscanf(pidf, "%d", &mode) != 1)
+ {
+ write_stderr(_("%s: invalid data in PID file \"%s\"\n"),
+ progname, pid_file);
+ exit(1);
+ }
+
+ fclose(pidf);
+
+ pid = get_pgpid();
+
+ if (pid == 0) /* no pid file */
+ {
+ write_stderr(_("%s: PID file \"%s\" does not exist\n"),
+ progname, pid_file);
+ write_stderr(_("Is server running?\n"));
+ exit(1);
+ }
+ else if (pid < 0) /* standalone backend, not gtm */
+ {
+ pid = -pid;
+ if (gtm_is_alive((pid_t) pid))
+ {
+ write_stderr(_("%s: cannot get server status; "
+ "single-user server is running (PID: %ld)\n"),
+ progname, pid);
+ write_stderr(_("Please terminate the single-user server and try again.\n"));
+ exit(1);
+ }
+ }
+ else
+ {
+ if (gtm_is_alive((pid_t) pid))
+ {
+ char **optlines;
+
+ printf(_("%s: server is running (PID: %ld)\n"),
+ progname, pid);
+
+ optlines = readfile(gtmopts_file);
+ if (optlines != NULL)
+ for (; *optlines != NULL; optlines++)
+ fputs(*optlines, stdout);
+ return;
+ }
+ }
+
+ write_stderr(_("%s: no server running\n"), progname);
+ exit(1);
+}
+
+
+/*
+ * utility routines
+ */
+
+static bool
+gtm_is_alive(pid_t pid)
+{
+ /*
+ * Test to see if the process is still there. Note that we do not
+ * consider an EPERM failure to mean that the process is still there;
+ * EPERM must mean that the given PID belongs to some other userid, and
+ * considering the permissions on $GTMDATA, that means it's not the
+ * gtm we are after.
+ *
+ * Don't believe that our own PID or parent shell's PID is the gtm,
+ * either. (Windows hasn't got getppid(), though.)
+ */
+ if (pid == getpid())
+ return false;
+#ifndef WIN32
+ if (pid == getppid())
+ return false;
+#endif
+ if (kill(pid, 0) == 0)
+ return true;
+ return false;
+}
+
+static void
+do_advice(void)
+{
+ write_stderr(_("Try \"%s --help\" for more information.\n"), progname);
+}
+
+
+static void
+do_help(void)
+{
+ printf(_("%s is a utility to start, stop or restart,\n"
+ "a GTM server, a GTM standby or GTM proxy.\n\n"), progname);
+ printf(_("Usage:\n"));
+ printf(_(" %s start -Z STARTUP_MODE [-w] [-t SECS] [-D DATADIR] [-l FILENAME] [-o \"OPTIONS\"]\n"), progname);
+ printf(_(" %s stop -Z STARTUP_MODE [-W] [-t SECS] [-D DATADIR] [-m SHUTDOWN-MODE]\n"), progname);
+ printf(_(" %s promote -Z STARTUP_MODE [-w] [-t SECS] [-D DATADIR]\n"), progname);
+ printf(_(" %s restart -Z STARTUP_MODE [-w] [-t SECS] [-D DATADIR] [-m SHUTDOWN-MODE]\n"
+ " [-o \"OPTIONS\"]\n"), progname);
+ printf(_(" %s status -Z STARTUP_MODE [-w] [-t SECS] [-D DATADIR]\n"), progname);
+ printf(_(" %s reconnect -Z STARTUP_MODE [-D DATADIR] -o \"OPTIONS\"]\n"), progname);
+
+ printf(_("\nCommon options:\n"));
+ printf(_(" -D DATADIR location of the database storage area\n"));
+ printf(_(" -i nodename set gtm_proxy nodename registered on GTM\n"));
+ printf(_(" (option ignored if used with GTM)\n"));
+ printf(_(" -t SECS seconds to wait when using -w option\n"));
+ printf(_(" -w wait until operation completes\n"));
+ printf(_(" -W do not wait until operation completes\n"));
+ printf(_(" --help show this help, then exit\n"));
+ printf(_("(The default is to wait for shutdown, but not for start or restart.)\n\n"));
+
+ printf(_("\nOptions for start or restart:\n"));
+ printf(_(" -l FILENAME write (or append) server log to FILENAME\n"));
+ printf(_(" -o OPTIONS command line options to pass to gtm\n"
+ " (GTM server executable)\n"));
+ printf(_(" -p PATH-TO-GTM/PROXY path to gtm/gtm_proxy executables\n"));
+ printf(_(" -Z STARTUP-MODE can be \"gtm\", \"gtm_standby\" or \"gtm_proxy\"\n"));
+ printf(_("\nOptions for stop or restart:\n"));
+ printf(_(" -m SHUTDOWN-MODE can be \"smart\", \"fast\", or \"immediate\"\n"));
+
+ printf(_("\nOptions for reconnect:\n"));
+ printf(_(" -t NewGTMPORT Port number of new GTM.\n"));
+ printf(_(" -s NewGTMHost Host Name of new GTM.\n"));
+
+ printf(_("\nShutdown modes are:\n"));
+ printf(_(" smart quit after all clients have disconnected\n"));
+ printf(_(" fast quit directly, with proper shutdown\n"));
+ printf(_(" immediate quit without complete shutdown; will lead to recovery on restart\n"));
+}
+
+
+static void
+set_mode(char *modeopt)
+{
+ if (strcmp(modeopt, "s") == 0 || strcmp(modeopt, "smart") == 0)
+ {
+ shutdown_mode = SMART_MODE;
+ sig = SIGTERM;
+ }
+ else if (strcmp(modeopt, "f") == 0 || strcmp(modeopt, "fast") == 0)
+ {
+ shutdown_mode = FAST_MODE;
+ sig = SIGINT;
+ }
+ else if (strcmp(modeopt, "i") == 0 || strcmp(modeopt, "immediate") == 0)
+ {
+ shutdown_mode = IMMEDIATE_MODE;
+ sig = SIGQUIT;
+ }
+ else
+ {
+ write_stderr(_("%s: unrecognized shutdown mode \"%s\"\n"), progname, modeopt);
+ do_advice();
+ exit(1);
+ }
+}
+
+int
+main(int argc, char **argv)
+{
+ int c;
+ char *nodename = NULL; /* GTM Proxy nodename */
+
+ progname = "gtm_ctl";
+
+ /*
+ * save argv[0] so do_start() can look for the gtm if necessary. we
+ * don't look for gtm here because in many cases we won't need it.
+ */
+ argv0 = argv[0];
+
+ umask(077);
+
+ /* support --help and --version even if invoked as root */
+ if (argc > 1)
+ {
+ if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0 ||
+ strcmp(argv[1], "-?") == 0)
+ {
+ do_help();
+ exit(0);
+ }
+ }
+
+ /*
+ * Disallow running as root, to forestall any possible security holes.
+ */
+ if (geteuid() == 0)
+ {
+ write_stderr(_("%s: cannot be run as root\n"
+ "Please log in (using, e.g., \"su\") as the "
+ "(unprivileged) user that will\n"
+ "own the server process.\n"),
+ progname);
+ exit(1);
+ }
+
+ /*
+ * 'Action' can be before or after args so loop over both. Some
+ * getopt_long() implementations will reorder argv[] to place all flags
+ * first (GNU?), but we don't rely on it. Our /port version doesn't do
+ * that.
+ */
+ optind = 1;
+
+ /* process command-line options */
+ while (optind < argc)
+ {
+ while ((c = getopt(argc, argv, "D:i:l:m:o:p:t:wWZ:")) != -1)
+ {
+ switch (c)
+ {
+ case 'D':
+ {
+ char *gtmdata_D;
+ char *env_var = pg_malloc(strlen(optarg) + 9);
+
+ gtmdata_D = xstrdup(optarg);
+ canonicalize_path(gtmdata_D);
+ snprintf(env_var, strlen(optarg) + 9, "GTMDATA=%s",
+ gtmdata_D);
+ putenv(env_var);
+
+ /*
+ * We could pass GTMDATA just in an environment
+ * variable but we do -D too for clearer gtm
+ * 'ps' display
+ */
+ gtmdata_opt = (char *) pg_malloc(strlen(gtmdata_D) + 8);
+ snprintf(gtmdata_opt, strlen(gtmdata_D) + 8,
+ "-D \"%s\" ",
+ gtmdata_D);
+ break;
+ }
+ case 'i':
+ nodename = strdup(optarg);
+ break;
+ case 'l':
+ log_file = xstrdup(optarg);
+ break;
+ case 'm':
+ set_mode(optarg);
+ break;
+ case 'o':
+ gtm_opts = xstrdup(optarg);
+ break;
+ case 'p':
+ gtm_path = xstrdup(optarg);
+ canonicalize_path(gtm_path);
+ break;
+ case 't':
+ wait_seconds = atoi(optarg);
+ break;
+ case 'w':
+ do_wait = true;
+ wait_set = true;
+ break;
+ case 'W':
+ do_wait = false;
+ wait_set = true;
+ break;
+ case 'Z':
+ gtm_app = xstrdup(optarg);
+ if (strcmp(gtm_app,"gtm_proxy") != 0
+ && strcmp(gtm_app,"gtm_standby") != 0
+ && strcmp(gtm_app,"gtm") != 0)
+ {
+ write_stderr(_("%s: %s launch name set not correct\n"), progname, gtm_app);
+ do_advice();
+ exit(1);
+ }
+ break;
+ default:
+ /* getopt_long already issued a suitable error message */
+ do_advice();
+ exit(1);
+ }
+ }
+
+ /* Process an action */
+ if (optind < argc)
+ {
+ if (ctl_command != NO_COMMAND)
+ {
+ write_stderr(_("%s: too many command-line arguments (first is \"%s\")\n"), progname, argv[optind]);
+ do_advice();
+ exit(1);
+ }
+
+ if (strcmp(argv[optind], "start") == 0)
+ ctl_command = START_COMMAND;
+ else if (strcmp(argv[optind], "stop") == 0)
+ ctl_command = STOP_COMMAND;
+ else if (strcmp(argv[optind], "promote") == 0)
+ ctl_command = PROMOTE_COMMAND;
+ else if (strcmp(argv[optind], "restart") == 0)
+ ctl_command = RESTART_COMMAND;
+ else if (strcmp(argv[optind], "status") == 0)
+ ctl_command = STATUS_COMMAND;
+ else if (strcmp(argv[optind], "reconnect") == 0)
+ ctl_command = RECONNECT_COMMAND;
+ else
+ {
+ write_stderr(_("%s: unrecognized operation mode \"%s\"\n"),
+ progname, argv[optind]);
+ do_advice();
+ exit(1);
+ }
+ optind++;
+ }
+ }
+
+ if (ctl_command == NO_COMMAND)
+ {
+ write_stderr(_("%s: no operation specified\n"), progname);
+ do_advice();
+ exit(1);
+ }
+
+ gtm_data = getenv("GTMDATA");
+
+ if (gtm_data)
+ {
+ gtm_data = xstrdup(gtm_data);
+ canonicalize_path(gtm_data);
+ }
+
+ if (!gtm_data)
+ {
+ write_stderr("%s: no GTM/GTM Proxy directory specified \n",
+ progname);
+ do_advice();
+ exit(1);
+ }
+
+ /*
+ * pid files of gtm and gtm proxy are named differently
+ * -Z option has also to be set for STOP_COMMAND
+ * or gtm_ctl will not be able to find the correct pid_file
+ */
+ if (!gtm_app)
+ {
+ write_stderr("%s: no launch option not specified\n",
+ progname);
+ do_advice();
+ exit(1);
+ }
+
+ if (strcmp(gtm_app,"gtm_proxy") != 0 &&
+ strcmp(gtm_app, "gtm_standby") != 0 &&
+ strcmp(gtm_app,"gtm") != 0)
+ {
+ write_stderr(_("%s: launch option incorrect\n"),
+ progname);
+ do_advice();
+ exit(1);
+ }
+
+ /* Check if GTM Proxy ID is set, this is not necessary when stopping */
+ if (ctl_command == START_COMMAND ||
+ ctl_command == RESTART_COMMAND)
+ {
+ /* Rebuild option string to include Proxy ID */
+ if (strcmp(gtm_app, "gtm_proxy") == 0)
+ {
+ gtmdata_opt = (char *) pg_realloc(gtmdata_opt, strlen(gtmdata_opt) + 9);
+ if (nodename)
+ sprintf(gtmdata_opt, "%s -i %s ", gtmdata_opt, nodename);
+ else
+ sprintf(gtmdata_opt, "%s ", gtmdata_opt);
+ }
+ }
+
+ if (!wait_set)
+ {
+ switch (ctl_command)
+ {
+ case RESTART_COMMAND:
+ case START_COMMAND:
+ case PROMOTE_COMMAND:
+ case STATUS_COMMAND:
+ do_wait = false;
+ break;
+ case STOP_COMMAND:
+ do_wait = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ /* Build strings for pid file and option file */
+ if (strcmp(gtm_app,"gtm_proxy") == 0)
+ {
+ snprintf(pid_file, MAXPGPATH, "%s/gtm_proxy.pid", gtm_data);
+ snprintf(gtmopts_file, MAXPGPATH, "%s/gtm_proxy.opts", gtm_data);
+ snprintf(conf_file, MAXPGPATH, "%s/gtm_proxy.conf", gtm_data);
+ }
+ else if (strcmp(gtm_app,"gtm") == 0)
+ {
+ snprintf(pid_file, MAXPGPATH, "%s/gtm.pid", gtm_data);
+ snprintf(gtmopts_file, MAXPGPATH, "%s/gtm.opts", gtm_data);
+ snprintf(conf_file, MAXPGPATH, "%s/gtm.conf", gtm_data);
+ }
+ else if (strcmp(gtm_app,"gtm_standby") == 0)
+ {
+ snprintf(pid_file, MAXPGPATH, "%s/gtm.pid", gtm_data);
+ snprintf(gtmopts_file, MAXPGPATH, "%s/gtm.opts", gtm_data);
+ snprintf(conf_file, MAXPGPATH, "%s/gtm.conf", gtm_data);
+ }
+
+ if (ctl_command==STATUS_COMMAND)
+ gtm_opts = xstrdup("-c");
+
+ switch (ctl_command)
+ {
+ case START_COMMAND:
+ do_start();
+ break;
+ case STOP_COMMAND:
+ do_stop();
+ break;
+ case PROMOTE_COMMAND:
+ do_promote();
+ break;
+ case RESTART_COMMAND:
+ do_restart();
+ break;
+ case STATUS_COMMAND:
+ do_status();
+ break;
+ case RECONNECT_COMMAND:
+ do_reconnect();
+ break;
+ default:
+ break;
+ }
+
+ exit(0);
+}
+
+/*
+ * Safer versions of standard realloc C library function. If an
+ * out-of-memory condition occurs, these functions will bail out
+ * safely; therefore, its return value is guaranteed to be non-NULL.
+ */
+static void *
+pg_realloc(void *ptr, size_t size)
+{
+ void *tmp;
+
+ tmp = realloc(ptr, size);
+ if (!tmp)
+ write_stderr("out of memory\n");
+ return tmp;
+}
diff --git a/src/gtm/libpq/Makefile b/src/gtm/libpq/Makefile
index 4cbd004628..dd22b0dcb1 100644
--- a/src/gtm/libpq/Makefile
+++ b/src/gtm/libpq/Makefile
@@ -11,17 +11,19 @@ top_builddir=../../..
include $(top_builddir)/src/Makefile.global
subdir=src/gtm/libpq
-include $(top_srcdir)/src/backend/common.mk
+NAME=pqcomm
+SO_MAJOR_VERSION= 1
+SO_MINOR_VERSION= 0
-OBJS = ip.o pqcomm.o pqformat.o strlcpy.o pqsignal.o
+OBJS=ip.o pqcomm.o pqformat.o strlcpy.o pqsignal.o
-all: libgtmpq.a
+all:all-lib
-libgtmpq.a: $(OBJS)
- $(AR) $(AROPT) $@ $^
+include $(top_srcdir)/src/Makefile.shlib
clean:
- rm -f $(OBJS) libgtmpq.a
+ rm -f $(OBJS)
+ rm -f libpqcomm.so libpqcomm.so.1 libpqcomm.so.1.0
distclean: clean
diff --git a/src/gtm/libpq/pqcomm.c b/src/gtm/libpq/pqcomm.c
index 292b3c035b..d9a19a4707 100644
--- a/src/gtm/libpq/pqcomm.c
+++ b/src/gtm/libpq/pqcomm.c
@@ -297,16 +297,14 @@ StreamServerPort(int family, char *hostName, unsigned short portNumber,
continue;
}
-#define GTM_MAX_CONNECTIONS 1024
+#define GTM_MAX_CONNECTIONS 4096
/*
* Select appropriate accept-queue length limit. PG_SOMAXCONN is only
* intended to provide a clamp on the request on platforms where an
* overly large request provokes a kernel error (are there any?).
*/
- maxconn = GTM_MAX_CONNECTIONS * 2;
-
- err = listen(fd, maxconn);
+ err = listen(fd, GTM_MAX_CONNECTIONS);
if (err < 0)
{
ereport(LOG,
diff --git a/src/gtm/main/Makefile b/src/gtm/main/Makefile
index f85d977eac..d207e32dcc 100644
--- a/src/gtm/main/Makefile
+++ b/src/gtm/main/Makefile
@@ -15,38 +15,23 @@ ifneq ($(PORTNAME), win32)
override CFLAGS += $(PTHREAD_CFLAGS)
endif
-SUBDIRS = $(top_builddir)/src/gtm/client \
- $(top_builddir)/src/gtm/common \
- $(top_builddir)/src/gtm/config \
- $(top_builddir)/src/gtm/libpq \
- $(top_builddir)/src/gtm/path \
- $(top_builddir)/src/gtm/recovery
+OBJS=main.o gtm_thread.o gtm_txn.o gtm_seq.o gtm_snap.o gtm_time.o gtm_standby.o gtm_opt.o
-include $(top_srcdir)/src/backend/common.mk
+OTHERS= ../libpq/libpqcomm.a ../path/libgtmpath.a ../recovery/libgtmrecovery.a ../client/libgtmclient.a ../common/libgtm.a ../../port/libpgport.a
-OBJS = $(SUBDIROBJS) \
- $(top_builddir)/src/port/libpgport_srv.a \
- main.o gtm_thread.o gtm_txn.o gtm_seq.o gtm_snap.o gtm_time.o \
- gtm_standby.o gtm_opt.o register_gtm.o replication.o
+LDFLAGS=-L$(top_builddir)/common -L$(top_builddir)/libpq
-LIBS += $(PTHREAD_LIBS)
+LIBS=-lpthread
-all: gtm
+gtm:$(OBJS)
+ $(CC) $(CFLAGS) $(LDFLAGS) $(LIBS) $^ $(OTHERS) -o gtm
-gtm: $(OBJS) | submake-libpgport
- $(CC) $(CFLAGS) $(LDFLAGS) $(LIBS) $(call expand_subsys,$^) -o $@
+all:gtm
-install: all installdirs
- $(INSTALL_PROGRAM) gtm$(X) '$(DESTDIR)$(bindir)/gtm$(X)'
- $(INSTALL_DATA) $(srcdir)/gtm.conf.sample '$(DESTDIR)$(datadir)/gtm.conf.sample'
+clean:
+ rm -f $(OBJS)
+ rm -f gtm
-installdirs:
- $(MKDIR_P) '$(DESTDIR)$(bindir)' '$(DESTDIR)$(datadir)'
+distclean: clean
-uninstall:
- rm -f '$(DESTDIR)$(bindir)/gtm$(X)' '$(DESTDIR)$(datadir)/gtm.conf.sample'
-
-clean distclean maintainer-clean:
- rm -f gtm$(X) $(OBJS)
-
-$(top_builddir)/src/port/libpgport_srv.a: | submake-libpgport
+maintainer-clean: distclean
diff --git a/src/gtm/main/gtm_opt.c b/src/gtm/main/gtm_opt.c
index 4a0b20442f..f05be304cc 100644
--- a/src/gtm/main/gtm_opt.c
+++ b/src/gtm/main/gtm_opt.c
@@ -148,7 +148,7 @@ struct config_int ConfigureNamesInt[] =
0
},
&GTMPortNumber,
- 6666, 0, INT_MAX,
+ 0, 0, INT_MAX,
0, NULL
},
{
diff --git a/src/gtm/main/gtm_seq.c b/src/gtm/main/gtm_seq.c
index b8d999b200..a9f2dd1b0a 100644
--- a/src/gtm/main/gtm_seq.c
+++ b/src/gtm/main/gtm_seq.c
@@ -3,6 +3,11 @@
* gtm_seq.c
* Sequence handling on GTM
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -47,6 +52,9 @@ static int seq_add_seqinfo(GTM_SeqInfo *seqinfo);
static int seq_remove_seqinfo(GTM_SeqInfo *seqinfo);
static GTM_SequenceKey seq_copy_key(GTM_SequenceKey key);
static int seq_drop_with_dbkey(GTM_SequenceKey nsp);
+#ifdef XCP
+static GTM_Sequence get_rangemax(GTM_SeqInfo *seqinfo, GTM_Sequence range);
+#endif
/*
* Get the hash value given the sequence key
@@ -331,8 +339,14 @@ GTM_SeqOpen(GTM_SequenceKey seqkey,
*/
seqinfo->gs_cycle = cycle;
+#ifdef XCP
+ seqinfo->gs_max_lastvals = 0;
+ seqinfo->gs_lastval_count = 0;
+ seqinfo->gs_last_values = NULL;
+#else
/* Set the last value in case of a future restart */
seqinfo->gs_last_value = seqinfo->gs_init_value;
+#endif
if ((errcode = seq_add_seqinfo(seqinfo)))
{
@@ -386,14 +400,23 @@ int GTM_SeqAlter(GTM_SequenceKey seqkey,
{
/* Restart command has been used, reset the sequence */
seqinfo->gs_called = false;
+#ifdef XCP
+ seqinfo->gs_value = lastval;
+#else
seqinfo->gs_init_value = seqinfo->gs_last_value = lastval;
+#endif
}
+#ifdef XCP
+ if (seqinfo->gs_init_value != startval)
+ seqinfo->gs_init_value = startval;
+#else
else
{
/* Start has been used, reinitialize init value */
if (seqinfo->gs_init_value != startval)
seqinfo->gs_init_value = seqinfo->gs_last_value = startval;
}
+#endif
/* Remove the old key with the old name */
GTM_RWLockRelease(&seqinfo->gs_lock);
@@ -433,7 +456,14 @@ GTM_SeqRestore(GTM_SequenceKey seqkey,
seqinfo->gs_min_value = minval;
seqinfo->gs_max_value = maxval;
+#ifdef XCP
+ seqinfo->gs_init_value = startval;
+ seqinfo->gs_max_lastvals = 0;
+ seqinfo->gs_lastval_count = 0;
+ seqinfo->gs_last_values = NULL;
+#else
seqinfo->gs_init_value = seqinfo->gs_last_value = startval;
+#endif
seqinfo->gs_value = curval;
/*
@@ -561,7 +591,7 @@ seq_drop_with_dbkey(GTM_SequenceKey nsp)
/* Sequence is not is busy state, it can be deleted safely */
bucket->shb_list = gtm_list_delete_cell(bucket->shb_list, cell, prev);
- elog(LOG, "Sequence %s was deleted from GTM",
+ elog(DEBUG1, "Sequence %s was deleted from GTM",
curr_seqinfo->gs_key->gsk_key);
deleted = true;
@@ -625,7 +655,17 @@ GTM_SeqRename(GTM_SequenceKey seqkey, GTM_SequenceKey newseqkey)
newseqinfo->gs_cycle = seqinfo->gs_cycle;
newseqinfo->gs_state = seqinfo->gs_state;
+#ifdef XCP
+ newseqinfo->gs_max_lastvals = seqinfo->gs_max_lastvals;
+ newseqinfo->gs_lastval_count = seqinfo->gs_lastval_count;
+ newseqinfo->gs_last_values = (GTM_SeqLastVal *)
+ MemoryContextAlloc(TopMostMemoryContext,
+ newseqinfo->gs_max_lastvals * sizeof(GTM_SeqLastVal));
+ memcpy(newseqinfo->gs_last_values, seqinfo->gs_last_values,
+ newseqinfo->gs_max_lastvals * sizeof(GTM_SeqLastVal));
+#else
newseqinfo->gs_last_value = seqinfo->gs_last_value;
+#endif
/* Add the copy to the list */
if ((errcode = seq_add_seqinfo(newseqinfo))) /* a lock is taken here for the new sequence */
@@ -648,6 +688,333 @@ GTM_SeqRename(GTM_SequenceKey seqkey, GTM_SequenceKey newseqkey)
return errcode;
}
+#ifdef XCP
+/*
+ * Get current value for the sequence without incrementing it
+ */
+void
+GTM_SeqGetCurrent(GTM_SequenceKey seqkey, char *coord_name,
+ int coord_procid, GTM_Sequence *result)
+{
+ GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey);
+ int i;
+ bool found = false;
+
+ elog(DEBUG1, "Look up last value of Sequence %s in session %s:%d",
+ seqkey->gsk_key, coord_name, coord_procid);
+
+ if (seqinfo == NULL)
+ {
+ ereport(ERROR,
+ (EINVAL,
+ errmsg("sequence \"%s\" does not exist", seqkey->gsk_key)));
+ return;
+ }
+
+ GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_READ);
+
+ for (i = 0; i < seqinfo->gs_lastval_count; i++)
+ {
+ if (strcmp(seqinfo->gs_last_values[i].gs_coord_name, coord_name) == 0 &&
+ seqinfo->gs_last_values[i].gs_coord_procid == coord_procid)
+ {
+ *result = seqinfo->gs_last_values[i].gs_last_value;
+ found = true;
+ break;
+ }
+ }
+
+ GTM_RWLockRelease(&seqinfo->gs_lock);
+ seq_release_seqinfo(seqinfo);
+ if (!found)
+ ereport(ERROR,
+ (ERANGE,
+ errmsg("currval of sequence \"%s\" is not yet defined in this session",
+ seqkey->gsk_key)));
+
+}
+
+
+/*
+ * Store the sequence value as last for the specified distributed session
+ */
+static void
+seq_set_lastval(GTM_SeqInfo *seqinfo, char *coord_name,
+ int coord_procid, GTM_Sequence newval)
+{
+ GTM_SeqLastVal *lastval;
+ int i;
+
+ /* Cannot assign a value to an undefined session (no coordinator name/procid) */
+ if (coord_name == NULL || coord_procid == 0)
+ return;
+
+ elog(DEBUG1, "Remember last value of Sequence %s in session %s:%d",
+ seqinfo->gs_key->gsk_key, coord_name, coord_procid);
+
+ /*
+ * If a last value is already defined for this session, update it
+ */
+ for (i = 0; i < seqinfo->gs_lastval_count; i++)
+ {
+ if (strcmp(seqinfo->gs_last_values[i].gs_coord_name, coord_name) == 0 &&
+ seqinfo->gs_last_values[i].gs_coord_procid == coord_procid)
+ {
+ seqinfo->gs_last_values[i].gs_last_value = newval;
+ return;
+ }
+ }
+
+ /* Not found, add new entry for the distributed session */
+ if (seqinfo->gs_lastval_count == seqinfo->gs_max_lastvals)
+ {
+ /* Need more room */
+#define INIT_LASTVALS 16
+
+ if (seqinfo->gs_max_lastvals == 0)
+ {
+ /* No values at all, palloc memory block */
+ MemoryContext oldContext;
+ oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+ seqinfo->gs_last_values = (GTM_SeqLastVal *)
+ palloc(INIT_LASTVALS * sizeof(GTM_SeqLastVal));
+ seqinfo->gs_max_lastvals = INIT_LASTVALS;
+ MemoryContextSwitchTo(oldContext);
+ }
+ else
+ {
+ /* Increase existing array */
+ int newsize = seqinfo->gs_max_lastvals * 2;
+ seqinfo->gs_last_values = (GTM_SeqLastVal *)
+ repalloc(seqinfo->gs_last_values,
+ newsize * sizeof(GTM_SeqLastVal));
+ seqinfo->gs_max_lastvals = newsize;
+ }
+ }
+
+ /* Populate new entry */
+ lastval = &seqinfo->gs_last_values[seqinfo->gs_lastval_count++];
+ memcpy(lastval->gs_coord_name, coord_name, strlen(coord_name) + 1);
+ lastval->gs_coord_procid = coord_procid;
+ lastval->gs_last_value = newval;
+}
+
+
+/*
+ * Set values for the sequence
+ */
+int
+GTM_SeqSetVal(GTM_SequenceKey seqkey, char *coord_name,
+ int coord_procid, GTM_Sequence nextval, bool iscalled)
+{
+ GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey);
+
+ if (seqinfo == NULL)
+ {
+ ereport(LOG,
+ (EINVAL,
+ errmsg("The sequence with the given key does not exist")));
+
+ return EINVAL;
+ }
+
+ GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE);
+
+ seqinfo->gs_value = nextval;
+ seqinfo->gs_called = iscalled;
+
+ /* If sequence is not called, update last value for the session */
+ if (!iscalled)
+ seq_set_lastval(seqinfo, coord_name, coord_procid, nextval);
+
+ /* Remove the old key with the old name */
+ GTM_RWLockRelease(&seqinfo->gs_lock);
+ seq_release_seqinfo(seqinfo);
+
+ return 0;
+}
+
+/*
+ * Get next value for the sequence
+ */
+int
+GTM_SeqGetNext(GTM_SequenceKey seqkey, char *coord_name,
+ int coord_procid, GTM_Sequence range,
+ GTM_Sequence *result, GTM_Sequence *rangemax)
+{
+ GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey);
+
+ if (seqinfo == NULL)
+ {
+ ereport(LOG,
+ (EINVAL,
+ errmsg("The sequence with the given key does not exist")));
+ return EINVAL;
+ }
+
+ GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE);
+
+ /*
+ * If the sequence is called for the first time return the current value.
+ * It should be already initialized.
+ */
+ if (!SEQ_IS_CALLED(seqinfo))
+ {
+ *result = seqinfo->gs_value;
+ seqinfo->gs_called = true;
+ }
+ else
+ {
+ if (SEQ_IS_ASCENDING(seqinfo))
+ {
+ /*
+ * Check if the sequence is about to wrap-around. If the sequence
+ * does not support wrap-around, throw an error.
+ * Beware overflow!
+ */
+ if (seqinfo->gs_max_value - seqinfo->gs_increment_by
+ >= seqinfo->gs_value)
+ {
+ int newval = seqinfo->gs_value + seqinfo->gs_increment_by;
+ *result = seqinfo->gs_value = newval;
+ }
+ else if (SEQ_IS_CYCLE(seqinfo))
+ *result = seqinfo->gs_value = seqinfo->gs_min_value;
+ else
+ {
+ GTM_RWLockRelease(&seqinfo->gs_lock);
+ seq_release_seqinfo(seqinfo);
+ ereport(LOG,
+ (ERANGE,
+ errmsg("Sequence reached maximum value")));
+ return ERANGE;
+ }
+ }
+ else
+ {
+ /*
+ * Check if the sequence is about to wrap-around. If the sequence
+ * does not support wrap-around, throw an error.
+ * Beware overflow!
+ *
+ * Note: The gs_increment_by is a signed integer and is negative for
+ * descending sequences. So we don't need special handling below
+ */
+ if (seqinfo->gs_min_value - seqinfo->gs_increment_by
+ <= seqinfo->gs_value)
+ {
+ int newval = seqinfo->gs_value + seqinfo->gs_increment_by;
+ *result = seqinfo->gs_value = newval;
+ }
+ else if (SEQ_IS_CYCLE(seqinfo))
+ *result = seqinfo->gs_value = seqinfo->gs_max_value;
+ else
+ {
+ GTM_RWLockRelease(&seqinfo->gs_lock);
+ seq_release_seqinfo(seqinfo);
+ ereport(LOG,
+ (ERANGE,
+ errmsg("Sequence reached maximum value")));
+ return ERANGE;
+ }
+ }
+ }
+ /* if range is specified calculate valid max value for this range */
+ if (range > 1)
+ *rangemax = get_rangemax(seqinfo, range);
+ else
+ *rangemax = *result;
+ /*
+ * lastval has to be set to the rangemax obtained above because
+ * values up to it will be consumed by this nextval caller and
+ * the next caller should get values starting above this
+ * lastval. Same reasoning for gs_value, but we still return
+ * result as the first calculated gs_value above to form the
+ * local starting seed at the caller. This will go up to the
+ * rangemax value before contacting GTM again.
+ */
+ seq_set_lastval(seqinfo, coord_name, coord_procid, *rangemax);
+ seqinfo->gs_value = *rangemax;
+ GTM_RWLockRelease(&seqinfo->gs_lock);
+ seq_release_seqinfo(seqinfo);
+ return 0;
+}
+
+/*
+ * Given a sequence and the requested range for its values, calculate
+ * the legitimate maximum permissible value for this range. In
+ * particular we need to be careful about overflow and underflow for
+ * min and max types of sequences.
+ */
+static GTM_Sequence
+get_rangemax(GTM_SeqInfo *seqinfo, GTM_Sequence range)
+{
+ GTM_Sequence rangemax = seqinfo->gs_value;
+
+ /*
+ * Deduct 1 from range because the currval has been accounted
+ * for already before this call has been made
+ */
+ range--;
+ if (SEQ_IS_ASCENDING(seqinfo))
+ {
+ /*
+ * Check if the sequence will overflow because of the range
+ * request. If yes, cap it at close to or equal to max value
+ */
+ while (range != 0 &&
+ (seqinfo->gs_max_value - seqinfo->gs_increment_by >=
+ rangemax))
+ {
+ rangemax += seqinfo->gs_increment_by;
+ range--;
+ }
+ }
+ else
+ {
+ /*
+ * Check if the sequence will underflow because of the range
+ * request. If yes, cap it at close to or equal to min value
+ *
+ * Note: The gs_increment_by is a signed integer and is negative for
+ * descending sequences. So we don't need special handling below
+ */
+ while (range != 0 &&
+ (seqinfo->gs_min_value - seqinfo->gs_increment_by <=
+ rangemax))
+ {
+ rangemax += seqinfo->gs_increment_by;
+ range--;
+ }
+ }
+ return rangemax;
+}
+#else
+/*
+ * Get current value for the sequence without incrementing it
+ */
+GTM_Sequence
+GTM_SeqGetCurrent(GTM_SequenceKey seqkey)
+{
+ GTM_SeqInfo *seqinfo = seq_find_seqinfo(seqkey);
+ GTM_Sequence value;
+
+ if (seqinfo == NULL)
+ {
+ ereport(LOG,
+ (EINVAL,
+ errmsg("The sequence with the given key does not exist")));
+ return InvalidSequenceValue;
+ }
+
+ GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE);
+
+ value = seqinfo->gs_last_value;
+
+ GTM_RWLockRelease(&seqinfo->gs_lock);
+ seq_release_seqinfo(seqinfo);
+ return value;
+}
/*
* Set values for the sequence
@@ -769,6 +1136,7 @@ GTM_SeqGetNext(GTM_SequenceKey seqkey)
seq_release_seqinfo(seqinfo);
return value;
}
+#endif
/*
* Reset the sequence
@@ -787,7 +1155,11 @@ GTM_SeqReset(GTM_SequenceKey seqkey)
}
GTM_RWLockAcquire(&seqinfo->gs_lock, GTM_LOCKMODE_WRITE);
+#ifdef XCP
+ seqinfo->gs_value = seqinfo->gs_init_value;
+#else
seqinfo->gs_value = seqinfo->gs_last_value = seqinfo->gs_init_value;
+#endif
GTM_RWLockRelease(&seqinfo->gs_lock);
seq_release_seqinfo(seqinfo);
@@ -856,7 +1228,7 @@ ProcessSequenceInitCommand(Port *myport, StringInfo message, bool is_backup)
MemoryContextSwitchTo(oldContext);
- elog(LOG, "Opening sequence %s", seqkey.gsk_key);
+ elog(DEBUG1, "Opening sequence %s", seqkey.gsk_key);
pq_getmsgend(message);
@@ -869,7 +1241,7 @@ ProcessSequenceInitCommand(Port *myport, StringInfo message, bool is_backup)
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling open_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
+ elog(DEBUG1, "calling open_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
retry:
rc = bkup_open_sequence(GetMyThreadInfo->thr_conn->standby,
@@ -887,8 +1259,12 @@ ProcessSequenceInitCommand(Port *myport, StringInfo message, bool is_backup)
if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "open_sequence() returns rc %d.", rc);
+ elog(DEBUG1, "open_sequence() returns rc %d.", rc);
}
+#ifdef XCP
+ /* Save control file with new seq info */
+ SaveControlInfo();
+#endif
/*
* Send a SUCCESS message back to the client
*/
@@ -961,7 +1337,7 @@ ProcessSequenceAlterCommand(Port *myport, StringInfo message, bool is_backup)
*/
oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
- elog(LOG, "Altering sequence key %s", seqkey.gsk_key);
+ elog(DEBUG1, "Altering sequence key %s", seqkey.gsk_key);
if ((errcode = GTM_SeqAlter(&seqkey, increment, minval, maxval, startval, lastval, cycle, is_restart)))
ereport(ERROR,
@@ -981,7 +1357,7 @@ ProcessSequenceAlterCommand(Port *myport, StringInfo message, bool is_backup)
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling alter_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
+ elog(DEBUG1, "calling alter_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
retry:
rc = bkup_alter_sequence(GetMyThreadInfo->thr_conn->standby,
@@ -1001,8 +1377,12 @@ ProcessSequenceAlterCommand(Port *myport, StringInfo message, bool is_backup)
if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "alter_sequence() returns rc %d.", rc);
+ elog(DEBUG1, "alter_sequence() returns rc %d.", rc);
}
+#ifdef XCP
+ /* Save control file info */
+ SaveControlInfo();
+#endif
pq_beginmessage(&buf, 'S');
pq_sendint(&buf, SEQUENCE_ALTER_RESULT, 4);
if (myport->remote_type == GTM_NODE_GTM_PROXY)
@@ -1034,9 +1414,9 @@ void
ProcessSequenceListCommand(Port *myport, StringInfo message)
{
StringInfoData buf;
- int seq_count = 0;
- MemoryContext oldContext;
- GTM_SeqInfo *seq_list[1024]; /* FIXME: make it expandable. */
+ int seq_count;
+ int seq_maxcount;
+ GTM_SeqInfo **seq_list;
int i;
if (Recovery_IsStandby())
@@ -1044,39 +1424,57 @@ ProcessSequenceListCommand(Port *myport, StringInfo message)
(EPERM,
errmsg("Operation not permitted under the standby mode.")));
- memset(seq_list, 0, sizeof(GTM_SeqInfo *) * 1024);
-
- /*
- * We must use the TopMostMemoryContext because the sequence information is
- * not bound to a thread and can outlive any of the thread specific
- * contextes.
- */
- oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+ seq_count = 0;
+ seq_maxcount = 1024;
+ seq_list = (GTM_SeqInfo **) palloc(seq_maxcount * sizeof(GTM_SeqInfo *));;
/*
* Store pointers to all GTM_SeqInfo in the hash buckets into an array.
*/
+ for (i = 0 ; i < SEQ_HASH_TABLE_SIZE ; i++)
{
GTM_SeqInfoHashBucket *b;
gtm_ListCell *elem;
- for (i = 0 ; i < SEQ_HASH_TABLE_SIZE ; i++)
- {
- b = &GTMSequences[i];
+ b = &GTMSequences[i];
- GTM_RWLockAcquire(&b->shb_lock, GTM_LOCKMODE_READ);
+ GTM_RWLockAcquire(&b->shb_lock, GTM_LOCKMODE_READ);
- gtm_foreach(elem, b->shb_list)
+ gtm_foreach(elem, b->shb_list)
+ {
+ /* Allocate larger array if required */
+ if (seq_count == seq_maxcount)
{
- seq_list[seq_count] = (GTM_SeqInfo *) gtm_lfirst(elem);
- seq_count++;
+ int newcount;
+ GTM_SeqInfo **newlist;
+
+ newcount = 2 * seq_maxcount;
+ newlist = (GTM_SeqInfo **) repalloc(seq_list, newcount * sizeof(GTM_SeqInfo *));
+ /*
+ * If the allocation failed, try to get less. It is unlikely to happen, but
+ * let's be safe.
+ */
+ while (newlist == NULL)
+ {
+ newcount = seq_maxcount + (newcount - seq_maxcount) / 2 - 1;
+ if (newcount <= seq_maxcount)
+ {
+ /* give up */
+ ereport(ERROR,
+ (ERANGE,
+ errmsg("Can not list all the sequences")));
+ }
+ newlist = (GTM_SeqInfo **) repalloc(seq_list, newcount * sizeof(GTM_SeqInfo *));
+ }
+ seq_maxcount = newcount;
+ seq_list = newlist;
}
-
- GTM_RWLockRelease(&b->shb_lock);
+ seq_list[seq_count] = (GTM_SeqInfo *) gtm_lfirst(elem);
+ seq_count++;
}
- }
- MemoryContextSwitchTo(oldContext);
+ GTM_RWLockRelease(&b->shb_lock);
+ }
pq_getmsgend(message);
@@ -1093,27 +1491,38 @@ ProcessSequenceListCommand(Port *myport, StringInfo message)
/* Send a number of sequences */
pq_sendint(&buf, seq_count, 4);
- for (i = 0 ; i < seq_count ; i++)
+ /*
+ * Send sequences from the array
+ */
{
- char *seq_buf;
- size_t seq_buflen;
-
- seq_buflen = gtm_get_sequence_size(seq_list[i]);
- seq_buf = (char *)malloc(seq_buflen);
+ /*
+ * TODO set initial size big enough to fit any sequence, and avoid
+ * reallocations.
+ */
+ size_t seq_maxlen = 256;
+ char *seq_buf = (char *) palloc(seq_maxlen);
- gtm_serialize_sequence(seq_list[i], seq_buf, seq_buflen);
+ for (i = 0 ; i < seq_count ; i++)
+ {
+ size_t seq_buflen = gtm_get_sequence_size(seq_list[i]);
+ if (seq_buflen > seq_maxlen)
+ {
+ seq_maxlen = seq_buflen;
+ seq_buf = (char *)repalloc(seq_buf, seq_maxlen);
+ }
- elog(LOG, "seq_buflen = %ld", seq_buflen);
+ gtm_serialize_sequence(seq_list[i], seq_buf, seq_buflen);
- pq_sendint(&buf, seq_buflen, 4);
- pq_sendbytes(&buf, seq_buf, seq_buflen);
+ elog(DEBUG1, "seq_buflen = %ld", seq_buflen);
- free(seq_buf);
+ pq_sendint(&buf, seq_buflen, 4);
+ pq_sendbytes(&buf, seq_buf, seq_buflen);
+ }
}
pq_endmessage(myport, &buf);
- elog(LOG, "ProcessSequenceListCommand() done.");
+ elog(DEBUG1, "ProcessSequenceListCommand() done.");
if (myport->remote_type != GTM_NODE_GTM_PROXY)
/* Don't flush to the backup because this does not change the internal status */
@@ -1122,6 +1531,85 @@ ProcessSequenceListCommand(Port *myport, StringInfo message)
/*
+ * Process MSG_SEQUENCE_GET_CURRENT message
+ */
+void
+ProcessSequenceGetCurrentCommand(Port *myport, StringInfo message)
+{
+ GTM_SequenceKeyData seqkey;
+ StringInfoData buf;
+ GTM_Sequence seqval;
+#ifdef XCP
+ uint32 coord_namelen;
+ char *coord_name;
+ uint32 coord_procid;
+#endif
+
+ seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen));
+ seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen);
+#ifdef XCP
+ coord_namelen = pq_getmsgint(message, sizeof(coord_namelen));
+ if (coord_namelen > 0)
+ coord_name = (char *)pq_getmsgbytes(message, coord_namelen);
+ else
+ coord_name = NULL;
+ coord_procid = pq_getmsgint(message, sizeof(coord_procid));
+
+ GTM_SeqGetCurrent(&seqkey, coord_name, coord_procid, &seqval);
+#else
+ seqval = GTM_SeqGetCurrent(&seqkey);
+ if (!SEQVAL_IS_VALID(seqval))
+ ereport(ERROR,
+ (ERANGE,
+ errmsg("Can not get current value of the sequence")));
+#endif
+
+ elog(DEBUG1, "Getting current value %ld for sequence %s", seqval, seqkey.gsk_key);
+
+ pq_beginmessage(&buf, 'S');
+ pq_sendint(&buf, SEQUENCE_GET_CURRENT_RESULT, 4);
+ if (myport->remote_type == GTM_NODE_GTM_PROXY)
+ {
+ GTM_ProxyMsgHeader proxyhdr;
+ proxyhdr.ph_conid = myport->conn_id;
+ pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+ }
+ pq_sendint(&buf, seqkey.gsk_keylen, 4);
+ pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen);
+ pq_sendbytes(&buf, (char *)&seqval, sizeof (GTM_Sequence));
+ pq_endmessage(myport, &buf);
+
+ if (myport->remote_type != GTM_NODE_GTM_PROXY)
+ /* Don't flush to the standby because this does not change the status */
+ pq_flush(myport);
+
+ /*
+ * I don't think backup is needed here. It does not change internal state.
+ * 27th Dec., 2011, K.Suzuki
+ */
+#if 0
+ if (GetMyThreadInfo->thr_conn->standby)
+ {
+ GTM_Sequence loc_seq;
+ GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
+ int count = 0;
+
+ elog(DEBUG1, "calling get_current() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
+
+retry:
+ loc_seq = get_current(GetMyThreadInfo->thr_conn->standby, &seqkey);
+
+ if (gtm_standby_check_communication_error(&count, oldconn))
+ goto retry;
+
+ elog(DEBUG1, "get_current() returns GTM_Sequence %ld.", loc_seq);
+ }
+#endif
+
+ /* FIXME: need to check errors */
+}
+
+/*
* Process MSG_SEQUENCE_GET_NEXT/MSG_BKUP_SEQUENCE_GET_NEXT message
*
* is_backup indicates the message is MSG_BKUP_SEQUENCE_GET_NEXT
@@ -1132,17 +1620,40 @@ ProcessSequenceGetNextCommand(Port *myport, StringInfo message, bool is_backup)
GTM_SequenceKeyData seqkey;
StringInfoData buf;
GTM_Sequence seqval;
+#ifdef XCP
+ GTM_Sequence range;
+ GTM_Sequence rangemax;
+ uint32 coord_namelen;
+ char *coord_name;
+ uint32 coord_procid;
+#endif
seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen));
seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen);
+#ifdef XCP
+ coord_namelen = pq_getmsgint(message, sizeof(coord_namelen));
+ if (coord_namelen > 0)
+ coord_name = (char *)pq_getmsgbytes(message, coord_namelen);
+ else
+ coord_name = NULL;
+ coord_procid = pq_getmsgint(message, sizeof(coord_procid));
+ memcpy(&range, pq_getmsgbytes(message, sizeof (GTM_Sequence)),
+ sizeof (GTM_Sequence));
+ if (GTM_SeqGetNext(&seqkey, coord_name, coord_procid, range,
+ &seqval, &rangemax))
+ ereport(ERROR,
+ (ERANGE,
+ errmsg("Can not get current value of the sequence")));
+#else
seqval = GTM_SeqGetNext(&seqkey);
if (!SEQVAL_IS_VALID(seqval))
ereport(ERROR,
(ERANGE,
errmsg("Can not get current value of the sequence")));
+#endif
- elog(LOG, "Getting next value %ld for sequence %s", seqval, seqkey.gsk_key);
+ elog(DEBUG1, "Getting next value %ld for sequence %s", seqval, seqkey.gsk_key);
if (!is_backup)
{
@@ -1153,10 +1664,16 @@ ProcessSequenceGetNextCommand(Port *myport, StringInfo message, bool is_backup)
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling get_next() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
+ elog(DEBUG1, "calling get_next() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
retry:
+#ifdef XCP
+ bkup_get_next(GetMyThreadInfo->thr_conn->standby, &seqkey,
+ coord_name, coord_procid,
+ range, &loc_seq, &rangemax);
+#else
loc_seq = bkup_get_next(GetMyThreadInfo->thr_conn->standby, &seqkey);
+#endif
if (gtm_standby_check_communication_error(&count, oldconn))
goto retry;
@@ -1165,8 +1682,12 @@ ProcessSequenceGetNextCommand(Port *myport, StringInfo message, bool is_backup)
if (Backup_synchronously &&(myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "get_next() returns GTM_Sequence %ld.", loc_seq);
+ elog(DEBUG1, "get_next() returns GTM_Sequence %ld.", loc_seq);
}
+#ifdef XCP
+ /* Save control file info */
+ SaveControlInfo();
+#endif
/* Respond to the client */
pq_beginmessage(&buf, 'S');
pq_sendint(&buf, SEQUENCE_GET_NEXT_RESULT, 4);
@@ -1179,6 +1700,9 @@ ProcessSequenceGetNextCommand(Port *myport, StringInfo message, bool is_backup)
pq_sendint(&buf, seqkey.gsk_keylen, 4);
pq_sendbytes(&buf, seqkey.gsk_key, seqkey.gsk_keylen);
pq_sendbytes(&buf, (char *)&seqval, sizeof (GTM_Sequence));
+#ifdef XCP
+ pq_sendbytes(&buf, (char *)&rangemax, sizeof (GTM_Sequence));
+#endif
pq_endmessage(myport, &buf);
if (myport->remote_type != GTM_NODE_GTM_PROXY)
@@ -1207,12 +1731,25 @@ ProcessSequenceSetValCommand(Port *myport, StringInfo message, bool is_backup)
StringInfoData buf;
bool iscalled;
int errcode;
+#ifdef XCP
+ uint32 coord_namelen;
+ char *coord_name;
+ uint32 coord_procid;
+#endif
/*
* Get the sequence key
*/
seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen));
seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen);
+#ifdef XCP
+ coord_namelen = pq_getmsgint(message, sizeof(coord_namelen));
+ if (coord_namelen > 0)
+ coord_name = (char *)pq_getmsgbytes(message, coord_namelen);
+ else
+ coord_name = NULL;
+ coord_procid = pq_getmsgint(message, sizeof(coord_procid));
+#endif
/* Read parameters to be set */
memcpy(&nextval, pq_getmsgbytes(message, sizeof (GTM_Sequence)),
@@ -1227,12 +1764,19 @@ ProcessSequenceSetValCommand(Port *myport, StringInfo message, bool is_backup)
*/
oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
- elog(LOG, "Setting new value %ld for sequence %s", nextval, seqkey.gsk_key);
+ elog(DEBUG1, "Setting new value %ld for sequence %s", nextval, seqkey.gsk_key);
+#ifdef XCP
+ if ((errcode = GTM_SeqSetVal(&seqkey, coord_name, coord_procid, nextval, iscalled)))
+ ereport(ERROR,
+ (errcode,
+ errmsg("Failed to set values of sequence")));
+#else
if ((errcode = GTM_SeqSetVal(&seqkey, nextval, iscalled)))
ereport(ERROR,
(errcode,
errmsg("Failed to set values of sequence")));
+#endif
MemoryContextSwitchTo(oldContext);
@@ -1247,13 +1791,22 @@ ProcessSequenceSetValCommand(Port *myport, StringInfo message, bool is_backup)
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling set_val() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
+ elog(DEBUG1, "calling set_val() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
retry:
+#ifdef XCP
+ rc = bkup_set_val(GetMyThreadInfo->thr_conn->standby,
+ &seqkey,
+ coord_name,
+ coord_procid,
+ nextval,
+ iscalled);
+#else
rc = bkup_set_val(GetMyThreadInfo->thr_conn->standby,
&seqkey,
nextval,
iscalled);
+#endif
if (gtm_standby_check_communication_error(&count, oldconn))
goto retry;
@@ -1262,8 +1815,12 @@ ProcessSequenceSetValCommand(Port *myport, StringInfo message, bool is_backup)
if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "set_val() returns rc %d.", rc);
+ elog(DEBUG1, "set_val() returns rc %d.", rc);
}
+#ifdef XCP
+ /* Save control file info */
+ SaveControlInfo();
+#endif
/* Respond to the client */
pq_beginmessage(&buf, 'S');
pq_sendint(&buf, SEQUENCE_SET_VAL_RESULT, 4);
@@ -1304,7 +1861,7 @@ ProcessSequenceResetCommand(Port *myport, StringInfo message, bool is_backup)
seqkey.gsk_keylen = pq_getmsgint(message, sizeof (seqkey.gsk_keylen));
seqkey.gsk_key = (char *)pq_getmsgbytes(message, seqkey.gsk_keylen);
- elog(LOG, "Resetting sequence %s", seqkey.gsk_key);
+ elog(DEBUG1, "Resetting sequence %s", seqkey.gsk_key);
if ((errcode = GTM_SeqReset(&seqkey)))
ereport(ERROR,
@@ -1320,7 +1877,7 @@ ProcessSequenceResetCommand(Port *myport, StringInfo message, bool is_backup)
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling reset_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
+ elog(DEBUG1, "calling reset_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
retry:
rc = bkup_reset_sequence(GetMyThreadInfo->thr_conn->standby, &seqkey);
@@ -1332,8 +1889,12 @@ ProcessSequenceResetCommand(Port *myport, StringInfo message, bool is_backup)
if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "reset_sequence() returns rc %d.", rc);
+ elog(DEBUG1, "reset_sequence() returns rc %d.", rc);
}
+#ifdef XCP
+ /* Save control file info */
+ SaveControlInfo();
+#endif
/* Respond to the client */
pq_beginmessage(&buf, 'S');
pq_sendint(&buf, SEQUENCE_RESET_RESULT, 4);
@@ -1376,7 +1937,7 @@ ProcessSequenceCloseCommand(Port *myport, StringInfo message, bool is_backup)
memcpy(&seqkey.gsk_type, pq_getmsgbytes(message, sizeof (GTM_SequenceKeyType)),
sizeof (GTM_SequenceKeyType));
- elog(LOG, "Closing sequence %s", seqkey.gsk_key);
+ elog(DEBUG1, "Closing sequence %s", seqkey.gsk_key);
if ((errcode = GTM_SeqClose(&seqkey)))
ereport(ERROR,
@@ -1392,7 +1953,7 @@ ProcessSequenceCloseCommand(Port *myport, StringInfo message, bool is_backup)
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling close_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
+ elog(DEBUG1, "calling close_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
retry:
rc = bkup_close_sequence(GetMyThreadInfo->thr_conn->standby, &seqkey);
@@ -1404,8 +1965,12 @@ ProcessSequenceCloseCommand(Port *myport, StringInfo message, bool is_backup)
if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "close_sequence() returns rc %d.", rc);
+ elog(DEBUG1, "close_sequence() returns rc %d.", rc);
}
+#ifdef XCP
+ /* Save control file info */
+ SaveControlInfo();
+#endif
/* Respond to the client */
pq_beginmessage(&buf, 'S');
pq_sendint(&buf, SEQUENCE_CLOSE_RESULT, 4);
@@ -1459,7 +2024,7 @@ ProcessSequenceRenameCommand(Port *myport, StringInfo message, bool is_backup)
*/
oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
- elog(LOG, "Renaming sequence %s to %s", seqkey.gsk_key, newseqkey.gsk_key);
+ elog(DEBUG1, "Renaming sequence %s to %s", seqkey.gsk_key, newseqkey.gsk_key);
if ((errcode = GTM_SeqRename(&seqkey, &newseqkey)))
ereport(ERROR,
@@ -1479,7 +2044,7 @@ ProcessSequenceRenameCommand(Port *myport, StringInfo message, bool is_backup)
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling rename_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
+ elog(DEBUG1, "calling rename_sequence() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
retry:
rc = bkup_rename_sequence(GetMyThreadInfo->thr_conn->standby, &seqkey, &newseqkey);
@@ -1491,8 +2056,12 @@ ProcessSequenceRenameCommand(Port *myport, StringInfo message, bool is_backup)
if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "rename_sequence() returns rc %d.", rc);
+ elog(DEBUG1, "rename_sequence() returns rc %d.", rc);
}
+#ifdef XCP
+ /* Save control file info */
+ SaveControlInfo();
+#endif
/* Send a SUCCESS message back to the client */
pq_beginmessage(&buf, 'S');
pq_sendint(&buf, SEQUENCE_RENAME_RESULT, 4);
@@ -1752,3 +2321,67 @@ GTM_RestoreSeqInfo(FILE *ctlf)
state, cycle, called);
}
}
+
+
+#ifdef XCP
+/*
+ * Remove all current values allocated for the specified session from all
+ * sequences.
+ */
+void
+GTM_CleanupSeqSession(char *coord_name, int coord_procid)
+{
+ int i;
+
+ elog(DEBUG1, "Clean up Sequences used in session %s:%d",
+ coord_name, coord_procid);
+
+ for (i = 0; i < SEQ_HASH_TABLE_SIZE; i++)
+ {
+ GTM_SeqInfoHashBucket *bucket = &GTMSequences[i];
+ gtm_ListCell *elem;
+ GTM_SeqInfo *curr_seqinfo;
+
+ GTM_RWLockAcquire(&bucket->shb_lock, GTM_LOCKMODE_READ);
+
+ gtm_foreach(elem, bucket->shb_list)
+ {
+ int j;
+ curr_seqinfo = (GTM_SeqInfo *) gtm_lfirst(elem);
+ GTM_RWLockAcquire(&curr_seqinfo->gs_lock, GTM_LOCKMODE_WRITE);
+ if (curr_seqinfo->gs_state != SEQ_STATE_ACTIVE)
+ {
+ GTM_RWLockRelease(&curr_seqinfo->gs_lock);
+ continue;
+ }
+
+ for (j = 0; j < curr_seqinfo->gs_lastval_count; j++)
+ {
+ GTM_SeqLastVal *lastval = &curr_seqinfo->gs_last_values[j];
+ if (strcmp(lastval->gs_coord_name, coord_name) == 0 &&
+ lastval->gs_coord_procid == coord_procid)
+ {
+ int newcount = --curr_seqinfo->gs_lastval_count;
+ elog(DEBUG1, "remove value of Sequence %s acquired for session %s:%d",
+ curr_seqinfo->gs_key->gsk_key, lastval->gs_coord_name,
+ lastval->gs_coord_procid);
+ if (j < newcount)
+ memcpy(lastval, &curr_seqinfo->gs_last_values[newcount],
+ sizeof(GTM_SeqLastVal));
+ if (curr_seqinfo->gs_lastval_count == 0)
+ {
+ elog(DEBUG1, "Sequence %s is not used, free curr values memory",
+ curr_seqinfo->gs_key->gsk_key);
+ curr_seqinfo->gs_max_lastvals = 0;
+ pfree(curr_seqinfo->gs_last_values);
+ curr_seqinfo->gs_last_values = NULL;
+ }
+ break;
+ }
+ }
+ GTM_RWLockRelease(&curr_seqinfo->gs_lock);
+ }
+ GTM_RWLockRelease(&bucket->shb_lock);
+ }
+}
+#endif
diff --git a/src/gtm/main/gtm_snap.c b/src/gtm/main/gtm_snap.c
index c8bf718cb9..3ecd0d6dcc 100644
--- a/src/gtm/main/gtm_snap.c
+++ b/src/gtm/main/gtm_snap.c
@@ -304,7 +304,7 @@ ProcessGetSnapshotCommand(Port *myport, StringInfo message, bool get_gxid)
(EPROTO,
errmsg("Message does not contain valid GXID")));
memcpy(&gxid, data, sizeof(gxid));
- elog(LOG, "Received transaction ID %d for snapshot obtention", gxid);
+ elog(INFO, "Received transaction ID %d for snapshot obtention", gxid);
txn = GTM_GXIDToHandle(gxid);
}
else
@@ -459,7 +459,7 @@ ProcessGetSnapshotCommandMulti(Port *myport, StringInfo message)
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
retry:
- elog(LOG, "calling snapshot_get_multi() for standby GTM %p.",
+ elog(DEBUG1, "calling snapshot_get_multi() for standby GTM %p.",
GetMyThreadInfo->thr_conn->standby);
_rc = snapshot_get_multi(GetMyThreadInfo->thr_conn->standby,
@@ -469,7 +469,7 @@ retry:
if (gtm_standby_check_communication_error(&count, oldconn))
goto retry;
- elog(LOG, "snapshot_get_multi() rc=%d done.", _rc);
+ elog(DEBUG1, "snapshot_get_multi() rc=%d done.", _rc);
}
#endif
diff --git a/src/gtm/main/gtm_standby.c b/src/gtm/main/gtm_standby.c
index c4baa7cf8f..741d5eaad2 100644
--- a/src/gtm/main/gtm_standby.c
+++ b/src/gtm/main/gtm_standby.c
@@ -18,15 +18,12 @@
#include "gtm/elog.h"
#include "gtm/gtm.h"
#include "gtm/gtm_c.h"
+#include "gtm/standby_utils.h"
#include "gtm/gtm_client.h"
#include "gtm/gtm_seq.h"
#include "gtm/gtm_serialize.h"
#include "gtm/gtm_utils.h"
-#include "gtm/libpq.h"
-#include "gtm/pqformat.h"
#include "gtm/register.h"
-#include "gtm/standby_utils.h"
-#include "gtm/stringinfo.h"
GTM_Conn *GTM_ActiveConn = NULL;
static char standbyHostName[NI_MAXHOST];
@@ -59,12 +56,12 @@ gtm_standby_start_startup(void)
int
gtm_standby_finish_startup(void)
{
- elog(LOG, "Closing a startup connection...");
+ elog(DEBUG1, "Closing a startup connection...");
GTMPQfinish(GTM_ActiveConn);
GTM_ActiveConn = NULL;
- elog(LOG, "A startup connection closed.");
+ elog(DEBUG1, "A startup connection closed.");
return 1;
}
@@ -76,36 +73,36 @@ gtm_standby_restore_next_gxid(void)
next_gxid = get_next_gxid(GTM_ActiveConn);
GTM_RestoreTxnInfo(NULL, next_gxid);
- elog(LOG, "Restoring the next GXID done.");
+ elog(DEBUG1, "Restoring the next GXID done.");
return 1;
}
int
gtm_standby_restore_sequence(void)
{
- GTM_SeqInfo *seq_list[1024];
+ GTM_SeqInfo *seq_list;
int num_seq;
int i;
/*
* Restore sequence data.
*/
- num_seq = get_sequence_list(GTM_ActiveConn, seq_list, 1024);
+ num_seq = get_sequence_list(GTM_ActiveConn, &seq_list);
for (i = 0; i < num_seq; i++)
{
- GTM_SeqRestore(seq_list[i]->gs_key,
- seq_list[i]->gs_increment_by,
- seq_list[i]->gs_min_value,
- seq_list[i]->gs_max_value,
- seq_list[i]->gs_init_value,
- seq_list[i]->gs_value,
- seq_list[i]->gs_state,
- seq_list[i]->gs_cycle,
- seq_list[i]->gs_called);
+ GTM_SeqRestore(seq_list[i].gs_key,
+ seq_list[i].gs_increment_by,
+ seq_list[i].gs_min_value,
+ seq_list[i].gs_max_value,
+ seq_list[i].gs_init_value,
+ seq_list[i].gs_value,
+ seq_list[i].gs_state,
+ seq_list[i].gs_cycle,
+ seq_list[i].gs_called);
}
- elog(LOG, "Restoring sequences done.");
+ elog(DEBUG1, "Restoring sequences done.");
return 1;
}
@@ -194,7 +191,7 @@ gtm_standby_restore_gxid(void)
GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
- elog(LOG, "Restoring %d gxid(s) done.", num_txn);
+ elog(DEBUG1, "Restoring %d gxid(s) done.", num_txn);
return 1;
}
@@ -222,7 +219,7 @@ gtm_standby_restore_node(void)
for (i = 0; i < num_node; i++)
{
- elog(LOG, "get_node_list: nodetype=%d, nodename=%s, datafolder=%s",
+ elog(DEBUG1, "get_node_list: nodetype=%d, nodename=%s, datafolder=%s",
data[i].type, data[i].nodename, data[i].datafolder);
if (Recovery_PGXCNodeRegister(data[i].type, data[i].nodename, data[i].port,
data[i].proxyname, data[i].status,
@@ -254,7 +251,7 @@ gtm_standby_register_self(const char *node_name, int port, const char *datadir)
{
int rc;
- elog(LOG, "Registering standby-GTM status...");
+ elog(DEBUG1, "Registering standby-GTM status...");
node_get_local_addr(GTM_ActiveConn, standbyHostName, sizeof(standbyNodeName), &rc);
if (rc != 0)
@@ -269,11 +266,11 @@ gtm_standby_register_self(const char *node_name, int port, const char *datadir)
standbyNodeName, standbyDataDir, NODE_DISCONNECTED);
if (rc < 0)
{
- elog(LOG, "Failed to register a standby-GTM status.");
+ elog(DEBUG1, "Failed to register a standby-GTM status.");
return 0;
}
- elog(LOG, "Registering standby-GTM done.");
+ elog(DEBUG1, "Registering standby-GTM done.");
return 1;
}
@@ -288,12 +285,12 @@ gtm_standby_activate_self(void)
{
int rc;
- elog(LOG, "Updating the standby-GTM status to \"CONNECTED\"...");
+ elog(DEBUG1, "Updating the standby-GTM status to \"CONNECTED\"...");
rc = node_unregister(GTM_ActiveConn, GTM_NODE_GTM, standbyNodeName);
if (rc < 0)
{
- elog(LOG, "Failed to unregister old standby-GTM status.");
+ elog(DEBUG1, "Failed to unregister old standby-GTM status.");
return 0;
}
@@ -302,11 +299,11 @@ gtm_standby_activate_self(void)
if (rc < 0)
{
- elog(LOG, "Failed to register a new standby-GTM status.");
+ elog(DEBUG1, "Failed to register a new standby-GTM status.");
return 0;
}
- elog(LOG, "Updating the standby-GTM status done.");
+ elog(DEBUG1, "Updating the standby-GTM status done.");
return 1;
}
@@ -329,7 +326,7 @@ find_standby_node_info(void)
for (i = 0 ; i < n ; i++)
{
- elog(LOG, "pgxcnode_find_by_type: nodename=%s, type=%d, ipaddress=%s, port=%d, status=%d",
+ elog(DEBUG1, "pgxcnode_find_by_type: nodename=%s, type=%d, ipaddress=%s, port=%d, status=%d",
node[i]->nodename,
node[i]->type,
node[i]->ipaddress,
@@ -378,11 +375,11 @@ gtm_standby_connect_to_standby_int(int *report_needed)
if (!n)
{
- elog(LOG, "Any GTM standby node not found in registered node(s).");
+ elog(DEBUG1, "Any GTM standby node not found in registered node(s).");
return NULL;
}
- elog(LOG, "GTM standby is active. Going to connect.");
+ elog(DEBUG1, "GTM standby is active. Going to connect.");
*report_needed = 1;
snprintf(conn_string, sizeof(conn_string),
@@ -393,11 +390,11 @@ gtm_standby_connect_to_standby_int(int *report_needed)
if ( !standby )
{
- elog(LOG, "Failed to establish a connection with GTM standby. - %p", n);
+ elog(DEBUG1, "Failed to establish a connection with GTM standby. - %p", n);
return NULL;
}
- elog(LOG, "Connection established with GTM standby. - %p", n);
+ elog(DEBUG1, "Connection established with GTM standby. - %p", n);
return standby;
}
@@ -427,13 +424,13 @@ gtm_standby_reconnect_to_standby(GTM_Conn *old_conn, int retry_max)
for (i = 0; i < retry_max; i++)
{
- elog(LOG, "gtm_standby_reconnect_to_standby(): going to re-connect. retry=%d", i);
+ elog(DEBUG1, "gtm_standby_reconnect_to_standby(): going to re-connect. retry=%d", i);
newconn = gtm_standby_connect_to_standby_int(&report);
if (newconn != NULL)
break;
- elog(LOG, "gtm_standby_reconnect_to_standby(): re-connect failed. retry=%d", i);
+ elog(DEBUG1, "gtm_standby_reconnect_to_standby(): re-connect failed. retry=%d", i);
}
return newconn;
@@ -465,7 +462,7 @@ gtm_standby_check_communication_error(int *retry_count, GTM_Conn *oldconn)
return true;
}
- elog(LOG, "communication error with standby.");
+ elog(DEBUG1, "communication error with standby.");
}
return false;
}
@@ -495,7 +492,7 @@ gtm_standby_finishActiveConn(void)
elog(DEBUG3, "Error in connection");
return;
}
- elog(LOG, "Connection established to the GTM active.");
+ elog(DEBUG1, "Connection established to the GTM active.");
/* Unregister self from Active-GTM */
node_unregister(GTM_ActiveConn, GTM_NODE_GTM, NodeName);
@@ -518,47 +515,3 @@ gtm_standby_connectToActiveGTM(void)
return PQconnectGTM(connect_string);
}
-
-void
-ProcessGTMBeginBackup(Port *myport, StringInfo message)
-{
- int ii;
- GTM_ThreadInfo *my_threadinfo;
- StringInfoData buf;
-
- pq_getmsgend(message);
- my_threadinfo = GetMyThreadInfo;
-
- for (ii = 0; ii < GTMThreads->gt_array_size; ii++)
- {
- if (GTMThreads->gt_threads[ii] && GTMThreads->gt_threads[ii] != my_threadinfo)
- GTM_RWLockAcquire(&GTMThreads->gt_threads[ii]->thr_lock, GTM_LOCKMODE_WRITE);
- }
- my_threadinfo->thr_status = GTM_THREAD_BACKUP;
- pq_beginmessage(&buf, 'S');
- pq_sendint(&buf, BEGIN_BACKUP_RESULT, 4);
- pq_endmessage(myport, &buf);
- pq_flush(myport);
-}
-
-void
-ProcessGTMEndBackup(Port *myport, StringInfo message)
-{
- int ii;
- GTM_ThreadInfo *my_threadinfo;
- StringInfoData buf;
-
- pq_getmsgend(message);
- my_threadinfo = GetMyThreadInfo;
-
- for (ii = 0; ii < GTMThreads->gt_array_size; ii++)
- {
- if (GTMThreads->gt_threads[ii] && GTMThreads->gt_threads[ii] != my_threadinfo)
- GTM_RWLockRelease(&GTMThreads->gt_threads[ii]->thr_lock);
- }
- my_threadinfo->thr_status = GTM_THREAD_RUNNING;
- pq_beginmessage(&buf, 'S');
- pq_sendint(&buf, END_BACKUP_RESULT, 4);
- pq_endmessage(myport, &buf);
- pq_flush(myport);
-}
diff --git a/src/gtm/main/gtm_thread.c b/src/gtm/main/gtm_thread.c
index d0329a8169..4612023ab8 100644
--- a/src/gtm/main/gtm_thread.c
+++ b/src/gtm/main/gtm_thread.c
@@ -262,7 +262,7 @@ GTM_ThreadCleanup(void *argp)
{
GTM_ThreadInfo *thrinfo = (GTM_ThreadInfo *)argp;
- elog(LOG, "Cleaning up thread state");
+ elog(DEBUG1, "Cleaning up thread state");
if (thrinfo->thr_status == GTM_THREAD_BACKUP)
{
@@ -280,7 +280,7 @@ GTM_ThreadCleanup(void *argp)
*/
if (thrinfo->thr_conn->standby)
{
- elog(LOG, "Closing a connection to the GTM standby.");
+ elog(DEBUG1, "Closing a connection to the GTM standby.");
GTMPQfinish(thrinfo->thr_conn->standby);
thrinfo->thr_conn->standby = NULL;
@@ -291,6 +291,14 @@ GTM_ThreadCleanup(void *argp)
*/
StreamClose(thrinfo->thr_conn->con_port->sock);
+ /* Free the node_name in the port */
+ if (thrinfo->thr_conn->con_port->node_name != NULL)
+ /*
+ * We don't have to reset the pointer to NULL here because ConnFree()
+ * frees this structure next.
+ */
+ pfree(thrinfo->thr_conn->con_port->node_name);
+
/* Free the port */
ConnFree(thrinfo->thr_conn->con_port);
thrinfo->thr_conn->con_port = NULL;
diff --git a/src/gtm/main/gtm_txn.c b/src/gtm/main/gtm_txn.c
index 900569e510..475a861458 100644
--- a/src/gtm/main/gtm_txn.c
+++ b/src/gtm/main/gtm_txn.c
@@ -3,6 +3,11 @@
* gtm_txn.c
* Transaction handling
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -39,6 +44,11 @@ static void init_GTM_TransactionInfo(GTM_TransactionInfo *gtm_txninfo,
GTM_IsolationLevel isolevel,
GTMProxy_ConnID connid,
bool readonly);
+static void clean_GTM_TransactionInfo(GTM_TransactionInfo *gtm_txninfo);
+
+#ifdef XCP
+GlobalTransactionId ControlXid; /* last one written to control file */
+#endif
GTM_Transactions GTMTransactions;
void
@@ -100,6 +110,10 @@ GTM_InitTxnManager(void)
GTMTransactions.gt_gtm_state = GTM_STARTING;
+#ifdef XCP
+ ControlXid = FirstNormalGlobalTransactionId;
+#endif
+
return;
}
@@ -155,7 +169,12 @@ GTM_GXIDToHandle(GlobalTransactionId gxid)
if (gtm_txninfo != NULL)
return gtm_txninfo->gti_handle;
else
+ {
+ ereport(WARNING,
+ (ERANGE, errmsg("No transaction handle for gxid: %d",
+ gxid)));
return InvalidTransactionHandle;
+ }
}
/*
@@ -259,21 +278,7 @@ GTM_RemoveTransInfoMulti(GTM_TransactionInfo *gtm_txninfo[], int txn_count)
/*
* Now mark the transaction as aborted and mark the structure as not-in-use
*/
- gtm_txninfo[ii]->gti_state = GTM_TXN_ABORTED;
- gtm_txninfo[ii]->gti_in_use = false;
- gtm_txninfo[ii]->gti_snapshot_set = false;
-
- /* Clean-up also structures that were used for prepared transactions */
- if (gtm_txninfo[ii]->gti_gid)
- {
- pfree(gtm_txninfo[ii]->gti_gid);
- gtm_txninfo[ii]->gti_gid = NULL;
- }
- if (gtm_txninfo[ii]->nodestring)
- {
- pfree(gtm_txninfo[ii]->nodestring);
- gtm_txninfo[ii]->nodestring = NULL;
- }
+ clean_GTM_TransactionInfo(gtm_txninfo[ii]);
}
GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
@@ -328,20 +333,7 @@ GTM_RemoveAllTransInfos(int backend_id)
/*
* Now mark the transaction as aborted and mark the structure as not-in-use
*/
- gtm_txninfo->gti_state = GTM_TXN_ABORTED;
- gtm_txninfo->gti_in_use = false;
- gtm_txninfo->gti_snapshot_set = false;
-
- if (gtm_txninfo->gti_gid)
- {
- pfree(gtm_txninfo->gti_gid);
- gtm_txninfo->gti_gid = NULL;
- }
- if (gtm_txninfo->nodestring)
- {
- pfree(gtm_txninfo->nodestring);
- gtm_txninfo->nodestring = NULL;
- }
+ clean_GTM_TransactionInfo(gtm_txninfo);
/* move to next cell in the list */
if (prev)
@@ -504,6 +496,9 @@ GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count)
GlobalTransactionId xid, start_xid = InvalidGlobalTransactionId;
GTM_TransactionInfo *gtm_txninfo = NULL;
int ii;
+#ifdef XCP
+ bool save_control = false;
+#endif
if (Recovery_IsStandby())
{
@@ -578,12 +573,29 @@ GTM_GetGlobalTransactionIdMulti(GTM_TransactionHandle handle[], int txn_count)
gtm_txninfo = GTM_HandleToTransactionInfo(handle[ii]);
Assert(gtm_txninfo);
- elog(LOG, "Assigning new transaction ID = %d", xid);
+ elog(INFO, "Assigning new transaction ID = %d", xid);
gtm_txninfo->gti_gxid = xid;
}
+#ifdef XCP
+ /* Periodically write the xid and sequence info out to the control file.
+ * Try and handle wrapping, too.
+ */
+ if (xid - ControlXid > CONTROL_INTERVAL || xid < ControlXid)
+ {
+ save_control = true;
+ ControlXid = xid;
+ }
+#endif
+
GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+#ifdef XCP
+ /* save control info when not holding the XidGenLock */
+ if (save_control)
+ SaveControlInfo();
+#endif
+
return start_xid;
}
@@ -659,6 +671,8 @@ GTM_BeginTransactionMulti(char *coord_name,
*/
oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+ GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE);
+
for (kk = 0; kk < txn_count; kk++)
{
int ii, jj, startslot;
@@ -667,8 +681,6 @@ GTM_BeginTransactionMulti(char *coord_name,
* We had no cached slots. Now find a free slot in the transation array
* and store the transaction info structure there
*/
- GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE);
-
startslot = GTMTransactions.gt_lastslot + 1;
if (startslot >= GTM_MAX_GLOBAL_TRANSACTIONS)
startslot = 0;
@@ -737,7 +749,7 @@ init_GTM_TransactionInfo(GTM_TransactionInfo *gtm_txninfo,
gtm_txninfo->gti_gxid = InvalidGlobalTransactionId;
gtm_txninfo->gti_xmin = InvalidGlobalTransactionId;
gtm_txninfo->gti_state = GTM_TXN_STARTING;
- gtm_txninfo->gti_coordname = pstrdup(coord_name);
+ gtm_txninfo->gti_coordname = (coord_name ? pstrdup(coord_name) : NULL);
gtm_txninfo->gti_isolevel = isolevel;
gtm_txninfo->gti_readonly = readonly;
@@ -753,6 +765,35 @@ init_GTM_TransactionInfo(GTM_TransactionInfo *gtm_txninfo,
}
+/*
+ * Clean up the TransactionInfo slot and pfree all the palloc'ed memory,
+ * except txid array of the snapshot, which is reused.
+ */
+static void
+clean_GTM_TransactionInfo(GTM_TransactionInfo *gtm_txninfo)
+{
+ gtm_txninfo->gti_state = GTM_TXN_ABORTED;
+ gtm_txninfo->gti_in_use = false;
+ gtm_txninfo->gti_snapshot_set = false;
+
+ if (gtm_txninfo->gti_coordname)
+ {
+ pfree(gtm_txninfo->gti_coordname);
+ gtm_txninfo->gti_coordname = NULL;
+ }
+ if (gtm_txninfo->gti_gid)
+ {
+ pfree(gtm_txninfo->gti_gid);
+ gtm_txninfo->gti_gid = NULL;
+ }
+ if (gtm_txninfo->nodestring)
+ {
+ pfree(gtm_txninfo->nodestring);
+ gtm_txninfo->nodestring = NULL;
+ }
+}
+
+
void
GTM_BkupBeginTransactionMulti(char *coord_name,
GTM_TransactionHandle *txn,
@@ -1169,7 +1210,7 @@ ProcessBeginTransactionGetGXIDCommand(Port *myport, StringInfo message)
MemoryContextSwitchTo(oldContext);
- elog(LOG, "Sending transaction id %u", gxid);
+ elog(DEBUG1, "Sending transaction id %u", gxid);
/* Backup first */
if (GetMyThreadInfo->thr_conn->standby)
@@ -1177,7 +1218,7 @@ ProcessBeginTransactionGetGXIDCommand(Port *myport, StringInfo message)
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling begin_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
+ elog(DEBUG1, "calling begin_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
retry:
bkup_begin_transaction_gxid(GetMyThreadInfo->thr_conn->standby,
@@ -1229,7 +1270,13 @@ GTM_BkupBeginTransactionGetGXIDMulti(char *coord_name,
int ii;
MemoryContext oldContext;
- oldContext = MemoryContextSwitchTo(TopMemoryContext);
+#ifdef XCP
+ bool save_control = false;
+ GlobalTransactionId xid;
+#endif
+
+ oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+ //XCPTODO check oldContext = MemoryContextSwitchTo(TopMemoryContext);
GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE);
for (ii = 0; ii < txn_count; ii++)
@@ -1254,10 +1301,29 @@ GTM_BkupBeginTransactionGetGXIDMulti(char *coord_name,
if (!GlobalTransactionIdIsValid(GTMTransactions.gt_nextXid)) /* Handle wrap around too */
GTMTransactions.gt_nextXid = FirstNormalGlobalTransactionId;
GTMTransactions.gt_open_transactions = gtm_lappend(GTMTransactions.gt_open_transactions, gtm_txninfo);
+ xid = GTMTransactions.gt_nextXid;
}
+#ifdef XCP
+ /* Periodically write the xid and sequence info out to the control file.
+ * Try and handle wrapping, too.
+ */
+ if (xid - ControlXid > CONTROL_INTERVAL || xid < ControlXid)
+ {
+ save_control = true;
+ ControlXid = xid;
+ }
+#endif
+
GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+
+#ifdef XCP
+ /* save control info when not holding the XidGenLock */
+ if (save_control)
+ SaveControlInfo();
+#endif
+
MemoryContextSwitchTo(oldContext);
}
@@ -1365,7 +1431,7 @@ ProcessBeginTransactionGetGXIDAutovacuumCommand(Port *myport, StringInfo message
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling begin_transaction_autovacuum() for standby GTM %p.",
+ elog(DEBUG1, "calling begin_transaction_autovacuum() for standby GTM %p.",
GetMyThreadInfo->thr_conn->standby);
retry:
@@ -1379,7 +1445,7 @@ ProcessBeginTransactionGetGXIDAutovacuumCommand(Port *myport, StringInfo message
if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "begin_transaction_autovacuum() GXID=%d done.", _gxid);
+ elog(DEBUG1, "begin_transaction_autovacuum() GXID=%d done.", _gxid);
}
/* Respond to the client */
pq_beginmessage(&buf, 'S');
@@ -1439,9 +1505,9 @@ ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message)
/*
* Start a new transaction
*
- * XXX Port should contain Coordinator name - replace "" with that
+ * XXX Port should contain Coordinator name - replace NULL with that
*/
- count = GTM_BeginTransactionMulti("", txn_isolation_level, txn_read_only, txn_connid,
+ count = GTM_BeginTransactionMulti(NULL, txn_isolation_level, txn_read_only, txn_connid,
txn_count, txn);
if (count != txn_count)
ereport(ERROR,
@@ -1459,11 +1525,11 @@ ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message)
/* GXID has been received, now it's time to get a GTM timestamp */
timestamp = GTM_TimestampGetCurrent();
- end_gxid = start_gxid + txn_count;
+ end_gxid = start_gxid + (txn_count - 1);
if (end_gxid < start_gxid)
end_gxid += FirstNormalGlobalTransactionId;
- elog(LOG, "Sending transaction ids from %u to %u", start_gxid, end_gxid);
+ elog(DEBUG1, "Sending transaction ids from %u to %u", start_gxid, end_gxid);
/* Backup first */
if (GetMyThreadInfo->thr_conn->standby)
@@ -1472,7 +1538,7 @@ ProcessBeginTransactionGetGXIDCommandMulti(Port *myport, StringInfo message)
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling begin_transaction_multi() for standby GTM %p.",
+ elog(DEBUG1, "calling begin_transaction_multi() for standby GTM %p.",
GetMyThreadInfo->thr_conn->standby);
retry:
@@ -1491,7 +1557,7 @@ retry:
if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "begin_transaction_multi() rc=%d done.", _rc);
+ elog(DEBUG1, "begin_transaction_multi() rc=%d done.", _rc);
}
/* Respond to the client */
pq_beginmessage(&buf, 'S');
@@ -1589,7 +1655,7 @@ ProcessCommitTransactionCommand(Port *myport, StringInfo message, bool is_backup
oldContext = MemoryContextSwitchTo(TopMemoryContext);
- elog(LOG, "Committing transaction id %u", gxid);
+ elog(DEBUG1, "Committing transaction id %u", gxid);
/*
* Commit the transaction
@@ -1609,7 +1675,7 @@ ProcessCommitTransactionCommand(Port *myport, StringInfo message, bool is_backup
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling commit_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
+ elog(DEBUG1, "calling commit_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
retry:
_rc = bkup_commit_transaction(GetMyThreadInfo->thr_conn->standby, gxid);
@@ -1621,7 +1687,7 @@ ProcessCommitTransactionCommand(Port *myport, StringInfo message, bool is_backup
if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "commit_transaction() rc=%d done.", _rc);
+ elog(DEBUG1, "commit_transaction() rc=%d done.", _rc);
}
pq_beginmessage(&buf, 'S');
@@ -1697,7 +1763,7 @@ ProcessCommitPreparedTransactionCommand(Port *myport, StringInfo message, bool i
oldContext = MemoryContextSwitchTo(TopMemoryContext);
- elog(LOG, "Committing: prepared id %u and commit prepared id %u ", gxid[0], gxid[1]);
+ elog(DEBUG1, "Committing: prepared id %u and commit prepared id %u ", gxid[0], gxid[1]);
/*
* Commit the prepared transaction.
@@ -1715,7 +1781,7 @@ ProcessCommitPreparedTransactionCommand(Port *myport, StringInfo message, bool i
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling commit_prepared_transaction() for standby GTM %p.",
+ elog(DEBUG1, "calling commit_prepared_transaction() for standby GTM %p.",
GetMyThreadInfo->thr_conn->standby);
retry:
@@ -1729,7 +1795,7 @@ ProcessCommitPreparedTransactionCommand(Port *myport, StringInfo message, bool i
if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "commit_prepared_transaction() rc=%d done.", _rc);
+ elog(DEBUG1, "commit_prepared_transaction() rc=%d done.", _rc);
}
/* Respond to the client */
pq_beginmessage(&buf, 'S');
@@ -1858,7 +1924,7 @@ ProcessGetGIDDataTransactionCommand(Port *myport, StringInfo message)
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling get_gid_data() for standby GTM %p.",
+ elog(DEBUG1, "calling get_gid_data() for standby GTM %p.",
GetMyThreadInfo->thr_conn->standby);
retry:
@@ -1872,7 +1938,7 @@ retry:
if (gtm_standby_check_communication_error(&count, oldconn))
goto retry;
- elog(LOG, "get_gid_data() rc=%d done.", _rc);
+ elog(DEBUG1, "get_gid_data() rc=%d done.", _rc);
}
#endif
@@ -1909,7 +1975,7 @@ ProcessGXIDListCommand(Port *myport, StringInfo message)
actlen = gtm_serialize_transactions(&GTMTransactions, data, estlen);
- elog(LOG, "gtm_serialize_transactions: estlen=%ld, actlen=%ld", estlen, actlen);
+ elog(DEBUG1, "gtm_serialize_transactions: estlen=%ld, actlen=%ld", estlen, actlen);
GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
@@ -1935,10 +2001,10 @@ ProcessGXIDListCommand(Port *myport, StringInfo message)
if (myport->remote_type != GTM_NODE_GTM_PROXY)
{
pq_flush(myport);
- elog(LOG, "pq_flush()");
+ elog(DEBUG1, "pq_flush()");
}
- elog(LOG, "ProcessGXIDListCommand() ok. %ld bytes sent. len=%d", actlen, buf.len);
+ elog(DEBUG1, "ProcessGXIDListCommand() ok. %ld bytes sent. len=%d", actlen, buf.len);
free(data);
return;
@@ -1986,7 +2052,7 @@ ProcessRollbackTransactionCommand(Port *myport, StringInfo message, bool is_back
oldContext = MemoryContextSwitchTo(TopMemoryContext);
- elog(LOG, "Cancelling transaction id %u", gxid);
+ elog(DEBUG1, "Cancelling transaction id %u", gxid);
/*
* Commit the transaction
@@ -2003,7 +2069,7 @@ ProcessRollbackTransactionCommand(Port *myport, StringInfo message, bool is_back
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling abort_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
+ elog(DEBUG1, "calling abort_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
retry:
bkup_abort_transaction(GetMyThreadInfo->thr_conn->standby, gxid);
@@ -2015,7 +2081,7 @@ ProcessRollbackTransactionCommand(Port *myport, StringInfo message, bool is_back
if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "abort_transaction() GXID=%d done.", gxid);
+ elog(DEBUG1, "abort_transaction() GXID=%d done.", gxid);
}
/* Respond to the client */
pq_beginmessage(&buf, 'S');
@@ -2108,7 +2174,7 @@ ProcessCommitTransactionCommandMulti(Port *myport, StringInfo message, bool is_b
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling commit_transaction_multi() for standby GTM %p.",
+ elog(DEBUG1, "calling commit_transaction_multi() for standby GTM %p.",
GetMyThreadInfo->thr_conn->standby);
retry:
@@ -2120,7 +2186,7 @@ ProcessCommitTransactionCommandMulti(Port *myport, StringInfo message, bool is_b
if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "commit_transaction_multi() rc=%d done.", _rc);
+ elog(DEBUG1, "commit_transaction_multi() rc=%d done.", _rc);
}
/* Respond to the client */
pq_beginmessage(&buf, 'S');
@@ -2211,7 +2277,7 @@ ProcessRollbackTransactionCommandMulti(Port *myport, StringInfo message, bool is
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling abort_transaction_multi() for standby GTM %p.",
+ elog(DEBUG1, "calling abort_transaction_multi() for standby GTM %p.",
GetMyThreadInfo->thr_conn->standby);
retry:
@@ -2224,7 +2290,7 @@ ProcessRollbackTransactionCommandMulti(Port *myport, StringInfo message, bool is
if (Backup_synchronously &&(myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "abort_transaction_multi() rc=%d done.", _rc);
+ elog(DEBUG1, "abort_transaction_multi() rc=%d done.", _rc);
}
/* Respond to the client */
pq_beginmessage(&buf, 'S');
@@ -2325,7 +2391,7 @@ ProcessStartPreparedTransactionCommand(Port *myport, StringInfo message, bool is
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling start_prepared_transaction() for standby GTM %p.",
+ elog(DEBUG1, "calling start_prepared_transaction() for standby GTM %p.",
GetMyThreadInfo->thr_conn->standby);
retry:
@@ -2340,7 +2406,7 @@ ProcessStartPreparedTransactionCommand(Port *myport, StringInfo message, bool is
if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "start_prepared_transaction() rc=%d done.", _rc);
+ elog(DEBUG1, "start_prepared_transaction() rc=%d done.", _rc);
}
pq_beginmessage(&buf, 'S');
pq_sendint(&buf, TXN_START_PREPARED_RESULT, 4);
@@ -2412,7 +2478,7 @@ ProcessPrepareTransactionCommand(Port *myport, StringInfo message, bool is_backu
MemoryContextSwitchTo(oldContext);
- elog(LOG, "Preparing transaction id %u", gxid);
+ elog(DEBUG1, "Preparing transaction id %u", gxid);
if (!is_backup)
{
@@ -2422,7 +2488,7 @@ ProcessPrepareTransactionCommand(Port *myport, StringInfo message, bool is_backu
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "calling prepare_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
+ elog(DEBUG1, "calling prepare_transaction() for standby GTM %p.", GetMyThreadInfo->thr_conn->standby);
retry:
bkup_prepare_transaction(GetMyThreadInfo->thr_conn->standby, gxid);
@@ -2434,7 +2500,7 @@ ProcessPrepareTransactionCommand(Port *myport, StringInfo message, bool is_backu
if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
- elog(LOG, "prepare_transaction() GXID=%d done.", gxid);
+ elog(DEBUG1, "prepare_transaction() GXID=%d done.", gxid);
}
/* Respond to the client */
pq_beginmessage(&buf, 'S');
@@ -2591,7 +2657,15 @@ GTM_RestoreTxnInfo(FILE *ctlf, GlobalTransactionId next_gxid)
(!GlobalTransactionIdIsValid(next_gxid)))
next_gxid = InitialGXIDValue_Default;
else if (!GlobalTransactionIdIsValid(next_gxid))
+#ifdef XCP
+ {
+ /* Add in extra amount in case we had not gracefully stopped */
+ next_gxid = saved_gxid + CONTROL_INTERVAL;
+ ControlXid = next_gxid;
+ }
+#else
next_gxid = saved_gxid;
+#endif
}
else if (!GlobalTransactionIdIsValid(next_gxid))
next_gxid = InitialGXIDValue_Default;
diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c
index e03738725c..2205beb055 100644
--- a/src/gtm/main/main.c
+++ b/src/gtm/main/main.c
@@ -2,6 +2,11 @@
*
* main.c
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -72,6 +77,11 @@ int tcp_keepalives_count;
char *error_reporter;
char *status_reader;
bool isStartUp;
+#ifdef XCP
+GTM_MutexLock control_lock;
+char GTMControlFileTmp[GTM_MAX_PATH];
+#define GTM_CONTROL_FILE_TMP "gtm.control.tmp"
+#endif
/* If this is GTM or not */
/*
@@ -182,6 +192,9 @@ BaseInit()
CreateDataDirLockFile();
sprintf(GTMControlFile, "%s/%s", GTMDataDir, GTM_CONTROL_FILE);
+#ifdef XCP
+ sprintf(GTMControlFileTmp, "%s/%s", GTMDataDir, GTM_CONTROL_FILE_TMP);
+#endif
if (GTMLogFile == NULL)
{
GTMLogFile = (char *) malloc(GTM_MAX_PATH);
@@ -206,6 +219,9 @@ BaseInit()
fflush(stdout);
fflush(stderr);
}
+#ifdef XCP
+ GTM_MutexLockInit(&control_lock);
+#endif
}
static void
@@ -264,7 +280,6 @@ help(const char *progname)
printf(_(" -D directory GTM working directory\n"));
printf(_(" -l filename GTM server log file name \n"));
printf(_(" -c show server status, then exit\n"));
- printf(_(" -V, --version output version information, then exit\n"));
printf(_(" --help show this help, then exit\n"));
printf(_("\n"));
printf(_("Options for Standby mode:\n"));
@@ -281,6 +296,38 @@ gtm_status()
exit(0);
}
+#ifdef XCP
+/*
+ * Save control file info
+ */
+void
+SaveControlInfo(void)
+{
+	FILE *ctlf;
+
+	GTM_MutexLockAcquire(&control_lock);
+
+	ctlf = fopen(GTMControlFileTmp, "w");
+
+	if (ctlf == NULL)
+	{
+		fprintf(stderr, "Failed to create/open the control file\n");
+		/* fopen() failed, so there is no stream to fclose() here */
+		GTM_MutexLockRelease(&control_lock);
+		return;
+	}
+
+	GTM_SaveTxnInfo(ctlf);
+	GTM_SaveSeqInfo(ctlf);
+	fclose(ctlf);
+
+	remove(GTMControlFile);
+	rename(GTMControlFileTmp, GTMControlFile);
+
+	GTM_MutexLockRelease(&control_lock);
+}
+#endif
+
int
main(int argc, char *argv[])
{
@@ -339,11 +386,6 @@ main(int argc, char *argv[])
help(argv[0]);
exit(0);
}
- if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
- {
- puts("gtm (Postgres-XC) " PGXC_VERSION);
- exit(0);
- }
}
ListenAddresses = strdup(GTM_DEFAULT_HOSTNAME);
@@ -591,11 +633,19 @@ main(int argc, char *argv[])
}
else
{
+#ifdef XCP
+ GTM_MutexLockAcquire(&control_lock);
+#endif
+
ctlf = fopen(GTMControlFile, "r");
GTM_RestoreTxnInfo(ctlf, next_gxid);
GTM_RestoreSeqInfo(ctlf);
if (ctlf)
fclose(ctlf);
+
+#ifdef XCP
+ GTM_MutexLockRelease(&control_lock);
+#endif
}
if (Recovery_IsStandby())
@@ -692,13 +742,13 @@ main(int argc, char *argv[])
elog(ERROR, "Failed to update the standby-GTM status as \"CONNECTED\".");
exit(1);
}
- elog(LOG, "Updating the standby-GTM status as \"CONNECTED\" succeeded.");
+ elog(DEBUG1, "Updating the standby-GTM status as \"CONNECTED\" succeeded.");
if (!gtm_standby_finish_startup())
{
elog(ERROR, "Failed to close the initial connection to the active-GTM.");
exit(1);
}
- elog(LOG, "Startup connection with the active-GTM closed.");
+ elog(DEBUG1, "Startup connection with the active-GTM closed.");
}
/*
@@ -782,7 +832,9 @@ ServerLoop(void)
if (GTMAbortPending)
{
+#ifndef XCP
FILE *ctlf;
+#endif
/*
* XXX We should do a clean shutdown here. For the time being, just
@@ -797,6 +849,9 @@ ServerLoop(void)
*/
GTM_SetShuttingDown();
+#ifdef XCP
+ SaveControlInfo();
+#else
ctlf = fopen(GTMControlFile, "w");
if (ctlf == NULL)
{
@@ -806,6 +861,7 @@ ServerLoop(void)
GTM_SaveTxnInfo(ctlf);
GTM_SaveSeqInfo(ctlf);
+#endif
#if 0
/*
@@ -821,7 +877,9 @@ ServerLoop(void)
}
#endif
+#ifndef XCP
fclose(ctlf);
+#endif
exit(1);
}
@@ -1002,6 +1060,7 @@ GTM_ThreadMain(void *argp)
pq_getmsgend(&inBuf);
GTM_RegisterPGXCNode(thrinfo->thr_conn->con_port, sp.sp_node_name);
+
thrinfo->thr_conn->con_port->remote_type = sp.sp_remotetype;
thrinfo->thr_conn->con_port->is_postmaster = sp.sp_ispostmaster;
}
@@ -1213,6 +1272,9 @@ ProcessCommand(Port *myport, StringInfo input_message)
case MSG_NODE_UNREGISTER:
case MSG_BKUP_NODE_UNREGISTER:
case MSG_NODE_LIST:
+#ifdef XCP
+ case MSG_REGISTER_SESSION:
+#endif
ProcessPGXCNodeCommand(myport, mtype, input_message);
break;
case MSG_BEGIN_BACKUP:
@@ -1260,6 +1322,7 @@ ProcessCommand(Port *myport, StringInfo input_message)
case MSG_SEQUENCE_INIT:
case MSG_BKUP_SEQUENCE_INIT:
+ case MSG_SEQUENCE_GET_CURRENT:
case MSG_SEQUENCE_GET_NEXT:
case MSG_BKUP_SEQUENCE_GET_NEXT:
case MSG_SEQUENCE_GET_LAST:
@@ -1449,6 +1512,12 @@ ProcessPGXCNodeCommand(Port *myport, GTM_MessageType mtype, StringInfo message)
ProcessPGXCNodeList(myport, message);
break;
+#ifdef XCP
+ case MSG_REGISTER_SESSION:
+ ProcessPGXCRegisterSession(myport, message);
+ break;
+#endif
+
default:
Assert(0); /* Shouldn't come here.. keep compiler quiet */
}
@@ -1483,6 +1552,7 @@ ProcessTransactionCommand(Port *myport, GTM_MessageType mtype, StringInfo messag
case MSG_BKUP_TXN_BEGIN_GETGXID:
ProcessBkupBeginTransactionGetGXIDCommand(myport, message);
+ break;
case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM:
ProcessBeginTransactionGetGXIDAutovacuumCommand(myport, message);
@@ -1627,6 +1697,10 @@ ProcessSequenceCommand(Port *myport, GTM_MessageType mtype, StringInfo message)
ProcessSequenceAlterCommand(myport, message, true);
break;
+ case MSG_SEQUENCE_GET_CURRENT:
+ ProcessSequenceGetCurrentCommand(myport, message);
+ break;
+
case MSG_SEQUENCE_GET_NEXT:
ProcessSequenceGetNextCommand(myport, message, false);
break;
@@ -1692,13 +1766,15 @@ ProcessQueryCommand(Port *myport, GTM_MessageType mtype, StringInfo message)
}
+
static void
GTM_RegisterPGXCNode(Port *myport, char *PGXCNodeName)
{
elog(DEBUG3, "Registering coordinator with name %s", PGXCNodeName);
- myport->node_name = strdup(PGXCNodeName);
+ myport->node_name = pstrdup(PGXCNodeName);
}
+
/*
* Validate the proposed data directory
*/
diff --git a/src/gtm/path/Makefile b/src/gtm/path/Makefile
index 4e6a159b19..186b3b1876 100644
--- a/src/gtm/path/Makefile
+++ b/src/gtm/path/Makefile
@@ -11,18 +11,21 @@ top_builddir=../../..
include $(top_builddir)/src/Makefile.global
subdir=src/gtm/path
-include $(top_srcdir)/src/backend/common.mk
+NAME=gtmpath
+SO_MAJOR_VERSION= 1
+SO_MINOR_VERSION= 0
-OBJS = path.o
+OBJS=path.o
-all: libgtmpath.a
+all:all-lib
-libgtmpath.a: $(OBJS)
- $(AR) $(AROPT) $@ $^
+include $(top_srcdir)/src/Makefile.shlib
clean:
- rm -f $(OBJS) libgtmpath.a
+ rm -f $(OBJS)
+ rm -f libgtmpath.a libgtmpath.so libgtmpath.so.1 libgtmpath.so.1.0
distclean: clean
maintainer-clean: distclean
+
diff --git a/src/gtm/proxy/Makefile b/src/gtm/proxy/Makefile
index f6b0b1e335..c1ab2018d8 100644
--- a/src/gtm/proxy/Makefile
+++ b/src/gtm/proxy/Makefile
@@ -15,38 +15,23 @@ ifneq ($(PORTNAME), win32)
override CFLAGS += $(PTHREAD_CFLAGS)
endif
-SUBDIRS = $(top_builddir)/src/gtm/client \
- $(top_builddir)/src/gtm/common \
- $(top_builddir)/src/gtm/config \
- $(top_builddir)/src/gtm/libpq \
- $(top_builddir)/src/gtm/path \
- $(top_builddir)/src/gtm/recovery
+OBJS=proxy_main.o proxy_thread.o proxy_utils.o gtm_proxy_opt.o
-include $(top_srcdir)/src/backend/common.mk
+OTHERS= ../libpq/libpqcomm.a ../path/libgtmpath.a ../recovery/libgtmrecovery.a ../client/libgtmclient.a ../common/libgtm.a
-OBJS = $(SUBDIROBJS) \
- $(top_builddir)/src/port/libpgport_srv.a \
- $(top_builddir)/src/port/pgsleep.o \
- proxy_main.o proxy_thread.o proxy_utils.o gtm_proxy_opt.o
+LDFLAGS=-L$(top_builddir)/common -L$(top_builddir)/libpq
-LIBS += $(PTHREAD_LIBS)
+LIBS=-lpthread
-all: gtm_proxy
+gtm_proxy:$(OBJS)
+ $(CC) $(CFLAGS) $(LDFLAGS) $(LIBS) $^ $(OTHERS) ../../port/libpgport_srv.a -o gtm_proxy
-gtm_proxy: $(OBJS) | submake-libpgport
- $(CC) $(CFLAGS) $(LDFLAGS) $(LIBS) $(call expand_subsys,$^) -o $@
+all:gtm_proxy
-install: all installdirs
- $(INSTALL_PROGRAM) gtm_proxy$(X) '$(DESTDIR)$(bindir)/gtm_proxy$(X)'
- $(INSTALL_DATA) $(srcdir)/gtm_proxy.conf.sample '$(DESTDIR)$(datadir)/gtm_proxy.conf.sample'
+clean:
+ rm -f $(OBJS)
+ rm -f gtm_proxy
-installdirs:
- $(MKDIR_P) '$(DESTDIR)$(bindir)' '$(DESTDIR)$(datadir)'
+distclean: clean
-uninstall:
- rm -f '$(DESTDIR)$(bindir)/gtm_proxy$(X)' '$(DESTDIR)$(datadir)/gtm_proxy.conf.sample'
-
-clean distclean maintainer-clean:
- rm -f gtm_proxy$(X) $(OBJS)
-
-$(top_builddir)/src/port/libpgport_srv.a: | submake-libpgport
+maintainer-clean: distclean
diff --git a/src/gtm/proxy/gtm_proxy_opt.c b/src/gtm/proxy/gtm_proxy_opt.c
index 58e2beb7c2..96be9b56fc 100644
--- a/src/gtm/proxy/gtm_proxy_opt.c
+++ b/src/gtm/proxy/gtm_proxy_opt.c
@@ -158,7 +158,7 @@ struct config_int ConfigureNamesInt[] =
0
},
&GTMProxyPortNumber,
- 6666, 0, INT_MAX,
+ 0, 0, INT_MAX,
0, NULL
},
{
@@ -169,7 +169,7 @@ struct config_int ConfigureNamesInt[] =
0
},
&GTMServerPortNumber,
- 6666, 0, INT_MAX,
+ 0, 0, INT_MAX,
0, NULL
},
{
diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c
index 0285f26d40..b3f4649cb4 100644
--- a/src/gtm/proxy/proxy_main.c
+++ b/src/gtm/proxy/proxy_main.c
@@ -2,6 +2,11 @@
*
* proxy_main.c
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -177,7 +182,7 @@ static void SetDataDir(void);
static void ChangeToDataDir(void);
static void checkDataDir(void);
static void DeleteLockFile(const char *filename);
-static void RegisterProxy(bool is_reconnect, bool is_retry);
+static void RegisterProxy(bool is_reconnect);
static void UnregisterProxy(void);
static GTM_Conn *ConnectGTM(void);
static void ReleaseCmdBackup(GTMProxy_CommandInfo *cmdinfo);
@@ -260,7 +265,7 @@ BaseInit()
Recovery_SaveRegisterFileName(GTMProxyDataDir);
/* Register Proxy on GTM */
- RegisterProxy(false, false);
+ RegisterProxy(false);
DebugFileOpen();
@@ -395,7 +400,7 @@ GTMProxy_SigleHandler(int signal)
{
int ii;
- elog(LOG, "Received signal %d\n", signal);
+ elog(DEBUG1, "Received signal %d\n", signal);
switch (signal)
{
@@ -414,11 +419,11 @@ GTMProxy_SigleHandler(int signal)
* The mask is set to block signals. They're blocked until all the
* threads reconnect to the new GTM.
*/
- elog(LOG, "Accepted SIGUSR1\n");
+ elog(DEBUG1, "Accepted SIGUSR1\n");
if (MyThreadID != TopMostThreadID)
{
- elog(LOG, "Not on main thread, proxy the signal to the main thread.");
+ elog(DEBUG1, "Not on main thread, proxy the signal to the main thread.");
pthread_kill(TopMostThreadID, SIGUSR1);
return;
@@ -428,18 +433,18 @@ GTMProxy_SigleHandler(int signal)
*/
PG_SETMASK(&BlockSig);
- elog(LOG, "I'm the main thread. Accepted SIGUSR1.");
+ elog(DEBUG1, "I'm the main thread. Accepted SIGUSR1.");
/*
* Set Reconnect Info
*/
if (!ReadyToReconnect)
{
- elog(LOG, "SIGUSR1 detected, but not ready to handle this. Ignored");
+ elog(DEBUG1, "SIGUSR1 detected, but not ready to handle this. Ignored");
PG_SETMASK(&UnBlockSig);
return;
}
- elog(LOG, "SIGUSR1 detected. Set reconnect info for each worker thread");
+ elog(DEBUG1, "SIGUSR1 detected. Set reconnect info for each worker thread");
if (GTMProxy_ReadReconnectInfo() != 0)
{
/* Failed to read reconnect information from reconnect data file */
@@ -477,7 +482,7 @@ GTMProxy_SigleHandler(int signal)
for (ii = 0; ii < GTMProxyWorkerThreads; ii++)
pthread_kill(Proxy_ThreadInfo[ii]->thr_id, SIGUSR2);
- elog(LOG, "SIGUSR2 issued to all the worker threads.");
+ elog(DEBUG1, "SIGUSR2 issued to all the worker threads.");
PG_SETMASK(&UnBlockSig);
/*
@@ -490,13 +495,13 @@ GTMProxy_SigleHandler(int signal)
/* Main thread has nothing to do with this signal and should not receive this. */
PG_SETMASK(&BlockSig);
- elog(LOG, "Detected SIGUSR2, thread:%ld", MyThreadID);
+ elog(DEBUG1, "Detected SIGUSR2, thread:%ld", MyThreadID);
if (MyThreadID == TopMostThreadID)
{
/* This should not be reached. Just in case. */
- elog(LOG, "SIGUSR2 received by the main thread. Ignoring.");
+ elog(DEBUG1, "SIGUSR2 received by the main thread. Ignoring.");
PG_SETMASK(&UnBlockSig);
return;
@@ -549,7 +554,6 @@ help(const char *progname)
printf(_(" -n count Number of worker threads\n"));
printf(_(" -D directory GTM proxy working directory\n"));
printf(_(" -l filename GTM proxy log file name \n"));
- printf(_(" -V, --version output version information, then exit\n"));
printf(_(" --help show this help, then exit\n"));
}
@@ -590,11 +594,6 @@ main(int argc, char *argv[])
help(argv[0]);
exit(0);
}
- if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
- {
- puts("gtm_proxy (Postgres-XC) " PGXC_VERSION);
- exit(0);
- }
}
ListenAddresses = strdup(GTM_PROXY_DEFAULT_HOSTNAME);
@@ -961,9 +960,9 @@ ServerLoop(void)
* the resource but this may not happen so many times.
*/
- elog(LOG, "Main Thread reconnecting to new GTM.");
- RegisterProxy(TRUE, false);
- elog(LOG, "Reconnected.");
+ elog(DEBUG1, "Main Thread reconnecting to new GTM.");
+ RegisterProxy(TRUE);
+ elog(DEBUG1, "Reconnected.");
/* If it is done, then release the lock for worker threads. */
GTM_RWLockRelease(&ReconnectControlLock);
@@ -1015,7 +1014,7 @@ ServerLoop(void)
{
if (errno != EINTR && errno != EWOULDBLOCK)
{
- ereport(LOG,
+ ereport(DEBUG1,
(EACCES,
errmsg("select() failed in postmaster: %m")));
return STATUS_ERROR;
@@ -1094,6 +1093,7 @@ GTMProxy_ThreadMain(void *argp)
int ii, nrfds;
char gtm_connect_string[1024];
int first_turn = TRUE; /* Used only to set longjmp target at the first turn of thread loop */
+ GTMProxy_CommandData cmd_data = {};
elog(DEBUG3, "Starting the connection helper thread");
@@ -1348,7 +1348,6 @@ setjmp_again:
/*
* Correction of pending works.
*/
- thrinfo->thr_processed_commands = gtm_NIL;
for (ii = 0; ii < MSG_TYPE_COUNT; ii++)
{
thrinfo->thr_pending_commands[ii] = gtm_NIL;
@@ -1387,7 +1386,8 @@ setjmp_again:
* to the remove_list and cleanup at the end of this round of
* cleanup.
*/
- GTMProxy_HandleDisconnect(thrinfo->thr_conn, thrinfo->thr_gtm_conn);
+ GTMProxy_CommandPending(thrinfo->thr_conn,
+ MSG_BACKEND_DISCONNECT, cmd_data);
continue;
}
@@ -1421,7 +1421,8 @@ setjmp_again:
* to the server to quickly find the backend connection
* while processing proxied messages.
*/
- GTMProxy_HandleDisconnect(thrinfo->thr_conn, thrinfo->thr_gtm_conn);
+ GTMProxy_CommandPending(thrinfo->thr_conn,
+ MSG_BACKEND_DISCONNECT, cmd_data);
break;
default:
/*
@@ -1565,6 +1566,9 @@ ProcessCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
{
case MSG_NODE_REGISTER:
case MSG_NODE_UNREGISTER:
+#ifdef XCP
+ case MSG_REGISTER_SESSION:
+#endif
ProcessPGXCNodeCommand(conninfo, gtm_conn, mtype, input_message);
break;
@@ -1587,6 +1591,7 @@ ProcessCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
break;
case MSG_SEQUENCE_INIT:
+ case MSG_SEQUENCE_GET_CURRENT:
case MSG_SEQUENCE_GET_NEXT:
case MSG_SEQUENCE_GET_LAST:
case MSG_SEQUENCE_SET_VAL:
@@ -1882,8 +1887,12 @@ ProcessResponse(GTMProxy_ThreadInfo *thrinfo, GTMProxy_CommandInfo *cmdinfo,
case MSG_TXN_GET_GID_DATA:
case MSG_NODE_REGISTER:
case MSG_NODE_UNREGISTER:
+#ifdef XCP
+ case MSG_REGISTER_SESSION:
+#endif
case MSG_SNAPSHOT_GXID_GET:
case MSG_SEQUENCE_INIT:
+ case MSG_SEQUENCE_GET_CURRENT:
case MSG_SEQUENCE_GET_NEXT:
case MSG_SEQUENCE_GET_LAST:
case MSG_SEQUENCE_SET_VAL:
@@ -2160,6 +2169,16 @@ ProcessPGXCNodeCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
/* Unregistering has to be saved in a place where it can be seen by all the threads */
oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+#ifdef XCP
+ /*
+ * Unregister node. Ignore any error here, otherwise we enter
+ * endless loop trying to execute command again and again
+ */
+ Recovery_PGXCNodeUnregister(cmd_data.cd_reg.type,
+ cmd_data.cd_reg.nodename,
+ false,
+ conninfo->con_port->sock);
+#else
/* Unregister Node also on Proxy */
if (Recovery_PGXCNodeUnregister(cmd_data.cd_reg.type,
cmd_data.cd_reg.nodename,
@@ -2170,12 +2189,17 @@ ProcessPGXCNodeCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn,
(EINVAL,
errmsg("Failed to Unregister node")));
}
-
+#endif
MemoryContextSwitchTo(oldContext);
GTMProxy_ProxyPGXCNodeCommand(conninfo, gtm_conn, mtype, cmd_data);
break;
}
+#ifdef XCP
+ case MSG_REGISTER_SESSION:
+ GTMProxy_ProxyCommand(conninfo, gtm_conn, mtype, message);
+ break;
+#endif
default:
Assert(0); /* Shouldn't come here.. Keep compiler quiet */
}
@@ -2439,6 +2463,10 @@ GTMProxy_CommandPending(GTMProxy_ConnectionInfo *conninfo, GTM_MessageType mtype
GTMProxy_CommandInfo *cmdinfo;
GTMProxy_ThreadInfo *thrinfo = GetMyThreadInfo;
+#ifdef XCP
+ MemoryContext oldContext = MemoryContextSwitchTo(TopMemoryContext);
+#endif
+
/*
* Add the message to the pending command list
*/
@@ -2449,6 +2477,10 @@ GTMProxy_CommandPending(GTMProxy_ConnectionInfo *conninfo, GTM_MessageType mtype
cmdinfo->ci_data = cmd_data;
thrinfo->thr_pending_commands[mtype] = gtm_lappend(thrinfo->thr_pending_commands[mtype], cmdinfo);
+#ifdef XCP
+ MemoryContextSwitchTo(oldContext);
+#endif
+
return;
}
@@ -2529,7 +2561,8 @@ GTMProxy_HandleDisconnect(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn)
/* Mark node as disconnected if it is a postmaster backend */
Recovery_PGXCNodeDisconnect(conninfo->con_port);
- /* Start the message. */
+ proxyhdr.ph_conid = conninfo->con_id;
+ /* Start the message. */
if (gtmpqPutMsgStart('C', true, gtm_conn) ||
gtmpqPutnchar((char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader), gtm_conn) ||
gtmpqPutInt(MSG_BACKEND_DISCONNECT, sizeof (GTM_MessageType), gtm_conn) ||
@@ -2559,8 +2592,6 @@ GTMProxy_HandleDisconnect(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn)
ConnFree(conninfo->con_port);
conninfo->con_port = NULL;
- proxyhdr.ph_conid = conninfo->con_id;
-
return;
}
@@ -2580,7 +2611,9 @@ GTMProxy_ProcessPendingCommands(GTMProxy_ThreadInfo *thrinfo)
{
int res_index = 0;
- if (gtm_list_length(thrinfo->thr_pending_commands[ii]) == 0)
+ /* We process backend disconnects last! */
+ if (ii == MSG_BACKEND_DISCONNECT ||
+ gtm_list_length(thrinfo->thr_pending_commands[ii]) == 0)
continue;
/*
@@ -2755,7 +2788,15 @@ GTMProxy_ProcessPendingCommands(GTMProxy_ThreadInfo *thrinfo)
default:
elog(ERROR, "This message type (%d) can not be grouped together", ii);
}
-
+ }
+ /* Process backend disconnect messages now */
+ gtm_foreach (elem, thrinfo->thr_pending_commands[MSG_BACKEND_DISCONNECT])
+ {
+ ereport(COMMERROR,
+ (EPROTO,
+ errmsg("cleaning up client disconnection")));
+ cmdinfo = (GTMProxy_CommandInfo *)gtm_lfirst(elem);
+ GTMProxy_HandleDisconnect(cmdinfo->ci_conn, gtm_conn);
}
}
@@ -3151,7 +3192,7 @@ failed:
* NewGTMServerPortNumber.
*/
static void
-RegisterProxy(bool is_reconnect, bool is_retry)
+RegisterProxy(bool is_reconnect)
{
GTM_PGXCNodeType type = GTM_NODE_GTM_PROXY;
GTM_PGXCNodePort port = (GTM_PGXCNodePort) GTMProxyPortNumber;
@@ -3236,14 +3277,7 @@ RegisterProxy(bool is_reconnect, bool is_retry)
return;
failed:
- if (!is_retry)
- {
- elog(NOTICE, "could not register Proxy on GTM. Trying to unregister myself and then retry.");
- UnregisterProxy();
- return RegisterProxy(is_reconnect, true);
- }
- else
- elog(ERROR, "can not register Proxy on GTM");
+ elog(ERROR, "can not register Proxy on GTM");
}
static GTM_Conn*
@@ -3299,26 +3333,18 @@ workerThreadReconnectToGTM(void)
PG_SETMASK(&UnBlockSig);
/* Disconnect the current connection and re-connect to the new GTM */
- /*
- * Because some error is expected, it is harmful to close GTM connection in
- * normal way. Instead, just close the socket to save kernel resource.
- *
- * This is error recovery and we should be very careful what structure is
- * available.
- */
oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
- if (GetMyThreadInfo && GetMyThreadInfo->thr_gtm_conn && GetMyThreadInfo->thr_gtm_conn->sock != -1)
- StreamClose(GetMyThreadInfo->thr_gtm_conn->sock);
-
+ if (GetMyThreadInfo->thr_gtm_conn)
+ GTMPQfinish(GetMyThreadInfo->thr_gtm_conn);
sprintf(gtm_connect_string, "host=%s port=%d node_name=%s remote_type=%d",
GTMServerHost, GTMServerPortNumber, GTMProxyNodeName, GTM_NODE_GTM_PROXY);
- elog(LOG, "Worker thread connecting to %s", gtm_connect_string);
+ elog(DEBUG1, "Worker thread connecting to %s", gtm_connect_string);
GetMyThreadInfo->thr_gtm_conn = PQconnectGTM(gtm_connect_string);
if (GetMyThreadInfo->thr_gtm_conn == NULL)
elog(FATAL, "Worker thread GTM connection failed.");
- elog(LOG, "Worker thread connection done.");
+ elog(DEBUG1, "Worker thread connection done.");
MemoryContextSwitchTo(oldContext);
diff --git a/src/gtm/proxy/proxy_thread.c b/src/gtm/proxy/proxy_thread.c
index ad8d155c4f..4247be2d69 100644
--- a/src/gtm/proxy/proxy_thread.c
+++ b/src/gtm/proxy/proxy_thread.c
@@ -254,7 +254,7 @@ GTMProxy_ThreadCleanup(void *argp)
{
GTMProxy_ThreadInfo *thrinfo = (GTMProxy_ThreadInfo *)argp;
- elog(LOG, "Cleaning up thread state");
+ elog(DEBUG1, "Cleaning up thread state");
/*
* TODO Close the open connection.
diff --git a/src/gtm/recovery/Makefile b/src/gtm/recovery/Makefile
index f604d2bb65..e98e0f69fd 100644
--- a/src/gtm/recovery/Makefile
+++ b/src/gtm/recovery/Makefile
@@ -11,17 +11,21 @@ top_builddir=../../..
include $(top_builddir)/src/Makefile.global
subdir=src/gtm/recovery
-include $(top_srcdir)/src/backend/common.mk
+NAME=gtmrecovery
+SO_MAJOR_VERSION= 1
+SO_MINOR_VERSION= 0
-OBJS = register_common.o standby_utils.o
+OBJS=register_common.o register_gtm.o replication.o standby_utils.o
-all: libgtmrecovery.a
+OTHERS=../client/libgtmclient.a
-libgtmrecovery.a: $(OBJS)
- $(AR) $(AROPT) $@ $^
+all:all-lib
+
+include $(top_srcdir)/src/Makefile.shlib
clean:
- rm -f $(OBJS) libgtmrecovery.a
+ rm -f $(OBJS)
+ rm -f libgtmrecovery.a libgtmrecovery.so libgtmrecovery.so.1 libgtmrecovery.so.1.0
distclean: clean
diff --git a/src/gtm/recovery/register_common.c b/src/gtm/recovery/register_common.c
index f8f98ec8ac..4de2299b47 100644
--- a/src/gtm/recovery/register_common.c
+++ b/src/gtm/recovery/register_common.c
@@ -3,6 +3,11 @@
* register.c
* PGXC Node Register on GTM and GTM Proxy, node registering functions
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -31,6 +36,7 @@
#include "gtm/register.h"
#include "gtm/gtm_ip.h"
+#include "storage/backendid.h"
#define GTM_NODE_FILE "register.node"
#define NODE_HASH_TABLE_SIZE 16
@@ -120,7 +126,7 @@ pgxcnode_find_by_type(GTM_PGXCNodeType type, GTM_PGXCNodeInfo **data, size_t max
if (cur != NULL && cur->type == type)
{
data[node] = cur;
- elog(LOG, "pgxcnode_find_by_type: cur=%p, ipaddress=%s", cur, cur->ipaddress);
+ elog(DEBUG1, "pgxcnode_find_by_type: cur=%p, ipaddress=%s", cur, cur->ipaddress);
node++;
}
@@ -349,8 +355,18 @@ Recovery_PGXCNodeUnregister(GTM_PGXCNodeType type, char *node_name, bool in_reco
Recovery_RecordRegisterInfo(nodeinfo, false);
pfree(nodeinfo->nodename);
+#ifdef XCP
+ if (nodeinfo->ipaddress)
+#endif
pfree(nodeinfo->ipaddress);
+#ifdef XCP
+ if (nodeinfo->datafolder)
+#endif
pfree(nodeinfo->datafolder);
+#ifdef XCP
+ if (nodeinfo->sessions)
+ pfree(nodeinfo->sessions);
+#endif
pfree(nodeinfo);
}
else
@@ -373,7 +389,11 @@ Recovery_PGXCNodeRegister(GTM_PGXCNodeType type,
GTM_PGXCNodeInfo *nodeinfo = NULL;
int errcode = 0;
+#ifdef XCP
+ nodeinfo = (GTM_PGXCNodeInfo *) palloc0(sizeof(GTM_PGXCNodeInfo));
+#else
nodeinfo = (GTM_PGXCNodeInfo *) palloc(sizeof (GTM_PGXCNodeInfo));
+#endif
if (nodeinfo == NULL)
ereport(ERROR, (ENOMEM, errmsg("Out of memory")));
@@ -394,10 +414,10 @@ Recovery_PGXCNodeRegister(GTM_PGXCNodeType type,
nodeinfo->status = status;
nodeinfo->socket = socket;
- elog(LOG, "Recovery_PGXCNodeRegister Request info: type=%d, nodename=%s, port=%d," \
+ elog(DEBUG1, "Recovery_PGXCNodeRegister Request info: type=%d, nodename=%s, port=%d," \
"datafolder=%s, ipaddress=%s, status=%d",
type, nodename, port, datafolder, ipaddress, status);
- elog(LOG, "Recovery_PGXCNodeRegister Node info: type=%d, nodename=%s, port=%d, "\
+ elog(DEBUG1, "Recovery_PGXCNodeRegister Node info: type=%d, nodename=%s, port=%d, "\
"datafolder=%s, ipaddress=%s, status=%d",
nodeinfo->type, nodeinfo->nodename, nodeinfo->port,
nodeinfo->datafolder, nodeinfo->ipaddress, nodeinfo->status);
@@ -633,6 +653,7 @@ Recovery_RecordRegisterInfo(GTM_PGXCNodeInfo *nodeinfo, bool is_register)
void
Recovery_RestoreRegisterInfo(void)
{
+#ifndef XCP
int magic;
int ctlfd;
@@ -701,6 +722,7 @@ Recovery_RestoreRegisterInfo(void)
}
close(ctlfd);
+#endif
}
void
@@ -785,6 +807,94 @@ Recovery_PGXCNodeBackendDisconnect(GTM_PGXCNodeType type, char *nodename, int so
return errcode;
}
+
+#ifdef XCP
+/*
+ * Register active distributed session. If another session with specified
+ * BackendId already exists return the PID of the session, so caller could clean
+ * it up. Otherwise return 0.
+ */
+int
+Recovery_PGXCNodeRegisterCoordProcess(char *coord_node, int coord_procid,
+		int coord_backendid)
+{
+	GTM_PGXCNodeInfo	*nodeinfo;
+	int			i;
+
+	/*
+	 * Get the registration record for the coordinator node. If not specified,
+	 * register it now.
+	 */
+	nodeinfo = pgxcnode_find_info(GTM_NODE_COORDINATOR, coord_node);
+
+	if (nodeinfo == NULL)
+	{
+		if (Recovery_PGXCNodeRegister(GTM_NODE_COORDINATOR, coord_node, 0, NULL,
+									  NODE_CONNECTED, NULL, NULL, false, 0))
+			return 0;
+
+		nodeinfo = pgxcnode_find_info(GTM_NODE_COORDINATOR, coord_node);
+
+		/*
+		 * If the record is still missing (e.g. a concurrent unregister
+		 * raced with us) give up rather than dereference NULL below.
+		 */
+		if (nodeinfo == NULL)
+			return 0;
+	}
+
+	/* Iterate over the existing sessions */
+	GTM_RWLockAcquire(&nodeinfo->node_lock, GTM_LOCKMODE_WRITE);
+	for (i = 0; i < nodeinfo->num_sessions; i++)
+	{
+		if (nodeinfo->sessions[i].gps_coord_proc_id == coord_procid)
+		{
+			/*
+			 * Already registered, nothing todo.
+			 * May be session lost the GTM connection and now is reconnecting.
+			 */
+			GTM_RWLockRelease(&nodeinfo->node_lock);
+			return 0;
+		}
+		if (nodeinfo->sessions[i].gps_coord_backend_id == coord_backendid)
+		{
+			/*
+			 * Reuse the entry and return PID of the previous session.
+			 */
+			int result = nodeinfo->sessions[i].gps_coord_proc_id;
+			elog(DEBUG1, "New session %s:%d with existing BackendId %d",
+				 coord_node, coord_procid, coord_backendid);
+			nodeinfo->sessions[i].gps_coord_proc_id = coord_procid;
+			GTM_RWLockRelease(&nodeinfo->node_lock);
+			return result;
+		}
+	}
+	/* Session not found, populate new entry */
+	elog(DEBUG1, "New session %s:%d with BackendId %d",
+		 coord_node, coord_procid, coord_backendid);
+	if (nodeinfo->num_sessions == nodeinfo->max_sessions)
+	{
+		/* need to extend array */
+#define INIT_SESSIONS 256
+		if (nodeinfo->max_sessions == 0)
+		{
+			nodeinfo->sessions = (GTM_PGXCSession *)
+					palloc(INIT_SESSIONS * sizeof(GTM_PGXCSession));
+			nodeinfo->max_sessions = INIT_SESSIONS;
+		}
+		else
+		{
+			/* Double the capacity each time we run out of slots */
+			int newsize = nodeinfo->max_sessions * 2;
+			nodeinfo->sessions = (GTM_PGXCSession *)
+					repalloc(nodeinfo->sessions,
+							 newsize * sizeof(GTM_PGXCSession));
+			nodeinfo->max_sessions = newsize;
+		}
+	}
+	nodeinfo->sessions[nodeinfo->num_sessions].gps_coord_proc_id = coord_procid;
+	nodeinfo->sessions[nodeinfo->num_sessions].gps_coord_backend_id = coord_backendid;
+	nodeinfo->num_sessions++;
+	GTM_RWLockRelease(&nodeinfo->node_lock);
+
+	return 0;
+}
+#endif
+
+
/*
* Process MSG_BACKEND_DISCONNECT
*
@@ -847,7 +957,7 @@ ProcessPGXCNodeBackendDisconnect(Port *myport, StringInfo message)
GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
int count = 0;
- elog(LOG, "forwarding MSG_BACKEND_DISCONNECT to standby GTM %p.",
+ elog(DEBUG1, "forwarding MSG_BACKEND_DISCONNECT to standby GTM %p.",
GetMyThreadInfo->thr_conn->standby);
retry:
@@ -859,6 +969,6 @@ retry:
if (gtm_standby_check_communication_error(&count, oldconn))
goto retry;
- elog(LOG, "MSG_BACKEND_DISCONNECT rc=%d done.", _rc);
+ elog(DEBUG1, "MSG_BACKEND_DISCONNECT rc=%d done.", _rc);
}
}
diff --git a/src/gtm/recovery/register_gtm.c b/src/gtm/recovery/register_gtm.c
new file mode 100644
index 0000000000..bb7f433404
--- /dev/null
+++ b/src/gtm/recovery/register_gtm.c
@@ -0,0 +1,597 @@
+/*-------------------------------------------------------------------------
+ *
+ * register.c
+ * PGXC Node Register on GTM and GTM Proxy, node registering functions
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "gtm/elog.h"
+#include "gtm/gtm.h"
+#include "gtm/gtm_client.h"
+#include "gtm/gtm_serialize.h"
+#include "gtm/gtm_standby.h"
+#include "gtm/libpq.h"
+#include "gtm/libpq-int.h"
+#include "gtm/pqformat.h"
+#include "gtm/stringinfo.h"
+#include "gtm/register.h"
+
+#include "gtm/gtm_ip.h"
+
+#ifdef XCP
+#include "storage/backendid.h"
+#endif
+
+static void finishStandbyConn(GTM_ThreadInfo *thrinfo);
+extern bool Backup_synchronously;
+
+/*
+ * Process MSG_NODE_REGISTER/MSG_BKUP_NODE_REGISTER message.
+ *
+ * is_backup indicates the message is MSG_BKUP_NODE_REGISTER.
+ */
+void
+ProcessPGXCNodeRegister(Port *myport, StringInfo message, bool is_backup)
+{
+	GTM_PGXCNodeType	type;
+	GTM_PGXCNodePort	port;
+	char			remote_host[NI_MAXHOST];
+	char			datafolder[NI_MAXHOST];
+	char			node_name[NI_MAXHOST];
+	char			proxyname[NI_MAXHOST];
+	char			*ipaddress;
+	MemoryContext		oldContext;
+	int			len;
+	StringInfoData		buf;
+	GTM_PGXCNodeStatus	status;
+
+	/* Read Node Type */
+	memcpy(&type, pq_getmsgbytes(message, sizeof (GTM_PGXCNodeType)),
+			sizeof (GTM_PGXCNodeType));
+
+	/* Read Node name */
+	len = pq_getmsgint(message, sizeof (int));
+	if (len < 0 || len >= NI_MAXHOST)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Invalid name length.")));
+
+	memcpy(node_name, (char *)pq_getmsgbytes(message, len), len);
+	node_name[len] = '\0';
+
+	/*
+	 * Read Host name.  The length must be validated before the copy, the
+	 * same way as the node name above, or a malicious/corrupt message can
+	 * overflow the fixed-size stack buffer.
+	 */
+	len = pq_getmsgint(message, sizeof (int));
+	if (len < 0 || len >= NI_MAXHOST)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Invalid host name length.")));
+	memcpy(remote_host, (char *)pq_getmsgbytes(message, len), len);
+	remote_host[len] = '\0';
+	ipaddress = remote_host;
+
+	/* Read Port Number */
+	memcpy(&port, pq_getmsgbytes(message, sizeof (GTM_PGXCNodePort)),
+			sizeof (GTM_PGXCNodePort));
+
+	/* Read Proxy name (empty string if no proxy used) */
+	len = pq_getmsgint(message, sizeof (GTM_StrLen));
+	if (len < 0 || len >= NI_MAXHOST)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Invalid proxy name length.")));
+	memcpy(proxyname, (char *)pq_getmsgbytes(message, len), len);
+	proxyname[len] = '\0';
+
+	/*
+	 * Finish by reading Data Folder (length and then string).  Bounds-check
+	 * the length here too; it was previously copied unchecked.
+	 */
+	len = pq_getmsgint(message, sizeof (GTM_StrLen));
+	if (len < 0 || len >= NI_MAXHOST)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Invalid data folder length.")));
+	memcpy(datafolder, (char *)pq_getmsgbytes(message, len), len);
+	datafolder[len] = '\0';
+
+	elog(DEBUG1,
+		 "ProcessPGXCNodeRegister: ipaddress = \"%s\", node name = \"%s\", proxy name = \"%s\", "
+		 "datafolder \"%s\"",
+		 ipaddress, node_name, proxyname, datafolder);
+
+	status = pq_getmsgint(message, sizeof (GTM_PGXCNodeStatus));
+
+	if ((type!=GTM_NODE_GTM_PROXY) &&
+		(type!=GTM_NODE_GTM_PROXY_POSTMASTER) &&
+		(type!=GTM_NODE_COORDINATOR) &&
+		(type!=GTM_NODE_DATANODE) &&
+		(type!=GTM_NODE_GTM) &&
+		(type!=GTM_NODE_DEFAULT))
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Unknown node type.")));
+
+	elog(DEBUG1, "Node type = %d", type);
+
+	/*
+	 * We must use the TopMostMemoryContext because the Node ID information is
+	 * not bound to a thread and can outlive any of the thread specific
+	 * contextes.
+	 */
+	oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+
+	/*
+	 * We don't check if the this is not in standby mode to allow
+	 * cascaded standby.
+	 */
+	if (type == GTM_NODE_GTM)
+	{
+		elog(DEBUG1, "Registering GTM (Standby). Unregister this first.");
+		/*
+		 * There's another standby. May be failed one.
+		 * Clean this up. This means that we allow
+		 * only one standby at the same time.
+		 *
+		 * This helps to give up failed standby and connect
+		 * new one, regardless how they stopped.
+		 *
+		 * Be sure that all the threads are locked by other
+		 * means, typically by receiving MSG_BEGIN_BACKUP.
+		 *
+		 * First try to unregister GTM which is now connected. We don't care
+		 * if it failed.
+		 */
+		Recovery_PGXCNodeUnregister(type, node_name, false, -1);
+		/*
+		 * Then disconnect the connections to the standby from each thread.
+		 * Please note that we assume only one standby is allowed at the same time.
+		 * Cascade standby may be allowed.
+		 */
+		GTM_DoForAllOtherThreads(finishStandbyConn);
+
+		GTMThreads->gt_standby_ready = true;
+	}
+
+	if (Recovery_PGXCNodeRegister(type, node_name, port,
+								  proxyname, NODE_CONNECTED,
+								  ipaddress, datafolder, false, myport->sock))
+	{
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Failed to Register node")));
+	}
+
+	/*
+	 * We don't check if the this is not in standby mode to allow
+	 * cascaded standby.
+	 */
+	if (type == GTM_NODE_GTM)
+		GTMThreads->gt_standby_ready = true;
+
+	MemoryContextSwitchTo(oldContext);
+
+	pq_getmsgend(message);
+
+	if (!is_backup)
+	{
+		/*
+		 * Backup first
+		 */
+		if (GetMyThreadInfo->thr_conn->standby)
+		{
+			int _rc;
+			GTM_Conn *oldconn = GetMyThreadInfo->thr_conn->standby;
+			int count = 0;
+			GTM_PGXCNodeInfo *standbynode;
+
+			elog(DEBUG1, "calling node_register_internal() for standby GTM %p.",
+				 GetMyThreadInfo->thr_conn->standby);
+
+		retry:
+			_rc = bkup_node_register_internal(GetMyThreadInfo->thr_conn->standby,
+											  type,
+											  ipaddress,
+											  port,
+											  node_name,
+											  datafolder,
+											  status);
+
+			elog(DEBUG1, "node_register_internal() returns rc %d.", _rc);
+
+			if (gtm_standby_check_communication_error(&count, oldconn))
+				goto retry;
+
+			/* Now check if there're other standby registered. */
+			standbynode = find_standby_node_info();
+			if (!standbynode)
+				GTMThreads->gt_standby_ready = false;
+
+			if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
+				gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
+
+		}
+		/*
+		 * Then, send a SUCCESS message back to the client
+		 */
+		pq_beginmessage(&buf, 'S');
+		pq_sendint(&buf, NODE_REGISTER_RESULT, 4);
+		if (myport->remote_type == GTM_NODE_GTM_PROXY)
+		{
+			GTM_ProxyMsgHeader proxyhdr;
+			proxyhdr.ph_conid = myport->conn_id;
+			pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+		}
+		pq_sendbytes(&buf, (char *)&type, sizeof(GTM_PGXCNodeType));
+		/* Node name length */
+		pq_sendint(&buf, strlen(node_name), 4);
+		/* Node name (var-len) */
+		pq_sendbytes(&buf, node_name, strlen(node_name));
+		pq_endmessage(myport, &buf);
+
+		if (myport->remote_type != GTM_NODE_GTM_PROXY)
+		{
+			if (GetMyThreadInfo->thr_conn->standby)
+				gtmpqFlush(GetMyThreadInfo->thr_conn->standby);
+			pq_flush(myport);
+		}
+	}
+}
+
+
+/*
+ * Process MSG_NODE_UNREGISTER/MSG_BKUP_NODE_UNREGISTER
+ *
+ * Removes a node registration from the GTM.  is_backup indicates
+ * MSG_BKUP_NODE_UNREGISTER, i.e. the message was forwarded from the master
+ * GTM, in which case no standby propagation and no client response is done.
+ */
+void
+ProcessPGXCNodeUnregister(Port *myport, StringInfo message, bool is_backup)
+{
+	GTM_PGXCNodeType	type;
+	MemoryContext	oldContext;
+	StringInfoData	buf;
+	int		len;
+	char		node_name[NI_MAXHOST];
+
+	/* Read Node Type */
+	memcpy(&type, pq_getmsgbytes(message, sizeof (GTM_PGXCNodeType)),
+			sizeof (GTM_PGXCNodeType));
+
+	/* Read Node name */
+	len = pq_getmsgint(message, sizeof (int));
+
+	/*
+	 * Reject a negative length as well as an overlong one: len comes from
+	 * the wire and is used below as a memcpy() size and as an index into
+	 * node_name[].
+	 */
+	if (len < 0 || len >= NI_MAXHOST)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Invalid node name length")));
+	memcpy(node_name, (char *)pq_getmsgbytes(message, len), len);
+	node_name[len] = '\0';
+
+	/*
+	 * We must use the TopMostMemoryContext because the Node ID information is
+	 * not bound to a thread and can outlive any of the thread specific
+	 * contexts.
+	 */
+	oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+
+	if (Recovery_PGXCNodeUnregister(type, node_name, false, myport->sock))
+	{
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Failed to Unregister node")));
+	}
+
+	MemoryContextSwitchTo(oldContext);
+
+	pq_getmsgend(message);
+
+	if (!is_backup)
+	{
+		/*
+		 * Backup first: forward the unregistration to the standby, if any,
+		 * before answering the client.
+		 */
+		if (GetMyThreadInfo->thr_conn->standby)
+		{
+			int	_rc;
+			GTM_Conn	*oldconn = GetMyThreadInfo->thr_conn->standby;
+			int	count = 0;
+
+			elog(DEBUG1, "calling node_unregister() for standby GTM %p.",
+				 GetMyThreadInfo->thr_conn->standby);
+
+		retry:
+			_rc = bkup_node_unregister(GetMyThreadInfo->thr_conn->standby,
+									   type,
+									   node_name);
+
+			/*
+			 * On a communication error the standby connection may have been
+			 * re-established; retry against the (possibly new) connection.
+			 */
+			if (gtm_standby_check_communication_error(&count, oldconn))
+				goto retry;
+
+			if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
+				gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
+
+			elog(DEBUG1, "node_unregister() returns rc %d.", _rc);
+		}
+		/*
+		 * Send a SUCCESS message back to the client
+		 */
+		pq_beginmessage(&buf, 'S');
+		pq_sendint(&buf, NODE_UNREGISTER_RESULT, 4);
+		if (myport->remote_type == GTM_NODE_GTM_PROXY)
+		{
+			GTM_ProxyMsgHeader proxyhdr;
+			proxyhdr.ph_conid = myport->conn_id;
+			pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+		}
+		pq_sendbytes(&buf, (char *)&type, sizeof(GTM_PGXCNodeType));
+		/* Node name length */
+		pq_sendint(&buf, strlen(node_name), 4);
+		/* Node name (var-len) */
+		pq_sendbytes(&buf, node_name, strlen(node_name));
+
+		pq_endmessage(myport, &buf);
+
+		/* Flush standby before flush to the client */
+		if (myport->remote_type != GTM_NODE_GTM_PROXY)
+		{
+			if (GetMyThreadInfo->thr_conn->standby)
+				gtmpqFlush(GetMyThreadInfo->thr_conn->standby);
+			pq_flush(myport);
+		}
+	}
+}
+
+/*
+ * Process MSG_NODE_LIST
+ *
+ * Sends the client a NODE_LIST_RESULT message carrying the number of
+ * registered nodes followed by (size, serialized GTM_PGXCNodeInfo) pairs.
+ */
+void
+ProcessPGXCNodeList(Port *myport, StringInfo message)
+{
+	MemoryContext	oldContext;
+	StringInfoData	buf;
+	int		num_node = 0;	/* set for real by pgxcnode_get_all() below */
+	int		i;
+
+	GTM_PGXCNodeInfo	*data[MAX_NODES];
+	char		*s_data[MAX_NODES];
+	size_t		s_datalen[MAX_NODES];
+
+	/*
+	 * We must use the TopMostMemoryContext because the Node ID information is
+	 * not bound to a thread and can outlive any of the thread specific
+	 * contexts.
+	 */
+	oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+
+	memset(data, 0, sizeof(GTM_PGXCNodeInfo *) * MAX_NODES);
+	memset(s_data, 0, sizeof(char *) * MAX_NODES);
+
+	num_node = pgxcnode_get_all(data, MAX_NODES);
+
+	for (i = 0; i < num_node; i++)
+	{
+		size_t	s_len;
+
+		s_len = gtm_get_pgxcnodeinfo_size(data[i]);
+
+		/*
+		 * Allocate memory blocks for serialized GTM_PGXCNodeInfo data.
+		 * These are raw malloc'ed (not palloc'ed) blocks and are freed
+		 * explicitly at the bottom of this function.
+		 */
+		s_data[i] = (char *)malloc(s_len + 1);
+		if (s_data[i] == NULL)
+			ereport(ERROR,
+					(ENOMEM,
+					 errmsg("Out of memory while serializing node list")));
+		memset(s_data[i], 0, s_len + 1);
+
+		s_datalen[i] = gtm_serialize_pgxcnodeinfo(data[i], s_data[i], s_len + 1);
+
+		elog(DEBUG1, "gtm_get_pgxcnodeinfo_size: s_len=%ld, s_datalen=%ld", s_len, s_datalen[i]);
+	}
+
+	MemoryContextSwitchTo(oldContext);
+
+	pq_getmsgend(message);
+
+	/*
+	 * Send a SUCCESS message back to the client
+	 */
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, NODE_LIST_RESULT, 4);
+	if (myport->remote_type == GTM_NODE_GTM_PROXY)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_sendint(&buf, num_node, sizeof(int));	/* number of nodes */
+
+	/*
+	 * Send pairs of GTM_PGXCNodeInfo size and serialized GTM_PGXCNodeInfo body.
+	 */
+	for (i = 0; i < num_node; i++)
+	{
+		pq_sendint(&buf, s_datalen[i], sizeof(int));
+		pq_sendbytes(&buf, s_data[i], s_datalen[i]);
+	}
+
+	pq_endmessage(myport, &buf);
+
+	if (myport->remote_type != GTM_NODE_GTM_PROXY)
+		pq_flush(myport);
+
+	/*
+	 * Release memory blocks for the serialized data.
+	 */
+	for (i = 0; i < num_node; i++)
+	{
+		free(s_data[i]);
+	}
+
+	elog(DEBUG1, "ProcessPGXCNodeList() ok.");
+}
+
+/*
+ * Process MSG_BEGIN_BACKUP
+ *
+ * Prepare this GTM for a standby backup: take the thread lock of every
+ * other worker thread in WRITE mode so no other thread mutates shared
+ * state while the standby copies it, mark this thread as being in backup
+ * mode, and acknowledge the client with BEGIN_BACKUP_RESULT.  The locks
+ * stay held until MSG_END_BACKUP arrives (see ProcessGTMEndBackup).
+ */
+void
+ProcessGTMBeginBackup(Port *myport, StringInfo message)
+{
+	int ii;
+	GTM_ThreadInfo *my_threadinfo;
+	StringInfoData buf;
+
+	pq_getmsgend(message);
+	my_threadinfo = GetMyThreadInfo;
+
+	/* Lock out every other worker thread; skip empty slots and ourselves. */
+	for (ii = 0; ii < GTMThreads->gt_array_size; ii++)
+	{
+		if (GTMThreads->gt_threads[ii] && GTMThreads->gt_threads[ii] != my_threadinfo)
+			GTM_RWLockAcquire(&GTMThreads->gt_threads[ii]->thr_lock, GTM_LOCKMODE_WRITE);
+	}
+	my_threadinfo->thr_status = GTM_THREAD_BACKUP;
+	/* Acknowledge the requester. */
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, BEGIN_BACKUP_RESULT, 4);
+	pq_endmessage(myport, &buf);
+	pq_flush(myport);
+}
+
+/*
+ * Process MSG_END_BACKUP
+ *
+ * Counterpart of ProcessGTMBeginBackup: release the thread locks taken
+ * there, return this thread to normal running status, and acknowledge
+ * the client with END_BACKUP_RESULT.
+ */
+void
+ProcessGTMEndBackup(Port *myport, StringInfo message)
+{
+	int ii;
+	GTM_ThreadInfo *my_threadinfo;
+	StringInfoData buf;
+
+	pq_getmsgend(message);
+	my_threadinfo = GetMyThreadInfo;
+
+	/* Release the write locks taken in ProcessGTMBeginBackup. */
+	for (ii = 0; ii < GTMThreads->gt_array_size; ii++)
+	{
+		if (GTMThreads->gt_threads[ii] && GTMThreads->gt_threads[ii] != my_threadinfo)
+			GTM_RWLockRelease(&GTMThreads->gt_threads[ii]->thr_lock);
+	}
+	my_threadinfo->thr_status = GTM_THREAD_RUNNING;
+	/* Acknowledge the requester. */
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, END_BACKUP_RESULT, 4);
+	pq_endmessage(myport, &buf);
+	pq_flush(myport);
+}
+
+
+/*
+ * Close and forget the standby connection of the given thread, if any.
+ *
+ * Invoked for every other thread (via GTM_DoForAllOtherThreads) when a new
+ * standby registers, so that stale connections to a previous standby are
+ * dropped.  Safe to call on a thread that has no standby connection.
+ */
+static void
+finishStandbyConn(GTM_ThreadInfo *thrinfo)
+{
+	if ((thrinfo->thr_conn != NULL) && (thrinfo->thr_conn->standby != NULL))
+	{
+		GTMPQfinish(thrinfo->thr_conn->standby);
+		thrinfo->thr_conn->standby = NULL;
+	}
+}
+
+
+#ifdef XCP
+/*
+ * Process MSG_REGISTER_SESSION message
+ *
+ * Registers a coordinator session (node name, process id, backend id) with
+ * the GTM, cleans up any previous session that held the same backend id,
+ * forwards the registration to the standby if one is connected, and
+ * acknowledges the client with REGISTER_SESSION_RESULT.
+ */
+void
+ProcessPGXCRegisterSession(Port *myport, StringInfo message)
+{
+	char	coord_name[SP_NODE_NAME];
+	int32	coord_procid;
+	int32	coord_backendid;
+	int32	len;
+	MemoryContext	oldContext;
+	int		old_procid;
+	StringInfoData	buf;
+
+	len = pq_getmsgint(message, sizeof(len));
+
+	/*
+	 * Reject a negative length as well as an overlong one: len comes from
+	 * the wire and is used below as a memcpy() size and as an index into
+	 * coord_name[].
+	 */
+	if (len < 0 || len >= SP_NODE_NAME)
+		ereport(ERROR,
+				(EINVAL,
+				 errmsg("Invalid name length.")));
+
+	memcpy(coord_name, (char *)pq_getmsgbytes(message, len), len);
+	coord_name[len] = '\0';
+
+	coord_procid = pq_getmsgint(message, sizeof(coord_procid));
+
+	coord_backendid = pq_getmsgint(message, sizeof(coord_backendid));
+
+	/*
+	 * Register the session when at least one identifying datum was supplied.
+	 * NOTE(review): "all required data" would call for &&, but the committed
+	 * behavior is any-of (||); keeping it as-is — confirm intent upstream.
+	 */
+	if (len > 0 || coord_procid > 0 || coord_backendid != InvalidBackendId)
+	{
+		/* Session info must outlive this thread: allocate in top context. */
+		oldContext = MemoryContextSwitchTo(TopMostMemoryContext);
+
+		/*
+		 * Register the session
+		 */
+		old_procid = Recovery_PGXCNodeRegisterCoordProcess(coord_name, coord_procid,
+														   coord_backendid);
+		MemoryContextSwitchTo(oldContext);
+
+		/*
+		 * If there was a session with same backend id clean it up.
+		 */
+		if (old_procid)
+			GTM_CleanupSeqSession(coord_name, old_procid);
+	}
+
+	/*
+	 * If there is a standby forward the info to it
+	 */
+	if (GetMyThreadInfo->thr_conn->standby)
+	{
+		int	_rc;
+		GTM_Conn	*oldconn = GetMyThreadInfo->thr_conn->standby;
+		int	count = 0;
+		GTM_PGXCNodeInfo	*standbynode;
+
+		elog(DEBUG1, "calling register_session() for standby GTM %p.",
+			 GetMyThreadInfo->thr_conn->standby);
+
+		/* Retry while the standby connection is being re-established. */
+		do
+		{
+			_rc = register_session(GetMyThreadInfo->thr_conn->standby,
+								   coord_name, coord_procid, coord_backendid);
+
+			elog(DEBUG1, "register_session() returns rc %d.", _rc);
+		}
+		while (gtm_standby_check_communication_error(&count, oldconn));
+
+		/* Now check if there're other standby registered. */
+		standbynode = find_standby_node_info();
+		if (!standbynode)
+			GTMThreads->gt_standby_ready = false;
+
+		if (Backup_synchronously && (myport->remote_type != GTM_NODE_GTM_PROXY))
+			gtm_sync_standby(GetMyThreadInfo->thr_conn->standby);
+
+	}
+
+	/* Make up response */
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, REGISTER_SESSION_RESULT, 4);
+	/* For proxy write out header */
+	if (myport->remote_type == GTM_NODE_GTM_PROXY)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_endmessage(myport, &buf);
+	/* Flush connections */
+	if (myport->remote_type != GTM_NODE_GTM_PROXY)
+	{
+		if (GetMyThreadInfo->thr_conn->standby)
+			gtmpqFlush(GetMyThreadInfo->thr_conn->standby);
+		pq_flush(myport);
+	}
+}
+#endif
diff --git a/src/gtm/recovery/replication.c b/src/gtm/recovery/replication.c
new file mode 100644
index 0000000000..bc04b191db
--- /dev/null
+++ b/src/gtm/recovery/replication.c
@@ -0,0 +1,129 @@
+/*-------------------------------------------------------------------------
+ *
+ * replication.c
+ * Controlling the initialization and end of replication process of GTM data
+ *
+ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/gtm/recovery/replication.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "gtm/replication.h"
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "gtm/gtm_c.h"
+#include "gtm/gtm.h"
+#include "gtm/gtm_txn.h"
+#include "gtm/standby_utils.h"
+#include "gtm/gtm_standby.h"
+#include "gtm/register.h"
+#include "gtm/assert.h"
+#include <stdio.h>
+#include "gtm/libpq.h"
+#include "gtm/pqformat.h"
+#include "gtm/gtm_msg.h"
+#include "gtm/gtm_ip.h"
+
+/*
+ * Process MSG_NODE_BEGIN_REPLICATION_INIT (sent by a connecting standby).
+ */
+void
+ProcessBeginReplicationInitialSyncRequest(Port *myport, StringInfo message)
+{
+	StringInfoData buf;
+	MemoryContext oldContext;
+
+	pq_getmsgend(message);
+
+	if (Recovery_IsStandby())
+		ereport(ERROR,
+				(EPERM,
+				 errmsg("Operation not permitted under the standby mode.")));
+
+	oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+	/* Acquire global locks; held until MSG_NODE_END_REPLICATION_INIT. */
+	GTM_RWLockAcquire(&GTMTransactions.gt_XidGenLock, GTM_LOCKMODE_WRITE);
+	GTM_RWLockAcquire(&GTMTransactions.gt_TransArrayLock, GTM_LOCKMODE_WRITE);
+	elog(DEBUG1, "Prepared for copying data with holding XidGenLock and TransArrayLock.");
+
+	MemoryContextSwitchTo(oldContext);
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, NODE_BEGIN_REPLICATION_INIT_RESULT, 4);
+	if (myport->remote_type == GTM_NODE_GTM_PROXY)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_endmessage(myport, &buf);
+
+	/*
+	 * Because this command comes from the standby, we don't have to flush
+	 * messages to the standby here.
+	 */
+	if (myport->remote_type != GTM_NODE_GTM_PROXY)
+		pq_flush(myport);
+
+	elog(DEBUG1, "ProcessBeginReplicationInitialSyncRequest() ok.");
+
+	return;
+}
+
+/*
+ * Process MSG_NODE_END_REPLICATION_INIT (sent by the standby when done).
+ */
+void
+ProcessEndReplicationInitialSyncRequest(Port *myport, StringInfo message)
+{
+	StringInfoData buf;
+	MemoryContext oldContext;
+
+	pq_getmsgend(message);
+
+	if (Recovery_IsStandby())
+		ereport(ERROR,
+				(EPERM,
+				 errmsg("Operation not permitted under the standby mode.")));
+
+	oldContext = MemoryContextSwitchTo(TopMemoryContext);
+
+	/*
+	 * Release, in reverse order, the global locks taken at BEGIN_REPLICATION_INIT.
+	 */
+	GTM_RWLockRelease(&GTMTransactions.gt_TransArrayLock);
+	GTM_RWLockRelease(&GTMTransactions.gt_XidGenLock);
+	elog(DEBUG1, "XidGenLock and TransArrayLock released.");
+
+	MemoryContextSwitchTo(oldContext);
+
+	pq_beginmessage(&buf, 'S');
+	pq_sendint(&buf, NODE_END_REPLICATION_INIT_RESULT, 4);
+	if (myport->remote_type == GTM_NODE_GTM_PROXY)
+	{
+		GTM_ProxyMsgHeader proxyhdr;
+		proxyhdr.ph_conid = myport->conn_id;
+		pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader));
+	}
+	pq_endmessage(myport, &buf);
+
+	/*
+	 * Because this command comes from the standby, we don't have to flush
+	 * messages to the standby here.
+	 */
+	if (myport->remote_type != GTM_NODE_GTM_PROXY)
+		pq_flush(myport);
+
+	elog(DEBUG1, "ProcessEndReplicationInitialSyncRequest() ok.");
+
+	return;
+}
diff --git a/src/include/Makefile b/src/include/Makefile
index 5f5e6819d6..74de25eb3e 100644
--- a/src/include/Makefile
+++ b/src/include/Makefile
@@ -22,8 +22,7 @@ SUBDIRS = access bootstrap catalog commands datatype executor foreign lib libpq
tcop snowball snowball/libstemmer tsearch tsearch/dicts utils \
port port/win32 port/win32_msvc port/win32_msvc/sys \
port/win32/arpa port/win32/netinet port/win32/sys \
- portability \
- gtm
+ portability gtm
# Install all headers
install: all installdirs
diff --git a/src/include/access/gtm.h b/src/include/access/gtm.h
index 5c5692b2c5..5c70872a1b 100644
--- a/src/include/access/gtm.h
+++ b/src/include/access/gtm.h
@@ -1,7 +1,7 @@
/*-------------------------------------------------------------------------
*
* gtm.h
- *
+ *
* Module interfacing with GTM definitions
*
*
@@ -16,6 +16,9 @@
extern char *GtmHost;
extern int GtmPort;
+#ifdef XCP
+extern bool IsXidFromGTM;
+#endif
extern GlobalTransactionId currentGxid;
extern bool IsGTMConnected(void);
@@ -43,9 +46,15 @@ extern int RegisterGTM(GTM_PGXCNodeType type, GTM_PGXCNodePort port, char *dataf
extern int UnregisterGTM(GTM_PGXCNodeType type);
/* Sequence interface APIs with GTM */
+extern GTM_Sequence GetCurrentValGTM(char *seqname);
+#ifdef XCP
+extern GTM_Sequence GetNextValGTM(char *seqname,
+ GTM_Sequence range, GTM_Sequence *rangemax);
+#else
extern GTM_Sequence GetNextValGTM(char *seqname);
+#endif
extern int SetValGTM(char *seqname, GTM_Sequence nextval, bool iscalled);
-extern int CreateSequenceGTM(char *seqname, GTM_Sequence increment,
+extern int CreateSequenceGTM(char *seqname, GTM_Sequence increment,
GTM_Sequence minval, GTM_Sequence maxval, GTM_Sequence startval,
bool cycle);
extern int AlterSequenceGTM(char *seqname, GTM_Sequence increment,
diff --git a/src/include/access/htup.h b/src/include/access/htup.h
index c93a7fb36e..fbb802e4c9 100644
--- a/src/include/access/htup.h
+++ b/src/include/access/htup.h
@@ -4,6 +4,11 @@
* POSTGRES heap tuple definitions.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -534,6 +539,22 @@ typedef HeapTupleData *HeapTuple;
*/
#define GETSTRUCT(TUP) ((char *) ((TUP)->t_data) + (TUP)->t_data->t_hoff)
+#ifdef XCP
+/*
+ * Represents a DataRow message received from a remote node.
+ * Contains originating node number and message body in DataRow format without
+ * message code and length. Length and node number are separate fields.
+ * This is a variable length structure.
+ */
+typedef struct RemoteDataRowData
+{
+ Oid msgnode; /* node number of the data row message */
+ int msglen; /* length of the data row message */
+ char msg[0]; /* last data row message */
+} RemoteDataRowData;
+typedef RemoteDataRowData *RemoteDataRow;
+#endif
+
/*
* Accessor macros to be used with HeapTuple pointers.
*/
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index e9e5edda25..b9d46e3504 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -4,6 +4,11 @@
* postgres transaction access method support code
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -173,6 +178,10 @@ extern TransactionId GetNewTransactionId(bool isSubXact, bool *timestamp_receive
#else
extern TransactionId GetNewTransactionId(bool isSubXact);
#endif /* PGXC */
+#ifdef XCP
+extern bool TransactionIdIsCurrentGlobalTransactionId(TransactionId xid);
+extern TransactionId GetNextTransactionId(void);
+#endif
extern TransactionId ReadNewTransactionId(void);
extern void SetTransactionIdLimit(TransactionId oldest_datfrozenxid,
Oid oldest_datoid);
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index 82999726a0..127a849c10 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -4,6 +4,11 @@
* postgres transaction system definitions
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -228,8 +233,10 @@ extern TransactionId GetTopTransactionIdIfAny(void);
extern TransactionId GetCurrentTransactionId(void);
extern TransactionId GetCurrentTransactionIdIfAny(void);
#ifdef PGXC /* PGXC_COORD */
+#ifndef XCP
extern bool GetCurrentLocalParamStatus(void);
extern void SetCurrentLocalParamStatus(bool status);
+#endif
extern GlobalTransactionId GetAuxilliaryTransactionId(void);
extern GlobalTransactionId GetTopGlobalTransactionId(void);
extern void SetAuxilliaryTransactionId(GlobalTransactionId gxid);
diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h
index 47224d3dd2..7f0ed6db2b 100644
--- a/src/include/bootstrap/bootstrap.h
+++ b/src/include/bootstrap/bootstrap.h
@@ -28,8 +28,7 @@ typedef enum
WalReceiverProcess,
#ifdef PGXC
PoolerProcess,
-#endif
-
+#endif
NUM_AUXPROCTYPES /* Must be last! */
} AuxProcType;
diff --git a/src/include/catalog/catalog.h b/src/include/catalog/catalog.h
index 678a945271..5a1861da6e 100644
--- a/src/include/catalog/catalog.h
+++ b/src/include/catalog/catalog.h
@@ -4,6 +4,11 @@
* prototypes for functions in backend/catalog/catalog.c
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -36,8 +41,13 @@ extern char *relpathbackend(RelFileNode rnode, BackendId backend,
extern char *GetDatabasePath(Oid dbNode, Oid spcNode);
/* First argument is a RelFileNodeBackend */
+#ifdef XCP
+#define relpath(rnode, forknum) \
+ relpathbackend((rnode).node, InvalidBackendId, (forknum))
+#else
#define relpath(rnode, forknum) \
relpathbackend((rnode).node, (rnode).backend, (forknum))
+#endif
/* First argument is a RelFileNode */
#define relpathperm(rnode, forknum) \
diff --git a/src/include/catalog/namespace.h b/src/include/catalog/namespace.h
index 76215dc8a1..0c2d245e90 100644
--- a/src/include/catalog/namespace.h
+++ b/src/include/catalog/namespace.h
@@ -4,6 +4,11 @@
* prototypes for functions in backend/catalog/namespace.c
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -125,6 +130,9 @@ extern bool isOtherTempNamespace(Oid namespaceId);
extern int GetTempNamespaceBackendId(Oid namespaceId);
extern Oid GetTempToastNamespace(void);
extern void ResetTempTableNamespace(void);
+#ifdef XCP
+extern void ForgetTempTableNamespace(void);
+#endif
extern OverrideSearchPath *GetOverrideSearchPath(MemoryContext context);
extern OverrideSearchPath *CopyOverrideSearchPath(OverrideSearchPath *path);
diff --git a/src/include/catalog/pg_aggregate.h b/src/include/catalog/pg_aggregate.h
index ce8fc1b156..7bf70e4ff8 100644
--- a/src/include/catalog/pg_aggregate.h
+++ b/src/include/catalog/pg_aggregate.h
@@ -5,6 +5,11 @@
* along with the relation's initial contents.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -40,6 +45,9 @@
* aggfinalfn final function (0 if none)
* aggsortop associated sort operator (0 if none)
* aggtranstype type of aggregate's transition (state) data
+#ifdef PGXC
+ * aggcollecttype type of aggregate's collection (state) data
+#endif
* agginitval initial value for transition state (can be NULL)
#ifdef PGXC
* agginitcollect initial value for collection state (can be NULL)
@@ -56,6 +64,7 @@ CATALOG(pg_aggregate,2600) BKI_WITHOUT_OIDS
regproc aggfinalfn;
Oid aggsortop;
Oid aggtranstype;
+ Oid aggcollecttype; /* PGXC */
#ifdef CATALOG_VARLEN /* variable-length fields start here */
text agginitval;
@@ -76,15 +85,16 @@ typedef FormData_pg_aggregate *Form_pg_aggregate;
*/
#ifdef PGXC
-#define Natts_pg_aggregate 8
+#define Natts_pg_aggregate 9
#define Anum_pg_aggregate_aggfnoid 1
#define Anum_pg_aggregate_aggtransfn 2
#define Anum_pg_aggregate_aggcollectfn 3
#define Anum_pg_aggregate_aggfinalfn 4
#define Anum_pg_aggregate_aggsortop 5
#define Anum_pg_aggregate_aggtranstype 6
-#define Anum_pg_aggregate_agginitval 7
-#define Anum_pg_aggregate_agginitcollect 8
+#define Anum_pg_aggregate_aggcollecttype 7
+#define Anum_pg_aggregate_agginitval 8
+#define Anum_pg_aggregate_agginitcollect 9
#endif
#ifdef PGXC
//#define Natts_pg_aggregate 6
@@ -104,13 +114,13 @@ typedef FormData_pg_aggregate *Form_pg_aggregate;
/* avg */
#ifdef PGXC
-DATA(insert ( 2100 int8_avg_accum numeric_avg_collect numeric_avg 0 1231 "{0,0}" "{0,0}" ));
-DATA(insert ( 2101 int4_avg_accum int8_avg_collect int8_avg 0 1016 "{0,0}" "{0,0}" ));
-DATA(insert ( 2102 int2_avg_accum int8_avg_collect int8_avg 0 1016 "{0,0}" "{0,0}" ));
-DATA(insert ( 2103 numeric_avg_accum numeric_avg_collect numeric_avg 0 1231 "{0,0}" "{0,0}" ));
-DATA(insert ( 2104 float4_accum float8_collect float8_avg 0 1022 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2105 float8_accum float8_collect float8_avg 0 1022 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2106 interval_accum interval_collect interval_avg 0 1187 "{0 second,0 second}" "{0 second,0 second}" ));
+DATA(insert ( 2100 int8_avg_accum numeric_avg_collect numeric_avg 0 1231 1231 "{0,0}" "{0,0}" ));
+DATA(insert ( 2101 int4_avg_accum int8_avg_collect int8_avg 0 1016 1016 "{0,0}" "{0,0}" ));
+DATA(insert ( 2102 int2_avg_accum int8_avg_collect int8_avg 0 1016 1016 "{0,0}" "{0,0}" ));
+DATA(insert ( 2103 numeric_avg_accum numeric_avg_collect numeric_avg 0 1231 1231 "{0,0}" "{0,0}" ));
+DATA(insert ( 2104 float4_accum float8_collect float8_avg 0 1022 1022 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2105 float8_accum float8_collect float8_avg 0 1022 1022 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2106 interval_accum interval_collect interval_avg 0 1187 1187 "{0 second,0 second}" "{0 second,0 second}" ));
#endif
#ifdef PGXC
//DATA(insert ( 2100 int8_avg_accum numeric_avg 0 1231 "{0,0}" ));
@@ -124,14 +134,14 @@ DATA(insert ( 2106 interval_accum interval_collect interval_avg 0 1187 "{0 secon
/* sum */
#ifdef PGXC
-DATA(insert ( 2107 int8_sum numeric_add - 0 1700 _null_ "0" ));
-DATA(insert ( 2108 int4_sum int8_sum_to_int8 - 0 20 _null_ _null_ ));
-DATA(insert ( 2109 int2_sum int8_sum_to_int8 - 0 20 _null_ _null_ ));
-DATA(insert ( 2110 float4pl float4pl - 0 700 _null_ "0" ));
-DATA(insert ( 2111 float8pl float8pl - 0 701 _null_ "0" ));
-DATA(insert ( 2112 cash_pl cash_pl - 0 790 _null_ _null_ ));
-DATA(insert ( 2113 interval_pl interval_pl - 0 1186 _null_ _null_ ));
-DATA(insert ( 2114 numeric_add numeric_add - 0 1700 _null_ "0" ));
+DATA(insert ( 2107 int8_sum numeric_add - 0 1700 1700 _null_ _null_ ));
+DATA(insert ( 2108 int4_sum int8_sum_to_int8 - 0 20 20 _null_ _null_ ));
+DATA(insert ( 2109 int2_sum int8_sum_to_int8 - 0 20 20 _null_ _null_ ));
+DATA(insert ( 2110 float4pl float4pl - 0 700 700 _null_ _null_ ));
+DATA(insert ( 2111 float8pl float8pl - 0 701 701 _null_ _null_ ));
+DATA(insert ( 2112 cash_pl cash_pl - 0 790 790 _null_ _null_ ));
+DATA(insert ( 2113 interval_pl interval_pl - 0 1186 1186 _null_ _null_ ));
+DATA(insert ( 2114 numeric_add numeric_add - 0 1700 1700 _null_ _null_ ));
#endif
#ifdef PGXC
//DATA(insert ( 2107 int8_sum - 0 1700 _null_ ));
@@ -146,26 +156,26 @@ DATA(insert ( 2114 numeric_add numeric_add - 0 1700 _null_ "0" ));
/* max */
#ifdef PGXC
-DATA(insert ( 2115 int8larger int8larger - 413 20 _null_ _null_ ));
-DATA(insert ( 2116 int4larger int4larger - 521 23 _null_ _null_ ));
-DATA(insert ( 2117 int2larger int2larger - 520 21 _null_ _null_ ));
-DATA(insert ( 2118 oidlarger oidlarger - 610 26 _null_ _null_ ));
-DATA(insert ( 2119 float4larger float4larger - 623 700 _null_ _null_ ));
-DATA(insert ( 2120 float8larger float8larger - 674 701 _null_ _null_ ));
-DATA(insert ( 2121 int4larger int4larger - 563 702 _null_ _null_ ));
-DATA(insert ( 2122 date_larger date_larger - 1097 1082 _null_ _null_ ));
-DATA(insert ( 2123 time_larger time_larger - 1112 1083 _null_ _null_ ));
-DATA(insert ( 2124 timetz_larger timetz_larger - 1554 1266 _null_ _null_ ));
-DATA(insert ( 2125 cashlarger cashlarger - 903 790 _null_ _null_ ));
-DATA(insert ( 2126 timestamp_larger timestamp_larger - 2064 1114 _null_ _null_ ));
-DATA(insert ( 2127 timestamptz_larger timestamptz_larger - 1324 1184 _null_ _null_ ));
-DATA(insert ( 2128 interval_larger interval_larger - 1334 1186 _null_ _null_ ));
-DATA(insert ( 2129 text_larger text_larger - 666 25 _null_ _null_ ));
-DATA(insert ( 2130 numeric_larger numeric_larger - 1756 1700 _null_ _null_ ));
-DATA(insert ( 2050 array_larger array_larger - 1073 2277 _null_ _null_ ));
-DATA(insert ( 2244 bpchar_larger bpchar_larger - 1060 1042 _null_ _null_ ));
-DATA(insert ( 2797 tidlarger tidlarger - 2800 27 _null_ _null_ ));
-DATA(insert ( 3526 enum_larger enum_larger - 3519 3500 _null_ _null_ ));
+DATA(insert ( 2115 int8larger int8larger - 413 20 20 _null_ _null_ ));
+DATA(insert ( 2116 int4larger int4larger - 521 23 23 _null_ _null_ ));
+DATA(insert ( 2117 int2larger int2larger - 520 21 21 _null_ _null_ ));
+DATA(insert ( 2118 oidlarger oidlarger - 610 26 26 _null_ _null_ ));
+DATA(insert ( 2119 float4larger float4larger - 623 700 700 _null_ _null_ ));
+DATA(insert ( 2120 float8larger float8larger - 674 701 701 _null_ _null_ ));
+DATA(insert ( 2121 int4larger int4larger - 563 702 702 _null_ _null_ ));
+DATA(insert ( 2122 date_larger date_larger - 1097 1082 1082 _null_ _null_ ));
+DATA(insert ( 2123 time_larger time_larger - 1112 1083 1083 _null_ _null_ ));
+DATA(insert ( 2124 timetz_larger timetz_larger - 1554 1266 1266 _null_ _null_ ));
+DATA(insert ( 2125 cashlarger cashlarger - 903 790 790 _null_ _null_ ));
+DATA(insert ( 2126 timestamp_larger timestamp_larger - 2064 1114 1114 _null_ _null_ ));
+DATA(insert ( 2127 timestamptz_larger timestamptz_larger - 1324 1184 1184 _null_ _null_ ));
+DATA(insert ( 2128 interval_larger interval_larger - 1334 1186 1186 _null_ _null_ ));
+DATA(insert ( 2129 text_larger text_larger - 666 25 25 _null_ _null_ ));
+DATA(insert ( 2130 numeric_larger numeric_larger - 1756 1700 1700 _null_ _null_ ));
+DATA(insert ( 2050 array_larger array_larger - 1073 2277 2277 _null_ _null_ ));
+DATA(insert ( 2244 bpchar_larger bpchar_larger - 1060 1042 1042 _null_ _null_ ));
+DATA(insert ( 2797 tidlarger tidlarger - 2800 27 27 _null_ _null_ ));
+DATA(insert ( 3526 enum_larger enum_larger - 3519 3500 3500 _null_ _null_ ));
#endif
#ifdef PGXC
//DATA(insert ( 2115 int8larger - 413 20 _null_ ));
@@ -192,26 +202,26 @@ DATA(insert ( 3526 enum_larger enum_larger - 3519 3500 _null_ _null_ ));
/* min */
#ifdef PGXC
-DATA(insert ( 2131 int8smaller int8smaller - 412 20 _null_ _null_ ));
-DATA(insert ( 2132 int4smaller int4smaller - 97 23 _null_ _null_ ));
-DATA(insert ( 2133 int2smaller int2smaller - 95 21 _null_ _null_ ));
-DATA(insert ( 2134 oidsmaller oidsmaller - 609 26 _null_ _null_ ));
-DATA(insert ( 2135 float4smaller float4smaller - 622 700 _null_ _null_ ));
-DATA(insert ( 2136 float8smaller float8smaller - 672 701 _null_ _null_ ));
-DATA(insert ( 2137 int4smaller int4smaller - 562 702 _null_ _null_ ));
-DATA(insert ( 2138 date_smaller date_smaller - 1095 1082 _null_ _null_ ));
-DATA(insert ( 2139 time_smaller time_smaller - 1110 1083 _null_ _null_ ));
-DATA(insert ( 2140 timetz_smaller timetz_smaller - 1552 1266 _null_ _null_ ));
-DATA(insert ( 2141 cashsmaller cashsmaller - 902 790 _null_ _null_ ));
-DATA(insert ( 2142 timestamp_smaller timestamp_smaller - 2062 1114 _null_ _null_ ));
-DATA(insert ( 2143 timestamptz_smaller timestamptz_smaller - 1322 1184 _null_ _null_ ));
-DATA(insert ( 2144 interval_smaller interval_smaller - 1332 1186 _null_ _null_ ));
-DATA(insert ( 2145 text_smaller text_smaller - 664 25 _null_ _null_ ));
-DATA(insert ( 2146 numeric_smaller numeric_smaller - 1754 1700 _null_ _null_ ));
-DATA(insert ( 2051 array_smaller array_smaller - 1072 2277 _null_ _null_ ));
-DATA(insert ( 2245 bpchar_smaller bpchar_smaller - 1058 1042 _null_ _null_ ));
-DATA(insert ( 2798 tidsmaller tidsmaller - 2799 27 _null_ _null_ ));
-DATA(insert ( 3527 enum_smaller enum_smaller - 3518 3500 _null_ _null_ ));
+DATA(insert ( 2131 int8smaller int8smaller - 412 20 20 _null_ _null_ ));
+DATA(insert ( 2132 int4smaller int4smaller - 97 23 23 _null_ _null_ ));
+DATA(insert ( 2133 int2smaller int2smaller - 95 21 21 _null_ _null_ ));
+DATA(insert ( 2134 oidsmaller oidsmaller - 609 26 26 _null_ _null_ ));
+DATA(insert ( 2135 float4smaller float4smaller - 622 700 700 _null_ _null_ ));
+DATA(insert ( 2136 float8smaller float8smaller - 672 701 701 _null_ _null_ ));
+DATA(insert ( 2137 int4smaller int4smaller - 562 702 702 _null_ _null_ ));
+DATA(insert ( 2138 date_smaller date_smaller - 1095 1082 1082 _null_ _null_ ));
+DATA(insert ( 2139 time_smaller time_smaller - 1110 1083 1083 _null_ _null_ ));
+DATA(insert ( 2140 timetz_smaller timetz_smaller - 1552 1266 1266 _null_ _null_ ));
+DATA(insert ( 2141 cashsmaller cashsmaller - 902 790 790 _null_ _null_ ));
+DATA(insert ( 2142 timestamp_smaller timestamp_smaller - 2062 1114 1114 _null_ _null_ ));
+DATA(insert ( 2143 timestamptz_smaller timestamptz_smaller - 1322 1184 1184 _null_ _null_ ));
+DATA(insert ( 2144 interval_smaller interval_smaller - 1332 1186 1186 _null_ _null_ ));
+DATA(insert ( 2145 text_smaller text_smaller - 664 25 25 _null_ _null_ ));
+DATA(insert ( 2146 numeric_smaller numeric_smaller - 1754 1700 1700 _null_ _null_ ));
+DATA(insert ( 2051 array_smaller array_smaller - 1072 2277 2277 _null_ _null_ ));
+DATA(insert ( 2245 bpchar_smaller bpchar_smaller - 1058 1042 1042 _null_ _null_ ));
+DATA(insert ( 2798 tidsmaller tidsmaller - 2799 27 27 _null_ _null_ ));
+DATA(insert ( 3527 enum_smaller enum_smaller - 3518 3500 3500 _null_ _null_ ));
#endif
#ifdef PGXC
//DATA(insert ( 2131 int8smaller - 412 20 _null_ ));
@@ -239,8 +249,8 @@ DATA(insert ( 3527 enum_smaller enum_smaller - 3518 3500 _null_ _null_ ));
/* count */
/* Final function is data type conversion function numeric_int8 is referenced by OID because of ambiguous definition in pg_proc */
#ifdef PGXC
-DATA(insert ( 2147 int8inc_any int8_sum_to_int8 - 0 20 "0" "0" ));
-DATA(insert ( 2803 int8inc int8_sum_to_int8 - 0 20 "0" "0" ));
+DATA(insert ( 2147 int8inc_any int8_sum_to_int8 - 0 20 20 "0" _null_ ));
+DATA(insert ( 2803 int8inc int8_sum_to_int8 - 0 20 20 "0" _null_ ));
#endif
#ifdef PGXC
//DATA(insert ( 2147 int8inc_any - 0 20 "0" ));
@@ -249,12 +259,12 @@ DATA(insert ( 2803 int8inc int8_sum_to_int8 - 0 20 "0" "0" ));
/* var_pop */
#ifdef PGXC
-DATA(insert ( 2718 int8_accum numeric_collect numeric_var_pop 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2719 int4_accum numeric_collect numeric_var_pop 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2720 int2_accum numeric_collect numeric_var_pop 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2721 float4_accum float8_collect float8_var_pop 0 1022 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2722 float8_accum float8_collect float8_var_pop 0 1022 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2723 numeric_accum numeric_collect numeric_var_pop 0 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2718 int8_accum numeric_collect numeric_var_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2719 int4_accum numeric_collect numeric_var_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2720 int2_accum numeric_collect numeric_var_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2721 float4_accum float8_collect float8_var_pop 0 1022 1022 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2722 float8_accum float8_collect float8_var_pop 0 1022 1022 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2723 numeric_accum numeric_collect numeric_var_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
#endif
#ifdef PGXC
//DATA(insert ( 2718 int8_accum numeric_var_pop 0 1231 "{0,0,0}" ));
@@ -267,12 +277,12 @@ DATA(insert ( 2723 numeric_accum numeric_collect numeric_var_pop 0 1231 "{0,0,0
/* var_samp */
#ifdef PGXC
-DATA(insert ( 2641 int8_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2642 int4_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2643 int2_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2644 float4_accum float8_collect float8_var_samp 0 1022 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2645 float8_accum float8_collect float8_var_samp 0 1022 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2646 numeric_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2641 int8_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2642 int4_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2643 int2_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2644 float4_accum float8_collect float8_var_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2645 float8_accum float8_collect float8_var_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2646 numeric_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
#endif
#ifdef PGXC
//DATA(insert ( 2641 int8_accum numeric_var_samp 0 1231 "{0,0,0}" ));
@@ -285,12 +295,12 @@ DATA(insert ( 2646 numeric_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0
/* variance: historical Postgres syntax for var_samp */
#ifdef PGXC
-DATA(insert ( 2148 int8_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2149 int4_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2150 int2_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2151 float4_accum float8_collect float8_var_samp 0 1022 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2152 float8_accum float8_collect float8_var_samp 0 1022 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2153 numeric_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2148 int8_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2149 int4_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2150 int2_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2151 float4_accum float8_collect float8_var_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2152 float8_accum float8_collect float8_var_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2153 numeric_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
#endif
#ifdef PGXC
//DATA(insert ( 2148 int8_accum numeric_var_samp 0 1231 "{0,0,0}" ));
@@ -303,12 +313,12 @@ DATA(insert ( 2153 numeric_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0
/* stddev_pop */
#ifdef PGXC
-DATA(insert ( 2724 int8_accum numeric_collect numeric_stddev_pop 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2725 int4_accum numeric_collect numeric_stddev_pop 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2726 int2_accum numeric_collect numeric_stddev_pop 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2727 float4_accum float8_collect float8_stddev_pop 0 1022 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2728 float8_accum float8_collect float8_stddev_pop 0 1022 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2729 numeric_accum numeric_collect numeric_stddev_pop 0 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2724 int8_accum numeric_collect numeric_stddev_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2725 int4_accum numeric_collect numeric_stddev_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2726 int2_accum numeric_collect numeric_stddev_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2727 float4_accum float8_collect float8_stddev_pop 0 1022 1022 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2728 float8_accum float8_collect float8_stddev_pop 0 1022 1022 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2729 numeric_accum numeric_collect numeric_stddev_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
#endif
#ifdef PGXC
//DATA(insert ( 2724 int8_accum numeric_stddev_pop 0 1231 "{0,0,0}" ));
@@ -321,12 +331,12 @@ DATA(insert ( 2729 numeric_accum numeric_collect numeric_stddev_pop 0 1231 "{0,0
/* stddev_samp */
#ifdef PGXC
-DATA(insert ( 2712 int8_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2713 int4_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2714 int2_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2715 float4_accum float8_collect float8_stddev_samp 0 1022 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2716 float8_accum float8_collect float8_stddev_samp 0 1022 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2717 numeric_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2712 int8_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2713 int4_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2714 int2_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2715 float4_accum float8_collect float8_stddev_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2716 float8_accum float8_collect float8_stddev_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2717 numeric_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
#endif
#ifdef PGXC
//DATA(insert ( 2712 int8_accum numeric_stddev_samp 0 1231 "{0,0,0}" ));
@@ -339,12 +349,12 @@ DATA(insert ( 2717 numeric_accum numeric_collect numeric_stddev_samp 0 1231 "{0,
/* stddev: historical Postgres syntax for stddev_samp */
#ifdef PGXC
-DATA(insert ( 2154 int8_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2155 int4_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2156 int2_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2157 float4_accum float8_collect float8_stddev_samp 0 1022 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2158 float8_accum float8_collect float8_stddev_samp 0 1022 "{0,0,0}" "{0,0,0}" ));
-DATA(insert ( 2159 numeric_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2154 int8_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2155 int4_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2156 int2_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2157 float4_accum float8_collect float8_stddev_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2158 float8_accum float8_collect float8_stddev_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" ));
+DATA(insert ( 2159 numeric_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" ));
#endif
#ifdef PGXC
//DATA(insert ( 2154 int8_accum numeric_stddev_samp 0 1231 "{0,0,0}" ));
@@ -357,18 +367,18 @@ DATA(insert ( 2159 numeric_accum numeric_collect numeric_stddev_samp 0 1231 "{0,
/* SQL2003 binary regression aggregates */
#ifdef PGXC
-DATA(insert ( 2818 int8inc_float8_float8 int8_sum_to_int8 - 0 20 "0" _null_ ));
-DATA(insert ( 2819 float8_regr_accum float8_regr_collect float8_regr_sxx 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
-DATA(insert ( 2820 float8_regr_accum float8_regr_collect float8_regr_syy 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
-DATA(insert ( 2821 float8_regr_accum float8_regr_collect float8_regr_sxy 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
-DATA(insert ( 2822 float8_regr_accum float8_regr_collect float8_regr_avgx 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
-DATA(insert ( 2823 float8_regr_accum float8_regr_collect float8_regr_avgy 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
-DATA(insert ( 2824 float8_regr_accum float8_regr_collect float8_regr_r2 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
-DATA(insert ( 2825 float8_regr_accum float8_regr_collect float8_regr_slope 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
-DATA(insert ( 2826 float8_regr_accum float8_regr_collect float8_regr_intercept 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
-DATA(insert ( 2827 float8_regr_accum float8_regr_collect float8_covar_pop 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
-DATA(insert ( 2828 float8_regr_accum float8_regr_collect float8_covar_samp 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
-DATA(insert ( 2829 float8_regr_accum float8_regr_collect float8_corr 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
+DATA(insert ( 2818 int8inc_float8_float8 int8_sum_to_int8 - 0 20 20 "0" _null_ ));
+DATA(insert ( 2819 float8_regr_accum float8_regr_collect float8_regr_sxx 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
+DATA(insert ( 2820 float8_regr_accum float8_regr_collect float8_regr_syy 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
+DATA(insert ( 2821 float8_regr_accum float8_regr_collect float8_regr_sxy 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
+DATA(insert ( 2822 float8_regr_accum float8_regr_collect float8_regr_avgx 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
+DATA(insert ( 2823 float8_regr_accum float8_regr_collect float8_regr_avgy 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
+DATA(insert ( 2824 float8_regr_accum float8_regr_collect float8_regr_r2 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
+DATA(insert ( 2825 float8_regr_accum float8_regr_collect float8_regr_slope 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
+DATA(insert ( 2826 float8_regr_accum float8_regr_collect float8_regr_intercept 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
+DATA(insert ( 2827 float8_regr_accum float8_regr_collect float8_covar_pop 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
+DATA(insert ( 2828 float8_regr_accum float8_regr_collect float8_covar_samp 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
+DATA(insert ( 2829 float8_regr_accum float8_regr_collect float8_corr 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" ));
#endif
#ifdef PGXC
//DATA(insert ( 2818 int8inc_float8_float8 - 0 20 "0" ));
@@ -387,9 +397,9 @@ DATA(insert ( 2829 float8_regr_accum float8_regr_collect float8_corr 0 1022 "
/* boolean-and and boolean-or */
#ifdef PGXC
-DATA(insert ( 2517 booland_statefunc booland_statefunc - 58 16 _null_ _null_ ));
-DATA(insert ( 2518 boolor_statefunc boolor_statefunc - 59 16 _null_ _null_ ));
-DATA(insert ( 2519 booland_statefunc booland_statefunc - 58 16 _null_ _null_ ));
+DATA(insert ( 2517 booland_statefunc booland_statefunc - 58 16 16 _null_ _null_ ));
+DATA(insert ( 2518 boolor_statefunc boolor_statefunc - 59 16 16 _null_ _null_ ));
+DATA(insert ( 2519 booland_statefunc booland_statefunc - 58 16 16 _null_ _null_ ));
#endif
#ifdef PGXC
//DATA(insert ( 2517 booland_statefunc - 58 16 _null_ ));
@@ -399,14 +409,14 @@ DATA(insert ( 2519 booland_statefunc booland_statefunc - 58 16 _null_ _null_ )
/* bitwise integer */
#ifdef PGXC
-DATA(insert ( 2236 int2and int2and - 0 21 _null_ _null_ ));
-DATA(insert ( 2237 int2or int2or - 0 21 _null_ _null_ ));
-DATA(insert ( 2238 int4and int4and - 0 23 _null_ _null_ ));
-DATA(insert ( 2239 int4or int4or - 0 23 _null_ _null_ ));
-DATA(insert ( 2240 int8and int8and - 0 20 _null_ _null_ ));
-DATA(insert ( 2241 int8or int8or - 0 20 _null_ _null_ ));
-DATA(insert ( 2242 bitand bitand - 0 1560 _null_ _null_ ));
-DATA(insert ( 2243 bitor bitor - 0 1560 _null_ _null_ ));
+DATA(insert ( 2236 int2and int2and - 0 21 21 _null_ _null_ ));
+DATA(insert ( 2237 int2or int2or - 0 21 21 _null_ _null_ ));
+DATA(insert ( 2238 int4and int4and - 0 23 23 _null_ _null_ ));
+DATA(insert ( 2239 int4or int4or - 0 23 23 _null_ _null_ ));
+DATA(insert ( 2240 int8and int8and - 0 20 20 _null_ _null_ ));
+DATA(insert ( 2241 int8or int8or - 0 20 20 _null_ _null_ ));
+DATA(insert ( 2242 bitand bitand - 0 1560 1560 _null_ _null_ ));
+DATA(insert ( 2243 bitor bitor - 0 1560 1560 _null_ _null_ ));
#endif
#ifdef PGXC
//DATA(insert ( 2236 int2and - 0 21 _null_ ));
@@ -421,7 +431,7 @@ DATA(insert ( 2243 bitor bitor - 0 1560 _null_ _null_ ));
/* xml */
#ifdef PGXC
-DATA(insert ( 2901 xmlconcat2 xmlconcat2 - 0 142 _null_ _null_ ));
+DATA(insert ( 2901 xmlconcat2 - - 0 142 0 _null_ _null_ ));
#endif
#ifdef PGXC
//DATA(insert ( 2901 xmlconcat2 - 0 142 _null_ ));
@@ -429,7 +439,7 @@ DATA(insert ( 2901 xmlconcat2 xmlconcat2 - 0 142 _null_ _null_ ));
/* array */
#ifdef PGXC
-DATA(insert ( 2335 array_agg_transfn - array_agg_finalfn 0 2281 _null_ _null_ ));
+DATA(insert ( 2335 array_agg_transfn - array_agg_finalfn 0 2281 0 _null_ _null_ ));
#endif
#ifdef PGXC
//DATA(insert ( 2335 array_agg_transfn array_agg_finalfn 0 2281 _null_ ));
@@ -437,15 +447,18 @@ DATA(insert ( 2335 array_agg_transfn - array_agg_finalfn 0 2281 _null_ _null_ )
/* text */
#ifdef PGXC
-DATA(insert ( 3538 string_agg_transfn - string_agg_finalfn 0 2281 _null_ _null_ ));
+DATA(insert (3538 string_agg_transfn - string_agg_finalfn 0 2281 0 _null_ _null_ ));
+// XXX function string_agg_delim_transfn is not defined?
+//DATA(insert (3538 string_agg_delim_transfn - string_agg_finalfn 0 2281 0 _null_ _null_ ));
#endif
#ifdef PGXC
-//DATA(insert ( 3538 string_agg_transfn string_agg_finalfn 0 2281 _null_ ));
+//DATA(insert (3535 string_agg_transfn string_agg_finalfn 0 2281 _null_ ));
+//DATA(insert (3538 string_agg_delim_transfn string_agg_finalfn 0 2281 _null_ ));
#endif
/* bytea */
#ifdef PGXC
-DATA(insert ( 3545 bytea_string_agg_transfn - bytea_string_agg_finalfn 0 2281 _null_ _null_ ));
+DATA(insert ( 3545 bytea_string_agg_transfn - bytea_string_agg_finalfn 0 2281 0 _null_ _null_ ));
#endif
#ifdef PGXC
//DATA(insert ( 3545 bytea_string_agg_transfn bytea_string_agg_finalfn 0 2281 _null_ ));
@@ -465,6 +478,9 @@ extern void AggregateCreate(const char *aggName,
List *aggfinalfnName,
List *aggsortopName,
Oid aggTransType,
+#ifdef XCP
+ Oid aggCollectType,
+#endif
#ifdef PGXC
const char *agginitval,
const char *agginitcollect);
diff --git a/src/include/catalog/pg_namespace.h b/src/include/catalog/pg_namespace.h
index e253921278..2e63e8dd6b 100644
--- a/src/include/catalog/pg_namespace.h
+++ b/src/include/catalog/pg_namespace.h
@@ -5,6 +5,11 @@
* along with the relation's initial contents.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -75,6 +80,11 @@ DESCR("reserved schema for TOAST tables");
DATA(insert OID = 2200 ( "public" PGUID _null_ ));
DESCR("standard public schema");
#define PG_PUBLIC_NAMESPACE 2200
+#ifdef XCP
+DATA(insert OID = 9 ( "storm_catalog" PGUID _null_ ));
+DESCR("StormDB catalog schema");
+#define STORM_CATALOG_NAMESPACE 9
+#endif
/*
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index cda3efa91a..d434303e62 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4,6 +4,11 @@
* definition of the system "procedure" relation (pg_proc)
* along with the relation's initial contents.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -226,10 +231,6 @@ DATA(insert OID = 1258 ( textcat PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 2
DATA(insert OID = 84 ( boolne PGNSP PGUID 12 1 0 0 0 f f f t t f i 2 0 16 "16 16" _null_ _null_ _null_ _null_ boolne _null_ _null_ _null_ ));
DATA(insert OID = 89 ( version PGNSP PGUID 12 1 0 0 0 f f f f t f s 0 0 25 "" _null_ _null_ _null_ _null_ pgsql_version _null_ _null_ _null_ ));
DESCR("PostgreSQL version string");
-#ifdef PGXC
-DATA(insert OID = 90 ( pgxc_version PGNSP PGUID 12 1 0 0 0 f f f f t f s 0 0 25 "" _null_ _null_ _null_ _null_ pgxc_version _null_ _null_ _null_ ));
-DESCR("Postgres-XC version string");
-#endif
/* OIDS 100 - 199 */
@@ -4670,6 +4671,12 @@ DATA(insert OID = 3202 ( pgxc_node_str PGNSP PGUID 12 1 0 0 0 f f f f t f s 0 0
DESCR("get the name of the node");
DATA(insert OID = 3203 ( pgxc_is_committed PGNSP PGUID 12 1 1 0 0 f f f f t t s 1 0 16 "28" _null_ _null_ _null_ _null_ pgxc_is_committed _null_ _null_ _null_ ));
DESCR("is given GXID committed or aborted?");
+DATA(insert OID = 3205 ( pgxc_lock_for_backup PGNSP PGUID 12 1 0 0 0 f f f f t f v 0 0 16 "" _null_ _null_ _null_ _null_ pgxc_lock_for_backup _null_ _null_ _null_ ));
+DESCR("lock the cluster for taking backup");
+#ifdef XCP
+DATA(insert OID = 3204 ( stormdb_promote_standby PGNSP PGUID 12 1 0 0 0 f f f f t f v 0 0 2278 "" _null_ _null_ _null_ _null_ stormdb_promote_standby _null_ _null_ _null_ ));
+DESCR("touch trigger file on a standby machine to end replication");
+#endif
#endif
/*
diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h
index 25c664b7c9..f87ec04655 100644
--- a/src/include/catalog/pg_type.h
+++ b/src/include/catalog/pg_type.h
@@ -5,6 +5,11 @@
* along with the relation's initial contents.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -573,6 +578,9 @@ DATA(insert OID = 2211 ( _regtype PGNSP PGUID -1 f b A f t \054 0 2206 0 arra
/* uuid */
DATA(insert OID = 2950 ( uuid PGNSP PGUID 16 f b U f t \054 0 0 2951 uuid_in uuid_out uuid_recv uuid_send - - - c p f 0 -1 0 0 _null_ _null_ _null_ ));
DESCR("UUID datatype");
+#ifdef XCP
+#define UUIDOID 2950
+#endif
DATA(insert OID = 2951 ( _uuid PGNSP PGUID -1 f b A f t \054 0 2950 0 array_in array_out array_recv array_send - - array_typanalyze i x f 0 -1 0 0 _null_ _null_ _null_ ));
/* text search */
diff --git a/src/include/commands/sequence.h b/src/include/commands/sequence.h
index 44c1ae0c8b..a917a06a87 100644
--- a/src/include/commands/sequence.h
+++ b/src/include/commands/sequence.h
@@ -3,6 +3,11 @@
* sequence.h
* prototypes for sequence.c.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -83,6 +88,10 @@ extern void ResetSequence(Oid seq_relid);
extern void seq_redo(XLogRecPtr lsn, XLogRecord *rptr);
extern void seq_desc(StringInfo buf, uint8 xl_info, char *rec);
+#ifdef XCP
+#define DEFAULT_CACHEVAL 1
+extern int SequenceRangeVal;
+#endif
#ifdef PGXC
/*
* List of actions that registered the callback.
diff --git a/src/include/commands/tablecmds.h b/src/include/commands/tablecmds.h
index 7d25bf3a31..e262a1d8d3 100644
--- a/src/include/commands/tablecmds.h
+++ b/src/include/commands/tablecmds.h
@@ -42,11 +42,7 @@ extern void AlterRelationNamespaceInternal(Relation classRel, Oid relOid,
extern void CheckTableNotInUse(Relation rel, const char *stmt);
-#ifdef PGXC
-extern void ExecuteTruncate(TruncateStmt *stmt, const char *sql_statement);
-#else
extern void ExecuteTruncate(TruncateStmt *stmt);
-#endif
extern void SetRelationHasSubclass(Oid relationId, bool relhassubclass);
diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h
index 8e5499e4c5..1f7ba47466 100644
--- a/src/include/commands/trigger.h
+++ b/src/include/commands/trigger.h
@@ -210,9 +210,4 @@ extern int RI_FKey_trigger_type(Oid tgfoid);
extern Datum pg_trigger_depth(PG_FUNCTION_ARGS);
-#ifdef PGXC
-/* Postgres-XC related functions for triggers */
-extern bool pgxc_check_triggers_shippability(Oid relid, CmdType commandType);
-#endif
-
#endif /* TRIGGER_H */
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index 7a50d2fcb3..7d16edaac5 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -4,6 +4,11 @@
* header file for postgres vacuum cleaner and statistics analyzer
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -161,6 +166,10 @@ extern void vacuum_set_xid_limits(int freeze_min_age, int freeze_table_age,
TransactionId *freezeTableLimit);
extern void vac_update_datfrozenxid(void);
extern void vacuum_delay_point(void);
+#ifdef XCP
+extern void vacuum_rel_coordinator(Relation onerel);
+TargetEntry *make_relation_tle(Oid reloid, const char *relname, const char *column);
+#endif
/* in commands/vacuumlazy.c */
extern void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
diff --git a/src/include/commands/variable.h b/src/include/commands/variable.h
index ebf7757327..14540a7561 100644
--- a/src/include/commands/variable.h
+++ b/src/include/commands/variable.h
@@ -2,6 +2,11 @@
* variable.h
* Routines for handling specialized SET variables.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -33,6 +38,10 @@ extern bool check_client_encoding(char **newval, void **extra, GucSource source)
extern void assign_client_encoding(const char *newval, void *extra);
extern bool check_session_authorization(char **newval, void **extra, GucSource source);
extern void assign_session_authorization(const char *newval, void *extra);
+#ifdef XCP
+extern bool check_global_session(char **newval, void **extra, GucSource source);
+extern void assign_global_session(const char *newval, void *extra);
+#endif
extern bool check_role(char **newval, void **extra, GucSource source);
extern void assign_role(const char *newval, void *extra);
extern const char *show_role(void);
diff --git a/src/include/executor/execdesc.h b/src/include/executor/execdesc.h
index 48d01df1b6..b6c2400ffd 100644
--- a/src/include/executor/execdesc.h
+++ b/src/include/executor/execdesc.h
@@ -5,6 +5,11 @@
* and related modules.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -16,6 +21,9 @@
#define EXECDESC_H
#include "nodes/execnodes.h"
+#ifdef XCP
+#include "pgxc/squeue.h"
+#endif
#include "tcop/dest.h"
@@ -48,6 +56,14 @@ typedef struct QueryDesc
EState *estate; /* executor's query-wide state */
PlanState *planstate; /* tree of per-plan-node state */
+#ifdef XCP
+ SharedQueue squeue; /* the shared memory queue to sent data to other
+ * nodes */
+ int myindex; /* -1 if locally executed subplan is producing
+ * data and distribute via squeue. Otherwise
+ * get local data from squeue */
+#endif
+
/* This is always set NULL by the core system, but plugins can change it */
struct Instrumentation *totaltime; /* total time spent in ExecutorRun */
} QueryDesc;
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 6b9b28fa31..29e8edcc55 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -4,6 +4,11 @@
* support for the POSTGRES executor module
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -61,6 +66,10 @@
#define EXEC_FLAG_SKIP_TRIGGERS 0x0010 /* skip AfterTrigger calls */
#define EXEC_FLAG_WITH_OIDS 0x0020 /* force OIDs in returned tuples */
#define EXEC_FLAG_WITHOUT_OIDS 0x0040 /* force no OIDs in returned tuples */
+#ifdef XCP
+/* distributed executor may never execute the plan on this node */
+#define EXEC_FLAG_SUBPLAN 0x0080
+#endif
/*
@@ -219,6 +228,9 @@ extern void EvalPlanQualEnd(EPQState *epqstate);
* prototypes from functions in execProcnode.c
*/
extern PlanState *ExecInitNode(Plan *node, EState *estate, int eflags);
+#ifdef XCP
+extern void ExecFinishInitProcNode(PlanState *node);
+#endif
extern TupleTableSlot *ExecProcNode(PlanState *node);
extern Node *MultiExecProcNode(PlanState *node);
extern void ExecEndNode(PlanState *node);
diff --git a/src/include/executor/producerReceiver.h b/src/include/executor/producerReceiver.h
new file mode 100644
index 0000000000..1efd957863
--- /dev/null
+++ b/src/include/executor/producerReceiver.h
@@ -0,0 +1,33 @@
+/*-------------------------------------------------------------------------
+ *
+ * producerReceiver.h
+ * prototypes for producerReceiver.c
+ *
+ *
+ * Copyright (c) 2012-2014, TransLattice, Inc.
+ *
+ * src/include/executor/producerReceiver.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef PRODUCER_RECEIVER_H
+#define PRODUCER_RECEIVER_H
+
+#include "tcop/dest.h"
+#include "pgxc/locator.h"
+#include "pgxc/squeue.h"
+
+
+extern DestReceiver *CreateProducerDestReceiver(void);
+
+extern void SetProducerDestReceiverParams(DestReceiver *self,
+ AttrNumber distKey,
+ Locator *locator,
+ SharedQueue squeue);
+extern DestReceiver *SetSelfConsumerDestReceiver(DestReceiver *self,
+ DestReceiver *consumer);
+extern void SetProducerTempMemory(DestReceiver *self, MemoryContext tmpcxt);
+extern bool ProducerReceiverPushBuffers(DestReceiver *self);
+
+#endif /* PRODUCER_RECEIVER_H */
diff --git a/src/include/executor/tuptable.h b/src/include/executor/tuptable.h
index 6ea58632fd..693037d1e4 100644
--- a/src/include/executor/tuptable.h
+++ b/src/include/executor/tuptable.h
@@ -4,6 +4,11 @@
* tuple table support stuff
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -118,11 +123,16 @@ typedef struct TupleTableSlot
bool tts_slow; /* saved state for slot_deform_tuple */
HeapTuple tts_tuple; /* physical tuple, or NULL if virtual */
#ifdef PGXC
+#ifdef XCP
+ RemoteDataRow tts_datarow; /* Tuple data in DataRow format */
+ MemoryContext tts_drowcxt; /* Context to store deformed */
+#else
/*
* PGXC extension to support tuples sent from remote Datanode.
*/
char *tts_dataRow; /* Tuple data in DataRow format */
int tts_dataLen; /* Actual length of the data row */
+#endif
bool tts_shouldFreeRow; /* should pfree tts_dataRow? */
struct AttInMetadata *tts_attinmeta; /* store here info to extract values from the DataRow */
#endif
@@ -161,16 +171,30 @@ extern TupleTableSlot *ExecStoreMinimalTuple(MinimalTuple mtup,
TupleTableSlot *slot,
bool shouldFree);
#ifdef PGXC
+#ifdef XCP
+extern TupleTableSlot *ExecStoreDataRowTuple(RemoteDataRow datarow,
+ TupleTableSlot *slot,
+ bool shouldFree);
+#else
extern TupleTableSlot *ExecStoreDataRowTuple(char *msg,
size_t len,
TupleTableSlot *slot,
bool shouldFree);
#endif
+#endif
extern TupleTableSlot *ExecClearTuple(TupleTableSlot *slot);
extern TupleTableSlot *ExecStoreVirtualTuple(TupleTableSlot *slot);
extern TupleTableSlot *ExecStoreAllNullTuple(TupleTableSlot *slot);
extern HeapTuple ExecCopySlotTuple(TupleTableSlot *slot);
extern MinimalTuple ExecCopySlotMinimalTuple(TupleTableSlot *slot);
+#ifdef PGXC
+#ifdef XCP
+extern RemoteDataRow ExecCopySlotDatarow(TupleTableSlot *slot,
+ MemoryContext tmpcxt);
+#else
+extern int ExecCopySlotDatarow(TupleTableSlot *slot, char **datarow);
+#endif
+#endif
extern HeapTuple ExecFetchSlotTuple(TupleTableSlot *slot);
extern MinimalTuple ExecFetchSlotMinimalTuple(TupleTableSlot *slot);
extern Datum ExecFetchSlotTupleDatum(TupleTableSlot *slot);
diff --git a/src/include/gtm/gtm.h b/src/include/gtm/gtm.h
index 293d67119b..8affafd358 100644
--- a/src/include/gtm/gtm.h
+++ b/src/include/gtm/gtm.h
@@ -3,6 +3,11 @@
* gtm.h
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -92,6 +97,10 @@ void GTM_DoForAllOtherThreads(void (* process_routine)(GTM_ThreadInfo *));
GTM_ThreadInfo *GTM_ThreadCreate(GTM_ConnectionInfo *conninfo,
void *(* startroutine)(void *));
GTM_ThreadInfo * GTM_GetThreadInfo(GTM_ThreadID thrid);
+#ifdef XCP
+extern void SaveControlInfo(void);
+#define CONTROL_INTERVAL 1000
+#endif
/*
* pthread keys to get thread specific information
diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h
index f34f4dbb35..b43f0edaa0 100644
--- a/src/include/gtm/gtm_c.h
+++ b/src/include/gtm/gtm_c.h
@@ -98,7 +98,7 @@ typedef GTM_SequenceKeyData *GTM_SequenceKey;
#define InvalidSequenceValue 0x7fffffffffffffffLL
#define SEQVAL_IS_VALID(v) ((v) != InvalidSequenceValue)
-#define GTM_MAX_GLOBAL_TRANSACTIONS 4096
+#define GTM_MAX_GLOBAL_TRANSACTIONS 16384
typedef enum GTM_IsolationLevel
{
diff --git a/src/include/gtm/gtm_client.h b/src/include/gtm/gtm_client.h
index e50701a7a7..9d7e500480 100644
--- a/src/include/gtm/gtm_client.h
+++ b/src/include/gtm/gtm_client.h
@@ -3,6 +3,11 @@
* gtm_client.h
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -53,12 +58,15 @@ typedef union GTM_ResultData
{
GTM_SequenceKeyData seqkey;
GTM_Sequence seqval;
- } grd_seq; /* SEQUENCE_GET_NEXT */
-
+#ifdef XCP
+ GTM_Sequence rangemax;
+#endif
+ } grd_seq; /* SEQUENCE_GET_CURRENT
+ * SEQUENCE_GET_NEXT */
struct
{
- int seq_count;
- GTM_SeqInfo **seq;
+ int seq_count;
+ GTM_SeqInfo *seq;
} grd_seq_list; /* SEQUENCE_GET_LIST */
struct
@@ -160,7 +168,7 @@ int end_replication_initial_sync(GTM_Conn *);
size_t get_node_list(GTM_Conn *, GTM_PGXCNodeInfo *, size_t);
GlobalTransactionId get_next_gxid(GTM_Conn *);
uint32 get_txn_gxid_list(GTM_Conn *, GTM_Transactions *);
-size_t get_sequence_list(GTM_Conn *, GTM_SeqInfo **, size_t);
+size_t get_sequence_list(GTM_Conn *, GTM_SeqInfo **);
/*
* Transaction Management API
@@ -248,6 +256,10 @@ int node_unregister(GTM_Conn *conn, GTM_PGXCNodeType type, const char *node_name
int bkup_node_unregister(GTM_Conn *conn, GTM_PGXCNodeType type, const char * node_name);
int backend_disconnect(GTM_Conn *conn, bool is_postmaster, GTM_PGXCNodeType type, char *node_name);
char *node_get_local_addr(GTM_Conn *conn, char *buf, size_t buflen, int *rc);
+#ifdef XCP
+int register_session(GTM_Conn *conn, const char *coord_name, int coord_procid,
+ int coord_backendid);
+#endif
/*
* Sequence Management API
@@ -268,10 +280,26 @@ int close_sequence(GTM_Conn *conn, GTM_SequenceKey key);
int bkup_close_sequence(GTM_Conn *conn, GTM_SequenceKey key);
int rename_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_SequenceKey newkey);
int bkup_rename_sequence(GTM_Conn *conn, GTM_SequenceKey key, GTM_SequenceKey newkey);
+#ifdef XCP
+int get_current(GTM_Conn *conn, GTM_SequenceKey key,
+ char *coord_name, int coord_procid, GTM_Sequence *result);
+int get_next(GTM_Conn *conn, GTM_SequenceKey key,
+ char *coord_name, int coord_procid,
+ GTM_Sequence range, GTM_Sequence *result, GTM_Sequence *rangemax);
+int bkup_get_next(GTM_Conn *conn, GTM_SequenceKey key,
+ char *coord_name, int coord_procid,
+ GTM_Sequence range, GTM_Sequence *result, GTM_Sequence *rangemax);
+int set_val(GTM_Conn *conn, GTM_SequenceKey key, char *coord_name,
+ int coord_procid, GTM_Sequence nextval, bool iscalled);
+int bkup_set_val(GTM_Conn *conn, GTM_SequenceKey key, char *coord_name,
+ int coord_procid, GTM_Sequence nextval, bool iscalled);
+#else
+GTM_Sequence get_current(GTM_Conn *conn, GTM_SequenceKey key);
GTM_Sequence get_next(GTM_Conn *conn, GTM_SequenceKey key);
GTM_Sequence bkup_get_next(GTM_Conn *conn, GTM_SequenceKey key);
int set_val(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence nextval, bool is_called);
int bkup_set_val(GTM_Conn *conn, GTM_SequenceKey key, GTM_Sequence nextval, bool is_called);
+#endif
int reset_sequence(GTM_Conn *conn, GTM_SequenceKey key);
int bkup_reset_sequence(GTM_Conn *conn, GTM_SequenceKey key);
diff --git a/src/include/gtm/gtm_msg.h b/src/include/gtm/gtm_msg.h
index 64f27bfeeb..560c4428f6 100644
--- a/src/include/gtm/gtm_msg.h
+++ b/src/include/gtm/gtm_msg.h
@@ -3,6 +3,11 @@
* gtm_msg.h
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -27,6 +32,9 @@ typedef enum GTM_MessageType
MSG_BKUP_NODE_REGISTER, /* Backup of MSG_NODE_REGISTER */
MSG_NODE_UNREGISTER, /* Unregister a PGXC Node with GTM */
MSG_BKUP_NODE_UNREGISTER, /* Backup of MSG_NODE_UNREGISTER */
+#ifdef XCP
+ MSG_REGISTER_SESSION, /* Register distributed session with GTM */
+#endif
MSG_NODE_LIST, /* Get node list */
MSG_NODE_BEGIN_REPLICATION_INIT,
MSG_NODE_END_REPLICATION_INIT,
@@ -62,6 +70,7 @@ typedef enum GTM_MessageType
MSG_SNAPSHOT_GXID_GET, /* Get GXID and snapshot together */
MSG_SEQUENCE_INIT, /* Initialize a new global sequence */
MSG_BKUP_SEQUENCE_INIT, /* Backup of MSG_SEQUENCE_INIT */
+ MSG_SEQUENCE_GET_CURRENT,/* Get the current value of sequence */
MSG_SEQUENCE_GET_NEXT, /* Get the next sequence value of sequence */
MSG_BKUP_SEQUENCE_GET_NEXT, /* Backup of MSG_SEQUENCE_GET_NEXT */
MSG_SEQUENCE_GET_LAST, /* Get the last sequence value of sequence */
@@ -99,6 +108,9 @@ typedef enum GTM_ResultType
SYNC_STANDBY_RESULT,
NODE_REGISTER_RESULT,
NODE_UNREGISTER_RESULT,
+#ifdef XCP
+ REGISTER_SESSION_RESULT,
+#endif
NODE_LIST_RESULT,
NODE_BEGIN_REPLICATION_INIT_RESULT,
NODE_END_REPLICATION_INIT_RESULT,
@@ -122,6 +134,7 @@ typedef enum GTM_ResultType
SNAPSHOT_GET_MULTI_RESULT,
SNAPSHOT_GXID_GET_RESULT,
SEQUENCE_INIT_RESULT,
+ SEQUENCE_GET_CURRENT_RESULT,
SEQUENCE_GET_NEXT_RESULT,
SEQUENCE_GET_LAST_RESULT,
SEQUENCE_SET_VAL_RESULT,
diff --git a/src/include/gtm/gtm_seq.h b/src/include/gtm/gtm_seq.h
index af92e6d873..c849dbc884 100644
--- a/src/include/gtm/gtm_seq.h
+++ b/src/include/gtm/gtm_seq.h
@@ -3,6 +3,11 @@
* gtm_seq.h
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -20,12 +25,29 @@
/* Global sequence related structures */
+
+#ifdef XCP
+typedef struct GTM_SeqLastVal
+{
+ char gs_coord_name[SP_NODE_NAME];
+ int32 gs_coord_procid;
+ GTM_Sequence gs_last_value;
+} GTM_SeqLastVal;
+#endif
+
+
typedef struct GTM_SeqInfo
{
GTM_SequenceKey gs_key;
GTM_Sequence gs_value;
GTM_Sequence gs_init_value;
+#ifdef XCP
+ int32 gs_max_lastvals;
+ int32 gs_lastval_count;
+ GTM_SeqLastVal *gs_last_values;
+#else
GTM_Sequence gs_last_value;
+#endif
GTM_Sequence gs_increment_by;
GTM_Sequence gs_min_value;
GTM_Sequence gs_max_value;
@@ -70,12 +92,24 @@ int GTM_SeqAlter(GTM_SequenceKey seqkey,
bool is_restart);
int GTM_SeqClose(GTM_SequenceKey seqkey);
int GTM_SeqRename(GTM_SequenceKey seqkey, GTM_SequenceKey newseqkey);
+#ifdef XCP
+int GTM_SeqGetNext(GTM_SequenceKey seqkey, char *coord_name,
+ int coord_procid, GTM_Sequence range,
+ GTM_Sequence *result, GTM_Sequence *rangemax);
+void GTM_SeqGetCurrent(GTM_SequenceKey seqkey, char *coord_name,
+ int coord_procid, GTM_Sequence *result);
+int GTM_SeqSetVal(GTM_SequenceKey seqkey, char *coord_name,
+ int coord_procid, GTM_Sequence nextval, bool iscalled);
+#else
GTM_Sequence GTM_SeqGetNext(GTM_SequenceKey seqkey);
+GTM_Sequence GTM_SeqGetCurrent(GTM_SequenceKey seqkey);
int GTM_SeqSetVal(GTM_SequenceKey seqkey, GTM_Sequence nextval, bool iscalled);
+#endif
int GTM_SeqReset(GTM_SequenceKey seqkey);
void ProcessSequenceInitCommand(Port *myport, StringInfo message, bool is_backup);
+void ProcessSequenceGetCurrentCommand(Port *myport, StringInfo message);
void ProcessSequenceGetNextCommand(Port *myport, StringInfo message, bool is_backup);
void ProcessSequenceSetValCommand(Port *myport, StringInfo message, bool is_backup);
void ProcessSequenceResetCommand(Port *myport, StringInfo message, bool is_backup);
@@ -97,4 +131,8 @@ int GTM_SeqRestore(GTM_SequenceKey seqkey,
bool cycle,
bool called);
+#ifdef XCP
+void GTM_CleanupSeqSession(char *coord_name, int coord_procid);
+#endif
+
#endif
diff --git a/src/include/gtm/gtm_serialize.h b/src/include/gtm/gtm_serialize.h
index 1c31299281..2cabeb1a5e 100644
--- a/src/include/gtm/gtm_serialize.h
+++ b/src/include/gtm/gtm_serialize.h
@@ -36,11 +36,15 @@ size_t gtm_deserialize_transactions(GTM_Transactions *, const char *, size_t);
size_t gtm_get_pgxcnodeinfo_size(GTM_PGXCNodeInfo *);
size_t gtm_serialize_pgxcnodeinfo(GTM_PGXCNodeInfo *, char *, size_t);
+#ifdef XCP
+size_t gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *, const char *, size_t, PQExpBuffer *);
+#else
size_t gtm_deserialize_pgxcnodeinfo(GTM_PGXCNodeInfo *, const char *, size_t);
+#endif
size_t gtm_get_sequence_size(GTM_SeqInfo *);
size_t gtm_serialize_sequence(GTM_SeqInfo *, char *, size_t);
-GTM_SeqInfo *gtm_deserialize_sequence(const char *, size_t);
+size_t gtm_deserialize_sequence(GTM_SeqInfo *seq, const char *, size_t);
void dump_transactions_elog(GTM_Transactions *, int);
void dump_transactioninfo_elog(GTM_TransactionInfo *);
diff --git a/src/include/gtm/gtm_standby.h b/src/include/gtm/gtm_standby.h
index 448fc49fbc..e9fa57f6bf 100644
--- a/src/include/gtm/gtm_standby.h
+++ b/src/include/gtm/gtm_standby.h
@@ -51,9 +51,8 @@ void gtm_standby_closeActiveConn(void);
void gtm_standby_finishActiveConn(void);
-/* Functions to process backup */
-void ProcessGTMBeginBackup(Port *myport, StringInfo message);
-void ProcessGTMEndBackup(Port *myport, StringInfo message);
+
+
/*
* Startup mode
diff --git a/src/include/gtm/gtm_txn.h b/src/include/gtm/gtm_txn.h
index 942e46ce0e..57a97eb1c9 100644
--- a/src/include/gtm/gtm_txn.h
+++ b/src/include/gtm/gtm_txn.h
@@ -169,11 +169,7 @@ typedef struct GTM_Transactions
extern GTM_Transactions GTMTransactions;
-/*
- * This macro should be used with READ lock held on gt_TransArrayLock as the
- * number of open transactions might change when counting open transactions
- * if a lock is not hold.
- */
+/* NOTE: This macro should be used with READ lock held on gt_TransArrayLock! */
#define GTM_CountOpenTransactions() (gtm_list_length(GTMTransactions.gt_open_transactions))
/*
diff --git a/src/include/gtm/register.h b/src/include/gtm/register.h
index 4d0e99f068..b9cc089952 100644
--- a/src/include/gtm/register.h
+++ b/src/include/gtm/register.h
@@ -3,6 +3,11 @@
* register.h
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -17,7 +22,6 @@
#include "gtm/libpq-be.h"
#include "gtm/gtm_c.h"
#include "gtm/gtm_lock.h"
-#include "gtm/gtm_list.h"
#include "gtm/stringinfo.h"
/*
@@ -39,6 +43,14 @@ typedef enum GTM_PGXCNodeStatus
NODE_DISCONNECTED
} GTM_PGXCNodeStatus;
+#ifdef XCP
+typedef struct GTM_PGXCSession
+{
+ int gps_coord_proc_id;
+ int gps_coord_backend_id;
+} GTM_PGXCSession;
+#endif
+
typedef struct GTM_PGXCNodeInfo
{
GTM_PGXCNodeType type; /* Type of node */
@@ -48,10 +60,16 @@ typedef struct GTM_PGXCNodeInfo
char *ipaddress; /* IP address of the nodes */
char *datafolder; /* Data folder of the node */
GTM_PGXCNodeStatus status; /* Node status */
+#ifdef XCP
+ int max_sessions;
+ int num_sessions;
+ GTM_PGXCSession *sessions;
+#endif
GTM_RWLock node_lock; /* Lock on this structure */
int socket; /* socket number used for registration */
} GTM_PGXCNodeInfo;
+
/* Maximum number of nodes that can be registered */
#define MAX_NODES 1024
@@ -78,6 +96,11 @@ void Recovery_RestoreRegisterInfo(void);
void Recovery_SaveRegisterInfo(void);
void Recovery_PGXCNodeDisconnect(Port *myport);
void Recovery_SaveRegisterFileName(char *dir);
+#ifdef XCP
+int Recovery_PGXCNodeRegisterCoordProcess(char *coord_node, int coord_procid,
+ int coord_backendid);
+void ProcessPGXCRegisterSession(Port *myport, StringInfo message);
+#endif
void ProcessPGXCNodeRegister(Port *myport, StringInfo message, bool is_backup);
void ProcessPGXCNodeUnregister(Port *myport, StringInfo message, bool is_backup);
diff --git a/src/include/libpq/hba.h b/src/include/libpq/hba.h
index f3b8be6a0c..12a526e691 100644
--- a/src/include/libpq/hba.h
+++ b/src/include/libpq/hba.h
@@ -93,4 +93,7 @@ extern int check_usermap(const char *usermap_name,
bool case_sensitive);
extern bool pg_isblank(const char c);
+#ifdef XCP
+extern List* get_parsed_hba(void);
+#endif
#endif /* HBA_H */
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 5ff0856765..f54522308c 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -10,6 +10,11 @@
* Over time, this has also become the preferred place for widely known
* resource-limitation stuff, such as work_mem and check_stack_depth().
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -300,6 +305,10 @@ extern void SetUserIdAndContext(Oid userid, bool sec_def_context);
extern void InitializeSessionUserId(const char *rolename);
extern void InitializeSessionUserIdStandalone(void);
extern void SetSessionAuthorization(Oid userid, bool is_superuser);
+#ifdef XCP
+extern void SetGlobalSession(Oid coordid, int coordpid);
+extern char *GetClusterUserName(void);
+#endif
extern Oid GetCurrentRoleId(void);
extern void SetCurrentRoleId(Oid roleid, bool is_superuser);
@@ -345,7 +354,6 @@ typedef enum ProcessingMode
extern ProcessingMode Mode;
-
#define IsBootstrapProcessingMode() ((bool)(Mode == BootstrapProcessing))
#define IsInitProcessingMode() ((bool)(Mode == InitProcessing))
#define IsNormalProcessingMode() ((bool)(Mode == NormalProcessing))
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index ef1aa2743d..93b5380051 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -4,6 +4,11 @@
* definitions for executor state nodes
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -22,6 +27,9 @@
#include "utils/reltrigger.h"
#include "utils/sortsupport.h"
#include "utils/tuplestore.h"
+#ifdef XCP
+#include "pgxc/squeue.h"
+#endif
/* ----------------
@@ -348,9 +356,11 @@ typedef struct EState
ResultRelInfo *es_result_relations; /* array of ResultRelInfos */
int es_num_result_relations; /* length of array */
ResultRelInfo *es_result_relation_info; /* currently active array elt */
-#ifdef PGXC
+#ifdef PGXC
+#ifndef PGXC
struct PlanState *es_result_remoterel; /* currently active remote rel */
-#endif
+#endif
+#endif
/* Stuff used for firing triggers: */
List *es_trig_target_relations; /* trigger-only ResultRelInfos */
@@ -1061,9 +1071,9 @@ typedef struct ModifyTableState
bool canSetTag; /* do we set the command tag/es_processed? */
bool mt_done; /* are we done? */
PlanState **mt_plans; /* subplans (one per target rel) */
-#ifdef PGXC
+#ifdef PGXC
PlanState **mt_remoterels; /* per-target remote query node */
-#endif
+#endif
int mt_nplans; /* number of plans in the array */
int mt_whichplan; /* which one is being executed (0..n-1) */
ResultRelInfo *resultRelInfo; /* per-subplan target relations */
@@ -1694,7 +1704,9 @@ typedef struct AggState
bool table_filled; /* hash table filled yet? */
TupleHashIterator hashiter; /* for iterating through hash table */
#ifdef PGXC
+#ifndef XCP
bool skip_trans; /* skip the transition step for aggregates */
+#endif /* XCP */
#endif /* PGXC */
} AggState;
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 00d03b9602..2c9cf5ee15 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -4,6 +4,11 @@
* Definitions for tagged nodes.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -82,9 +87,15 @@ typedef enum NodeTag
* TAGS FOR PGXC NODES
* (planner.h, locator.h, nodemgr.h, groupmgr.h)
*/
+#ifdef XCP
+ T_Distribution,
+#endif
T_ExecNodes,
T_SimpleSort,
T_RemoteQuery,
+#ifdef XCP
+ T_RemoteSubplan,
+#endif
T_PGXCNodeHandle,
T_AlterNodeStmt,
T_CreateNodeStmt,
@@ -139,6 +150,9 @@ typedef enum NodeTag
T_LimitState,
#ifdef PGXC
T_RemoteQueryState,
+#ifdef XCP
+ T_RemoteSubplanState,
+#endif
#endif
/*
@@ -261,10 +275,9 @@ typedef enum NodeTag
T_PlaceHolderInfo,
T_MinMaxAggInfo,
T_PlannerParamItem,
-#ifdef PGXC
- T_RemoteQueryPath,
-#endif /* PGXC */
-
+#ifdef XCP
+ T_RemoteSubPath,
+#endif
/*
* TAGS FOR MEMORY NODES (memnodes.h)
*/
@@ -347,6 +360,7 @@ typedef enum NodeTag
T_CheckPointStmt,
#ifdef PGXC
T_BarrierStmt,
+ T_PauseClusterStmt,
#endif
T_CreateSchemaStmt,
T_AlterDatabaseStmt,
@@ -382,6 +396,9 @@ typedef enum NodeTag
T_DropUserMappingStmt,
T_ExecDirectStmt,
T_CleanConnStmt,
+#ifdef XCP
+ T_RemoteStmt,
+#endif
T_AlterTableSpaceOptionsStmt,
T_SecLabelStmt,
T_CreateForeignTableStmt,
@@ -518,11 +535,17 @@ extern PGDLLIMPORT Node *newNodeMacroHolder;
/*
* nodes/{outfuncs.c,print.c}
*/
+#ifdef XCP
+extern void set_portable_output(bool value);
+#endif
extern char *nodeToString(const void *obj);
/*
* nodes/{readfuncs.c,read.c}
*/
+#ifdef XCP
+extern void set_portable_input(bool value);
+#endif
extern void *stringToNode(char *str);
/*
diff --git a/src/include/nodes/params.h b/src/include/nodes/params.h
index 3989006078..a7cdd0d888 100644
--- a/src/include/nodes/params.h
+++ b/src/include/nodes/params.h
@@ -4,6 +4,11 @@
* Support for finding the values associated with Param nodes.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -97,6 +102,9 @@ typedef struct ParamExecData
void *execPlan; /* should be "SubPlanState *" */
Datum value;
bool isnull;
+#ifdef XCP
+ Oid ptype;
+#endif
} ParamExecData;
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index b7b361bc76..e7a4e826c5 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -10,6 +10,11 @@
* the location.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -156,13 +161,17 @@ typedef struct Query
List *constraintDeps; /* a list of pg_constraint OIDs that the query
* depends on to be semantically valid */
#ifdef PGXC
+#ifndef XCP
/* need this info for PGXC Planner, may be temporary */
char *sql_statement; /* original query */
+ bool qry_finalise_aggs; /* used for queries intended for Datanodes,
+ * should Datanode finalise the aggregates? */
bool is_local; /* enforce query execution on local node
* this is used by EXECUTE DIRECT especially. */
bool is_ins_child_sel_parent;/* true if the query is such an INSERT SELECT that
* inserts into a child by selecting from its parent */
#endif
+#endif
} Query;
@@ -713,8 +722,10 @@ typedef struct RangeTblEntry
*/
#ifdef PGXC
+#ifndef XCP
char *relname;
#endif
+#endif
/*
* Fields valid for a plain relation RTE (else zero):
@@ -1253,7 +1264,7 @@ typedef enum AlterTableType
AT_AddNodeList, /* ADD NODE nodelist */
AT_DeleteNodeList, /* DELETE NODE nodelist */
#endif
- AT_GenericOptions /* OPTIONS (...) */
+ AT_GenericOptions, /* OPTIONS (...) */
} AlterTableType;
typedef struct AlterTableCmd /* one subcommand of an ALTER TABLE */
@@ -2420,6 +2431,16 @@ typedef struct VacuumStmt
#ifdef PGXC
/*
+ * ---------------------------
+ * Pause Cluster Statement
+ */
+typedef struct PauseClusterStmt
+{
+ NodeTag type;
+ bool pause; /* will be false to unpause */
+} PauseClusterStmt;
+
+/*
* ----------------------
* Barrier Statement
*/
@@ -2448,6 +2469,7 @@ typedef struct AlterNodeStmt
{
NodeTag type;
char *node_name;
+ bool cluster;
List *options;
} AlterNodeStmt;
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index b7aa20b0c9..644dd18a38 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -4,6 +4,11 @@
* definitions for query plan nodes
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -67,6 +72,19 @@ typedef struct PlannedStmt
List *invalItems; /* other dependencies, as PlanInvalItems */
int nParamExec; /* number of PARAM_EXEC Params used */
+#ifdef XCP
+ int nParamRemote; /* number of params sent from the master mode */
+
+ struct RemoteParam *remoteparams;/* parameter descriptors */
+
+ const char *pname; /* the portal name */
+
+ /* Parameters to filter out result rows */
+ char distributionType;
+ AttrNumber distributionKey;
+ List *distributionNodes;
+ List *distributionRestrict;
+#endif
} PlannedStmt;
/* macro for fetching the Plan associated with a SubPlan node */
@@ -175,9 +193,11 @@ typedef struct ModifyTable
List *returningLists; /* per-target-table RETURNING tlists */
List *rowMarks; /* PlanRowMarks (non-locking only) */
int epqParam; /* ID of Param for EvalPlanQual re-eval */
-#ifdef PGXC
+#ifdef PGXC
+#ifndef XCP
List *remote_plans; /* per-target-table remote node */
-#endif
+#endif
+#endif
} ModifyTable;
/* ----------------
@@ -590,12 +610,6 @@ typedef struct Sort
Oid *sortOperators; /* OIDs of operators to sort them by */
Oid *collations; /* OIDs of collations */
bool *nullsFirst; /* NULLS FIRST/LAST directions */
-#ifdef PGXC
- bool srt_start_merge;/* No need to create the sorted runs. The
- * underlying plan provides those runs. Merge
- * them.
- */
-#endif /* PGXC */
} Sort;
/* ---------------
@@ -633,18 +647,33 @@ typedef enum AggStrategy
AGG_HASHED /* grouped agg, use internal hashtable */
} AggStrategy;
+#ifdef XCP
+typedef enum AggDistribution
+{
+ AGG_ONENODE, /* not distributed aggregation */
+ AGG_SLAVE, /* execute only transient function */
+ AGG_MASTER /* execute collection function as transient
+ * and final finction */
+} AggDistribution;
+#endif
+
typedef struct Agg
{
Plan plan;
AggStrategy aggstrategy;
+#ifdef XCP
+ AggDistribution aggdistribution;
+#endif
int numCols; /* number of grouping columns */
AttrNumber *grpColIdx; /* their indexes in the target list */
Oid *grpOperators; /* equality operators to compare with */
long numGroups; /* estimated number of groups in input */
#ifdef PGXC
+#ifndef XCP
bool skip_trans; /* apply collection directly on the data received
* from remote Datanodes
*/
+#endif /* XCP */
#endif /* PGXC */
} Agg;
diff --git a/src/include/nodes/primnodes.h b/src/include/nodes/primnodes.h
index 211b2cfc12..3ec44c2b8b 100644
--- a/src/include/nodes/primnodes.h
+++ b/src/include/nodes/primnodes.h
@@ -7,6 +7,11 @@
* and join trees.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -244,8 +249,10 @@ typedef struct Aggref
Oid aggcollid; /* OID of collation of result */
Oid inputcollid; /* OID of collation that function should use */
#ifdef PGXC
+#ifndef XCP
Oid aggtrantype; /* type Oid of transition results */
bool agghas_collectfn; /* is collection function available */
+#endif /* XCP */
#endif /* PGXC */
List *args; /* arguments and sort expressions */
List *aggorder; /* ORDER BY (list of SortGroupClause) */
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index 964b371517..f036ead7dd 100644
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -4,6 +4,11 @@
* Definitions for planner's internal data structures.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -20,6 +25,25 @@
#include "storage/block.h"
+#ifdef XCP
+/*
+ * Distribution
+ *
+ * Distribution is an attribute of distributed plan node. It describes on which
+ * node execution results can be found.
+ */
+typedef struct Distribution
+{
+ NodeTag type;
+
+ char distributionType;
+ Node *distributionExpr;
+ Bitmapset *nodes;
+ Bitmapset *restrictNodes;
+} Distribution;
+#endif
+
+
/*
* Relids
* Set of relation identifiers (indexes into the rangetable).
@@ -229,6 +253,7 @@ typedef struct PlannerInfo
bool hasRecursion; /* true if planning a recursive WITH item */
#ifdef PGXC
+#ifndef XCP
/* This field is used only when RemoteScan nodes are involved */
int rs_alias_index; /* used to build the alias reference */
@@ -242,6 +267,7 @@ typedef struct PlannerInfo
*/
List *xc_rowMarks; /* list of PlanRowMarks of type ROW_MARK_EXCLUSIVE & ROW_MARK_SHARE */
#endif
+#endif
/* These fields are used only when hasRecursion is true: */
int wt_param_id; /* PARAM_EXEC ID for the work table */
@@ -250,9 +276,20 @@ typedef struct PlannerInfo
/* These fields are workspace for createplan.c */
Relids curOuterRels; /* outer rels above current node */
List *curOuterParams; /* not-yet-assigned NestLoopParams */
+#ifdef XCP
+ Bitmapset *curOuterRestrict; /* Datanodes where outer plan is executed */
+#endif
/* optional private data for join_search_hook, e.g., GEQO */
void *join_search_private;
+#ifdef XCP
+ /*
+ * This is NULL for a SELECT query (NULL distribution means "Coordinator"
+ * everywhere in the planner. For INSERT, UPDATE or DELETE it should match
+ * to the target table distribution.
+ */
+ Distribution *distribution; /* Query result distribution */
+#endif
} PlannerInfo;
@@ -710,6 +747,9 @@ typedef struct Path
List *pathkeys; /* sort ordering of path's output */
/* pathkeys is a List of PathKey nodes; see above */
+#ifdef XCP
+ Distribution *distribution;
+#endif
} Path;
/* Macro for extracting a path's parameterization relids; beware double eval */
@@ -947,6 +987,14 @@ typedef struct UniquePath
List *uniq_exprs; /* expressions to be made unique */
} UniquePath;
+#ifdef XCP
+typedef struct RemoteSubPath
+{
+ Path path;
+ Path *subpath;
+} RemoteSubPath;
+#endif
+
/*
* All join-type paths share these fields.
*/
@@ -1028,45 +1076,6 @@ typedef struct HashPath
int num_batches; /* number of batches expected */
} HashPath;
-#ifdef PGXC
-/*
- * A remotequery path represents the queries to be sent to the datanode/s
- *
- * When RemoteQuery plan is created from RemoteQueryPath, we build the query to
- * be executed at the datanode. For building such a query, it's important to get
- * the RHS relation and LHS relation of the JOIN clause. So, instead of storing
- * the outer and inner paths, we find out the RHS and LHS paths and store those
- * here.
- */
-
-typedef struct RemoteQueryPath
-{
- Path path;
- ExecNodes *rqpath_en; /* List of datanodes to execute the query on */
- /*
- * If the path represents a JOIN rel, leftpath and rightpath represent the
- * RemoteQuery paths for left (outer) and right (inner) side of the JOIN
- * resp. jointype and join_restrictlist pertains to such JOINs.
- */
- struct RemoteQueryPath *leftpath;
- struct RemoteQueryPath *rightpath;
- JoinType jointype;
- List *join_restrictlist; /* restrict list corresponding to JOINs,
- * only considered if rest of
- * the JOIN information is
- * available
- */
- bool rqhas_unshippable_qual; /* TRUE if there is at least
- * one qual which can not be
- * shipped to the datanodes
- */
- bool rqhas_temp_rel; /* TRUE if one of the base relations
- * involved in this path is a temporary
- * table.
- */
-} RemoteQueryPath;
-#endif /* PGXC */
-
/*
* Restriction clause info.
*
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index abc9a69afe..2f40438398 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -4,6 +4,11 @@
* prototypes for costsize.c and clausesel.c.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -26,6 +31,10 @@
#define DEFAULT_CPU_TUPLE_COST 0.01
#define DEFAULT_CPU_INDEX_TUPLE_COST 0.005
#define DEFAULT_CPU_OPERATOR_COST 0.0025
+#ifdef XCP
+#define DEFAULT_NETWORK_BYTE_COST 0.001
+#define DEFAULT_REMOTE_QUERY_COST 100.0
+#endif
#define DEFAULT_EFFECTIVE_CACHE_SIZE 16384 /* measured in pages */
@@ -48,6 +57,10 @@ extern PGDLLIMPORT double random_page_cost;
extern PGDLLIMPORT double cpu_tuple_cost;
extern PGDLLIMPORT double cpu_index_tuple_cost;
extern PGDLLIMPORT double cpu_operator_cost;
+#ifdef XCP
+extern PGDLLIMPORT double network_byte_cost;
+extern PGDLLIMPORT double remote_query_cost;
+#endif
extern PGDLLIMPORT int effective_cache_size;
extern Cost disable_cost;
extern bool enable_seqscan;
@@ -62,11 +75,8 @@ extern bool enable_material;
extern bool enable_mergejoin;
extern bool enable_hashjoin;
#ifdef PGXC
-extern bool enable_fast_query_shipping;
extern bool enable_remotejoin;
extern bool enable_remotegroup;
-extern bool enable_remotesort;
-extern bool enable_remotelimit;
#endif
extern int constraint_exclusion;
@@ -92,7 +102,7 @@ extern void cost_functionscan(Path *path, PlannerInfo *root,
extern void cost_valuesscan(Path *path, PlannerInfo *root,
RelOptInfo *baserel);
#ifdef PGXC
-extern void cost_remotequery(RemoteQueryPath *rqpath, PlannerInfo *root, RelOptInfo *rel);
+extern void cost_remotequery(Path *path, PlannerInfo *root, RelOptInfo *baserel);
#endif
extern void cost_ctescan(Path *path, PlannerInfo *root, RelOptInfo *baserel);
extern void cost_recursive_union(Plan *runion, Plan *nrterm, Plan *rterm);
@@ -154,6 +164,11 @@ extern void final_cost_hashjoin(PlannerInfo *root, HashPath *path,
extern void cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan);
extern void cost_qual_eval(QualCost *cost, List *quals, PlannerInfo *root);
extern void cost_qual_eval_node(QualCost *cost, Node *qual, PlannerInfo *root);
+#ifdef XCP
+extern void cost_remote_subplan(Path *path,
+ Cost input_startup_cost, Cost input_total_cost,
+ double tuples, int width, int replication);
+#endif
extern void compute_semi_anti_join_factors(PlannerInfo *root,
RelOptInfo *outerrel,
RelOptInfo *innerrel,
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index 493256fbe8..2fd43c0cc6 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -4,6 +4,11 @@
* prototypes for pathnode.c, relnode.c.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -67,8 +72,14 @@ extern ResultPath *create_result_path(List *quals);
extern MaterialPath *create_material_path(RelOptInfo *rel, Path *subpath);
extern UniquePath *create_unique_path(PlannerInfo *root, RelOptInfo *rel,
Path *subpath, SpecialJoinInfo *sjinfo);
+#ifdef XCP
+extern Path *create_subqueryscan_path(PlannerInfo *root, RelOptInfo *rel,
+ List *pathkeys, Relids required_outer,
+ Distribution *distribution);
+#else
extern Path *create_subqueryscan_path(PlannerInfo *root, RelOptInfo *rel,
List *pathkeys, Relids required_outer);
+#endif
extern Path *create_functionscan_path(PlannerInfo *root, RelOptInfo *rel);
extern Path *create_valuesscan_path(PlannerInfo *root, RelOptInfo *rel);
extern Path *create_ctescan_path(PlannerInfo *root, RelOptInfo *rel);
@@ -78,6 +89,12 @@ extern ForeignPath *create_foreignscan_path(PlannerInfo *root, RelOptInfo *rel,
List *pathkeys,
Relids required_outer,
List *fdw_private);
+#ifdef PGXC
+#ifndef XCP
+extern Path *create_remotequery_path(PlannerInfo *root, RelOptInfo *rel);
+#endif
+#endif
+
extern Relids calc_nestloop_required_outer(Path *outer_path, Path *inner_path);
extern Relids calc_non_nestloop_required_outer(Path *outer_path, Path *inner_path);
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index 50af07e0f9..b6fb8ee5ce 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -85,19 +85,6 @@ extern void add_paths_to_joinrel(PlannerInfo *root, RelOptInfo *joinrel,
JoinType jointype, SpecialJoinInfo *sjinfo,
List *restrictlist);
-#ifdef PGXC
-/*
- * rquerypath.c
- * routines to create RemoteQuery paths
- */
-extern bool create_plainrel_rqpath(PlannerInfo *root, RelOptInfo *rel,
- RangeTblEntry *rte);
-extern void create_joinrel_rqpath(PlannerInfo *root, RelOptInfo *joinrel,
- RelOptInfo *outerrel, RelOptInfo *innerrel,
- List *restrictlist, JoinType jointype,
- SpecialJoinInfo *sjinfo);
-#endif /* PGXC */
-
/*
* joinrels.c
* routines to determine which relations to join
diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h
index ecee00e4e4..39a5650eb6 100644
--- a/src/include/optimizer/planmain.h
+++ b/src/include/optimizer/planmain.h
@@ -4,6 +4,11 @@
* prototypes for various files in optimizer/plan
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -16,6 +21,9 @@
#include "nodes/plannodes.h"
#include "nodes/relation.h"
+#ifdef XCP
+#include "pgxc/planner.h"
+#endif
/* GUC parameters */
#define DEFAULT_CURSOR_TUPLE_FRACTION 0.1
@@ -129,19 +137,22 @@ extern void extract_query_dependencies(Node *query,
List **invalItems);
#ifdef PGXC
-/*
- * prototypes for plan/pgxcplan.c
- */
-extern Plan *create_remotedml_plan(PlannerInfo *root, Plan *topplan,
- CmdType cmdtyp);
+#ifdef XCP
+extern RemoteSubplan *find_push_down_plan(Plan *plan, bool force);
+extern RemoteSubplan *make_remotesubplan(PlannerInfo *root,
+ Plan *lefttree,
+ Distribution *resultDistribution,
+ Distribution *execDistribution,
+ List *pathkeys);
+#else
+extern Var *search_tlist_for_var(Var *var, List *jtlist);
+extern Plan *create_remoteinsert_plan(PlannerInfo *root, Plan *topplan);
+extern Plan *create_remoteupdate_plan(PlannerInfo *root, Plan *topplan);
+extern Plan *create_remotedelete_plan(PlannerInfo *root, Plan *topplan);
extern Plan *create_remotegrouping_plan(PlannerInfo *root, Plan *local_plan);
-extern Plan *create_remotequery_plan(PlannerInfo *root, RemoteQueryPath *best_path);
-extern Plan *create_remotesort_plan(PlannerInfo *root, Plan *local_plan);
-extern Plan *create_remotelimit_plan(PlannerInfo *root, Plan *local_plan);
-extern List *pgxc_order_qual_clauses(PlannerInfo *root, List *clauses);
-extern List *pgxc_build_relation_tlist(RelOptInfo *rel);
-extern void pgxc_copy_path_costsize(Plan *dest, Path *src);
-extern Plan *pgxc_create_gating_plan(PlannerInfo *root, Plan *plan, List *quals);
-#endif
+/* Expose fix_scan_expr to create_remotequery_plan() */
+extern Node *pgxc_fix_scan_expr(PlannerInfo *root, Node *node, int rtoffset);
+#endif /* XCP */
+#endif /* PGXC */
#endif /* PLANMAIN_H */
diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h
index 1f0993b519..79f71cdd36 100644
--- a/src/include/optimizer/planner.h
+++ b/src/include/optimizer/planner.h
@@ -38,6 +38,10 @@ extern Plan *subquery_planner(PlannerGlobal *glob, Query *parse,
extern bool is_dummy_plan(Plan *plan);
extern Expr *expression_planner(Expr *expr);
+#ifdef PGXC
+extern void GetHashExecNodes(RelationLocInfo *rel_loc_info,
+ ExecNodes **exec_nodes, const Expr *expr);
+#endif
extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid);
diff --git a/src/include/parser/analyze.h b/src/include/parser/analyze.h
index 5fbf520992..dd72351533 100644
--- a/src/include/parser/analyze.h
+++ b/src/include/parser/analyze.h
@@ -4,6 +4,11 @@
* parse analysis for optimizable statements
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -40,4 +45,7 @@ extern void CheckSelectLocking(Query *qry);
extern void applyLockingClause(Query *qry, Index rtindex,
bool forUpdate, bool noWait, bool pushedDown);
+#ifdef XCP
+extern void ParseAnalyze_callback(ParseState *pstate, Query *query);
+#endif
#endif /* ANALYZE_H */
diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
index 33966987e4..977a5ba999 100644
--- a/src/include/parser/kwlist.h
+++ b/src/include/parser/kwlist.h
@@ -292,6 +292,9 @@ PG_KEYWORD("partial", PARTIAL, UNRESERVED_KEYWORD)
PG_KEYWORD("partition", PARTITION, UNRESERVED_KEYWORD)
PG_KEYWORD("passing", PASSING, UNRESERVED_KEYWORD)
PG_KEYWORD("password", PASSWORD, UNRESERVED_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("pause", PAUSE, UNRESERVED_KEYWORD)
+#endif
PG_KEYWORD("placing", PLACING, RESERVED_KEYWORD)
PG_KEYWORD("plans", PLANS, UNRESERVED_KEYWORD)
PG_KEYWORD("position", POSITION, COL_NAME_KEYWORD)
@@ -401,6 +404,9 @@ PG_KEYWORD("unique", UNIQUE, RESERVED_KEYWORD)
PG_KEYWORD("unknown", UNKNOWN, UNRESERVED_KEYWORD)
PG_KEYWORD("unlisten", UNLISTEN, UNRESERVED_KEYWORD)
PG_KEYWORD("unlogged", UNLOGGED, UNRESERVED_KEYWORD)
+#ifdef PGXC
+PG_KEYWORD("unpause", UNPAUSE, UNRESERVED_KEYWORD)
+#endif
PG_KEYWORD("until", UNTIL, UNRESERVED_KEYWORD)
PG_KEYWORD("update", UPDATE, UNRESERVED_KEYWORD)
PG_KEYWORD("user", USER, RESERVED_KEYWORD)
diff --git a/src/include/parser/parse_agg.h b/src/include/parser/parse_agg.h
index b32ee6c272..19fbb01535 100644
--- a/src/include/parser/parse_agg.h
+++ b/src/include/parser/parse_agg.h
@@ -3,6 +3,11 @@
* parse_agg.h
* handle aggregates and window functions in parser
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -27,11 +32,20 @@ extern void parseCheckWindowFuncs(ParseState *pstate, Query *qry);
extern void build_aggregate_fnexprs(Oid *agg_input_types,
int agg_num_inputs,
Oid agg_state_type,
+#ifdef XCP
+ Oid agg_collect_type,
+#endif
Oid agg_result_type,
Oid agg_input_collation,
Oid transfn_oid,
+#ifdef XCP
+ Oid collectfn_oid,
+#endif
Oid finalfn_oid,
Expr **transfnexpr,
+#ifdef XCP
+ Expr **collectfnexpr,
+#endif
Expr **finalfnexpr);
#endif /* PARSE_AGG_H */
diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h
index fdd0db682c..355335ae83 100644
--- a/src/include/parser/parse_utilcmd.h
+++ b/src/include/parser/parse_utilcmd.h
@@ -4,6 +4,11 @@
* parse analysis for utility commands
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -17,8 +22,13 @@
#include "parser/parse_node.h"
-
+#ifdef XCP
+extern bool loose_constraints;
+extern List *transformCreateStmt(CreateStmt *stmt, const char *queryString,
+ bool autodistribute);
+#else
extern List *transformCreateStmt(CreateStmt *stmt, const char *queryString);
+#endif
extern List *transformAlterTableStmt(AlterTableStmt *stmt,
const char *queryString);
extern IndexStmt *transformIndexStmt(IndexStmt *stmt, const char *queryString);
diff --git a/src/include/pg_config.h.win32 b/src/include/pg_config.h.win32
index 5677b6c97f..6a3bdd634d 100644
--- a/src/include/pg_config.h.win32
+++ b/src/include/pg_config.h.win32
@@ -542,13 +542,13 @@
#define MEMSET_LOOP_LIMIT 1024
/* Define to the address where bug reports for this package should be sent. */
-#define PACKAGE_BUGREPORT "[email protected]"
+#define PACKAGE_BUGREPORT "[email protected]"
/* Define to the full name of this package. */
-#define PACKAGE_NAME "Postgres-XC"
+#define PACKAGE_NAME "Postgres-XL"
/* Define to the full name and version of this package. */
-#define PACKAGE_STRING "Postgres-XC 1.1devel"
+#define PACKAGE_STRING "Postgres-XL 9.2.0"
/* Define to the version of this package. */
#define PACKAGE_VERSION "9.2beta2"
@@ -560,7 +560,7 @@
#define PG_VERSION_NUM 90200
/* Define to the one symbol short name of this package. */
-#define PACKAGE_TARNAME "postgres-xc"
+#define PACKAGE_TARNAME "postgres-xl"
/* Postgres-XC version as a string */
#define PGXC_VERSION "1.1devel"
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index dd978d79c3..fdff029017 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -3,6 +3,11 @@
*
* Definitions for the PostgreSQL statistics collector daemon.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Copyright (c) 2001-2012, PostgreSQL Global Development Group
*
* src/include/pgstat.h
@@ -839,6 +844,11 @@ extern void pgstat_count_heap_insert(Relation rel, int n);
extern void pgstat_count_heap_update(Relation rel, bool hot);
extern void pgstat_count_heap_delete(Relation rel);
extern void pgstat_update_heap_dead_tuples(Relation rel, int delta);
+#ifdef XCP
+extern void pgstat_count_remote_insert(Relation rel, int n);
+extern void pgstat_count_remote_update(Relation rel, int n);
+extern void pgstat_count_remote_delete(Relation rel, int n);
+#endif
extern void pgstat_init_function_usage(FunctionCallInfoData *fcinfo,
PgStat_FunctionCallUsage *fcu);
diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h
index 169be003b4..f52bb3181b 100644
--- a/src/include/pgxc/execRemote.h
+++ b/src/include/pgxc/execRemote.h
@@ -5,6 +5,11 @@
* Functions to execute commands on multiple Datanodes
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
@@ -18,11 +23,15 @@
#include "locator.h"
#include "nodes/nodes.h"
#include "pgxcnode.h"
+#include "planner.h"
+#ifdef XCP
+#include "squeue.h"
+#include "remotecopy.h"
+#endif
#include "access/tupdesc.h"
#include "executor/tuptable.h"
#include "nodes/execnodes.h"
#include "nodes/pg_list.h"
-#include "optimizer/pgxcplan.h"
#include "tcop/dest.h"
#include "tcop/pquery.h"
#include "utils/snapshot.h"
@@ -38,6 +47,10 @@ extern bool EnforceTwoPhaseCommit;
#define RESPONSE_DATAROW 3
#define RESPONSE_COPY 4
#define RESPONSE_BARRIER_OK 5
+#ifdef XCP
+#define RESPONSE_ERROR 6
+#define RESPONSE_READY 10
+#endif
typedef enum
{
@@ -46,6 +59,10 @@ typedef enum
REQUEST_TYPE_QUERY, /* Row description response */
REQUEST_TYPE_COPY_IN, /* Copy In response */
REQUEST_TYPE_COPY_OUT /* Copy Out response */
+#ifdef XCP
+ ,
+ REQUEST_TYPE_ERROR /* Error, ignore responses */
+#endif
} RequestType;
/*
@@ -66,6 +83,8 @@ typedef struct CombineTag
char data[COMPLETION_TAG_BUFSIZE]; /* execution result combination data */
} CombineTag;
+
+#ifndef XCP
/*
* Represents a DataRow message received from a remote node.
* Contains originating node number and message body in DataRow format without
@@ -78,8 +97,18 @@ typedef struct RemoteDataRowData
int msgnode; /* node number of the data row message */
} RemoteDataRowData;
typedef RemoteDataRowData *RemoteDataRow;
+#endif
+#ifdef XCP
+/*
+ * Common part for all plan state nodes needed to access remote datanodes
+ * ResponseCombiner must be the first field of the plan state node so we can
+ * typecast
+ */
+typedef struct ResponseCombiner
+#else
typedef struct RemoteQueryState
+#endif
{
ScanState ss; /* its first field is NodeTag */
int node_count; /* total count of participating nodes */
@@ -93,54 +122,157 @@ typedef struct RemoteQueryState
int description_count; /* count of received RowDescription messages */
int copy_in_count; /* count of received CopyIn messages */
int copy_out_count; /* count of received CopyOut messages */
+ FILE *copy_file; /* used if copy_dest == COPY_FILE */
+ uint64 processed; /* count of data rows handled */
char errorCode[5]; /* error code to send back to client */
char *errorMessage; /* error message to send back to client */
char *errorDetail; /* error detail to send back to client */
- bool query_Done; /* query has been sent down to Datanodes */
+#ifdef XCP
+ Oid returning_node; /* returning replicated node */
+ RemoteDataRow currentRow; /* next data ro to be wrapped into a tuple */
+#else
RemoteDataRowData currentRow; /* next data ro to be wrapped into a tuple */
+#endif
/* TODO use a tuplestore as a rowbuffer */
List *rowBuffer; /* buffer where rows are stored when connection
* should be cleaned for reuse by other RemoteQuery */
+#ifdef XCP
+ /*
+ * To handle special case - if there is a simple sort and sort connection
+ * is buffered. If EOF is reached on a connection it should be removed from
+ * the array, but we need to know node number of the connection to find
+ * messages in the buffer. So we store nodenum to that array if reach EOF
+ * when buffering
+ */
+ Oid *tapenodes;
+ /*
+ * If some tape (connection) is buffered, contains a reference on the cell
+ * right before first row buffered from this tape, needed to speed up
+ * access to the data
+ */
+ ListCell **tapemarks;
+ bool merge_sort; /* perform mergesort of node tuples */
+ bool extended_query; /* running extended query protocol */
+ bool probing_primary; /* trying replicated on primary node */
+#else
/*
- * To handle special case - if this RemoteQuery is feeding sorted data to
- * Sort plan and if the connection fetching data from the Datanode
+ * To handle special case - if there is a simple sort and sort connection
* is buffered. If EOF is reached on a connection it should be removed from
* the array, but we need to know node number of the connection to find
* messages in the buffer. So we store nodenum to that array if reach EOF
* when buffering
*/
int *tapenodes;
- RemoteCopyType remoteCopyType; /* Type of remote COPY operation */
- FILE *copy_file; /* used if remoteCopyType == REMOTE_COPY_FILE */
- uint64 processed; /* count of data rows when running CopyOut */
+#endif
+ void *tuplesortstate; /* for merge sort */
+ /* COPY support */
+ RemoteCopyType remoteCopyType;
+ Tuplestorestate *tuplestorestate;
/* cursor support */
char *cursor; /* cursor name */
char *update_cursor; /* throw this cursor current tuple can be updated */
int cursor_count; /* total count of participating nodes */
- PGXCNodeHandle **cursor_connections;/* Datanode connections being combined */
+ PGXCNodeHandle **cursor_connections;/* data node connections being combined */
+#ifdef XCP
+} ResponseCombiner;
+
+typedef struct RemoteQueryState
+{
+ ResponseCombiner combiner; /* see ResponseCombiner struct */
+#endif
+ bool query_Done; /* query has been sent down to Datanodes */
+ /*
+ * While we are not supporting grouping use this flag to indicate we need
+ * to initialize collecting of aggregates from the DNs
+ */
+ bool initAggregates;
+ /* Simple DISTINCT support */
+ FmgrInfo *eqfunctions; /* functions to compare tuples */
+ MemoryContext tmp_ctx; /* separate context is needed to compare tuples */
/* Support for parameters */
char *paramval_data; /* parameter data, format is like in BIND */
int paramval_len; /* length of parameter values data */
- Oid *rqs_param_types; /* Types of the remote params */
- int rqs_num_params;
int eflags; /* capability flags to pass to tuplestore */
bool eof_underlying; /* reached end of underlying plan? */
- Tuplestorestate *tuplestorestate;
+#ifndef XCP
CommandId rqs_cmd_id; /* Cmd id to use in some special cases */
- int rqs_tapenum; /* Connection from which to fetch next row,
- * in case of Sorting */
- TupleTableSlot *rqs_tapedata; /* Data received from this connection to be
- * buffered between getlen and readtup calls
- * for sort */
- bool rqs_for_sort; /* The row fetches will be handled by Sort */
- bool non_fqs_dml; /* true if this is a non fast query shipped DML
- * For detailed discussion on why this variable
- * is required see comments in ExecProcNodeDMLInXC */
+#endif
} RemoteQueryState;
+
+#ifdef XCP
+typedef struct RemoteParam
+{
+ ParamKind paramkind; /* kind of parameter */
+ int paramid; /* numeric ID for parameter */
+ Oid paramtype; /* pg_type OID of parameter's datatype */
+} RemoteParam;
+
+
+/*
+ * Execution state of a RemoteSubplan node
+ */
+typedef struct RemoteSubplanState
+{
+ ResponseCombiner combiner; /* see ResponseCombiner struct */
+ char *subplanstr; /* subplan encoded as a string */
+ bool bound; /* subplan is sent down to the nodes */
+ bool local_exec; /* execute subplan on this datanode */
+ Locator *locator; /* determine destination of tuples of
+ * locally executed plan */
+ int *dest_nodes; /* allocate once */
+ List *execNodes; /* where to execute subplan */
+ /* should query be executed on all (true) or any (false) node specified
+ * in the execNodes list */
+ bool execOnAll;
+ int nParamRemote; /* number of params sent from the master node */
+ RemoteParam *remoteparams; /* parameter descriptors */
+} RemoteSubplanState;
+
+
+/*
+ * Data needed to set up a PreparedStatement on the remote node and other data
+ * for the remote executor
+ */
+typedef struct RemoteStmt
+{
+ NodeTag type;
+
+ CmdType commandType; /* select|insert|update|delete */
+
+ bool hasReturning; /* is it insert|update|delete RETURNING? */
+
+ struct Plan *planTree; /* tree of Plan nodes */
+
+ List *rtable; /* list of RangeTblEntry nodes */
+
+ /* rtable indexes of target relations for INSERT/UPDATE/DELETE */
+ List *resultRelations; /* integer list of RT indexes, or NIL */
+
+ List *subplans; /* Plan trees for SubPlan expressions */
+
+ int nParamExec; /* number of PARAM_EXEC Params used */
+
+ int nParamRemote; /* number of params sent from the master node */
+
+ RemoteParam *remoteparams; /* parameter descriptors */
+
+ List *rowMarks;
+
+ char distributionType;
+
+ AttrNumber distributionKey;
+
+ List *distributionNodes;
+
+ List *distributionRestrict;
+} RemoteStmt;
+#endif
+
typedef void (*xact_callback) (bool isCommit, void *args);
+#ifndef XCP
/* Multinode Executor */
extern void PGXCNodeBegin(void);
extern void PGXCNodeSetBeginQuery(char *query_string);
@@ -149,51 +281,104 @@ extern int PGXCNodeRollback(void);
extern bool PGXCNodePrepare(char *gid);
extern bool PGXCNodeRollbackPrepared(char *gid);
extern void PGXCNodeCommitPrepared(char *gid);
+#endif
+
/* Copy command just involves Datanodes */
+#ifdef XCP
+extern void DataNodeCopyBegin(RemoteCopyData *rcstate);
+extern int DataNodeCopyIn(char *data_row, int len, int conn_count,
+ PGXCNodeHandle** copy_connections);
+extern uint64 DataNodeCopyOut(PGXCNodeHandle** copy_connections,
+ int conn_count, FILE* copy_file);
+extern uint64 DataNodeCopyStore(PGXCNodeHandle** copy_connections,
+ int conn_count, Tuplestorestate* store);
+extern void DataNodeCopyFinish(int conn_count, PGXCNodeHandle** connections);
+extern int DataNodeCopyInBinaryForAll(char *msg_buf, int len, int conn_count,
+ PGXCNodeHandle** connections);
+#else
extern PGXCNodeHandle** DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot);
extern int DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections);
extern uint64 DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, TupleDesc tupleDesc,
FILE* copy_file, Tuplestorestate *store, RemoteCopyType remoteCopyType);
extern void DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_dn_index, CombineType combine_type);
-extern bool DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error);
extern int DataNodeCopyInBinaryForAll(char *msg_buf, int len, PGXCNodeHandle** copy_connections);
+#endif
+extern bool DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error);
+#ifndef XCP
extern int ExecCountSlotsRemoteQuery(RemoteQuery *node);
+#endif
extern RemoteQueryState *ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags);
extern TupleTableSlot* ExecRemoteQuery(RemoteQueryState *step);
extern void ExecEndRemoteQuery(RemoteQueryState *step);
+#ifdef XCP
+extern void RemoteSubplanMakeUnique(Node *plan, int unique);
+extern RemoteSubplanState *ExecInitRemoteSubplan(RemoteSubplan *node, EState *estate, int eflags);
+extern void ExecFinishInitRemoteSubplan(RemoteSubplanState *node);
+extern TupleTableSlot* ExecRemoteSubplan(RemoteSubplanState *node);
+extern void ExecEndRemoteSubplan(RemoteSubplanState *node);
+extern void ExecReScanRemoteSubplan(RemoteSubplanState *node);
+#endif
extern void ExecRemoteUtility(RemoteQuery *node);
-extern int handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner);
extern bool is_data_node_ready(PGXCNodeHandle * conn);
-extern void HandleCmdComplete(CmdType commandType, CombineTag *combine, const char *msg_body, size_t len);
+
+#ifdef XCP
+extern int handle_response(PGXCNodeHandle *conn, ResponseCombiner *combiner);
+#else
+extern int handle_response(PGXCNodeHandle *conn, RemoteQueryState *combiner);
+#endif
+extern void HandleCmdComplete(CmdType commandType, CombineTag *combine, const char *msg_body,
+ size_t len);
+
+#ifdef XCP
+#define CHECK_OWNERSHIP(conn, node) \
+ do { \
+ if ((conn)->state == DN_CONNECTION_STATE_QUERY && \
+ (conn)->combiner && \
+ (conn)->combiner != (ResponseCombiner *) (node)) \
+ BufferConnection(conn); \
+ (conn)->combiner = (ResponseCombiner *) (node); \
+ } while(0)
+
+extern TupleTableSlot *FetchTuple(ResponseCombiner *combiner);
+extern void InitResponseCombiner(ResponseCombiner *combiner, int node_count,
+ CombineType combine_type);
+extern void CloseCombiner(ResponseCombiner *combiner);
+#else
extern bool FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot);
+#endif
extern void BufferConnection(PGXCNodeHandle *conn);
extern void ExecRemoteQueryReScan(RemoteQueryState *node, ExprContext *exprCtxt);
-extern void SetDataRowForExtParams(ParamListInfo params, RemoteQueryState *rq_state);
+extern int ParamListToDataRow(ParamListInfo params, char** result);
extern void ExecCloseRemoteStatement(const char *stmt_name, List *nodelist);
-extern void PreCommit_Remote(char *prepareGID, bool preparedLocalNode);
extern char *PrePrepare_Remote(char *prepareGID, bool localNode, bool implicit);
+#ifdef XCP
+extern void PostPrepare_Remote(char *prepareGID, bool implicit);
+extern void PreCommit_Remote(char *prepareGID, char *nodestring, bool preparedLocalNode);
+#else
extern void PostPrepare_Remote(char *prepareGID, char *nodestring, bool implicit);
+extern void PreCommit_Remote(char *prepareGID, bool preparedLocalNode);
+#endif
extern bool PreAbort_Remote(void);
extern void AtEOXact_Remote(void);
extern bool IsTwoPhaseCommitRequired(bool localWrite);
extern bool FinishRemotePreparedTransaction(char *prepareGID, bool commit);
+#ifndef XCP
/* Flags related to temporary objects included in query */
extern void ExecSetTempObjectIncluded(void);
extern bool ExecIsTempObjectIncluded(void);
-extern TupleTableSlot *ExecProcNodeDMLInXC(RemoteQueryState *resultRemoteRel,
- TupleTableSlot *slot);
+extern void ExecRemoteQueryStandard(Relation resultRelationDesc, RemoteQueryState *resultRemoteRel, TupleTableSlot *slot);
extern void pgxc_all_success_nodes(ExecNodes **d_nodes, ExecNodes **c_nodes, char **failednodes_msg);
extern void AtEOXact_DBCleanup(bool isCommit);
extern void set_dbcleanup_callback(xact_callback function, void *paraminfo, int paraminfo_size);
-extern void do_query(RemoteQueryState *node);
#endif
+#endif
diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h
index 43ee425c25..145028f962 100644
--- a/src/include/pgxc/locator.h
+++ b/src/include/pgxc/locator.h
@@ -13,9 +13,13 @@
#ifndef LOCATOR_H
#define LOCATOR_H
+#ifdef XCP
+#include "fmgr.h"
+#endif
#define LOCATOR_TYPE_REPLICATED 'R'
#define LOCATOR_TYPE_HASH 'H'
#define LOCATOR_TYPE_RANGE 'G'
+#define LOCATOR_TYPE_SINGLE 'S'
#define LOCATOR_TYPE_RROBIN 'N'
#define LOCATOR_TYPE_CUSTOM 'C'
#define LOCATOR_TYPE_MODULO 'M'
@@ -43,6 +47,8 @@
#include "nodes/primnodes.h"
#include "utils/relcache.h"
+typedef int PartAttrNumber;
+
/*
* How relation is accessed in the query
*/
@@ -56,16 +62,14 @@ typedef enum
typedef struct
{
- Oid relid; /* OID of relation */
- char locatorType; /* locator type, see above */
- AttrNumber partAttrNum; /* Distribution column attribute */
- List *nodeList; /* Node indices where data is located */
- ListCell *roundRobinNode; /* Index of the next node to use */
+ Oid relid;
+ char locatorType;
+ PartAttrNumber partAttrNum; /* if partitioned */
+ char *partAttrName; /* if partitioned */
+ List *nodeList; /* Node Indices */
+ ListCell *roundRobinNode; /* index of the next one to use */
} RelationLocInfo;
-#define IsRelationReplicated(rel_loc) IsLocatorReplicated((rel_loc)->locatorType)
-#define IsRelationColumnDistributed(rel_loc) IsLocatorColumnDistributed((rel_loc)->locatorType)
-#define IsRelationDistributedByValue(rel_loc) IsLocatorDistributedByValue((rel_loc)->locatorType)
/*
* Nodes to execute on
* primarynodelist is for replicated table writes, where to execute first.
@@ -75,52 +79,113 @@ typedef struct
typedef struct
{
NodeTag type;
- List *primarynodelist; /* Primary node list indexes */
- List *nodeList; /* Node list indexes */
- char baselocatortype; /* Locator type, see above */
- Expr *en_expr; /* Expression to evaluate at execution time
- * if planner can not determine execution
- * nodes */
- Oid en_relid; /* Relation to determine execution nodes */
- RelationAccessType accesstype; /* Access type to determine execution
- * nodes */
+ List *primarynodelist;
+ List *nodeList;
+ char baselocatortype;
+ Expr *en_expr; /* expression to evaluate at execution time if planner
+ * can not determine execution nodes */
+ Oid en_relid; /* Relation to determine execution nodes */
+ RelationAccessType accesstype; /* Access type to determine execution nodes */
} ExecNodes;
-#define IsExecNodesReplicated(en) IsLocatorReplicated((en)->baselocatortype)
-#define IsExecNodesColumnDistributed(en) IsLocatorColumnDistributed((en)->baselocatortype)
-#define IsExecNodesDistributedByValue(en) IsLocatorDistributedByValue((en)->baselocatortype)
+
+#ifdef XCP
+typedef enum
+{
+ LOCATOR_LIST_NONE, /* locator returns integers in range 0..NodeCount-1,
+ * value of nodeList ignored and can be NULL */
+ LOCATOR_LIST_INT, /* nodeList is an integer array (int *), value from
+ * the array is returned */
+ LOCATOR_LIST_OID, /* node list is an array of Oids (Oid *), value from
+ * the array is returned */
+ LOCATOR_LIST_POINTER, /* node list is an array of pointers (void **),
+ * value from the array is returned */
+ LOCATOR_LIST_LIST, /* node list is a list, item type is determined by
+ * list type (integer, oid or pointer). NodeCount
+ * is ignored */
+} LocatorListType;
+
+typedef Datum (*LocatorHashFunc) (PG_FUNCTION_ARGS);
+
+typedef struct _Locator Locator;
+
+
+/*
+ * Creates a structure holding necessary info to effectively determine nodes
+ * where a tuple should be stored.
+ * Locator does not allocate memory while working, all allocations are made at
+ * the creation time.
+ *
+ * Parameters:
+ *
+ * locatorType - see LOCATOR_TYPE_* constants
+ * accessType - see RelationAccessType enum
+ * dataType - actual data type of values provided to determine nodes
+ * listType - defines how nodeList parameter is interpreted, see
+ * LocatorListType enum for more details
+ * nodeCount - number of nodes to distribute
+ * nodeList - detailed info about relation nodes. Either List or array or NULL
+ * result - returned address of the array where locator will output node
+ * references. Type of array items (int, Oid or pointer (void *))
+ * depends on listType.
+ * primary - set to true if caller ever wants to determine primary node.
+ * Primary node will be returned as the first element of the
+ * result array
+ */
+extern Locator *createLocator(char locatorType, RelationAccessType accessType,
+ Oid dataType, LocatorListType listType, int nodeCount,
+ void *nodeList, void **result, bool primary);
+extern void freeLocator(Locator *locator);
+
+extern int GET_NODES(Locator *self, Datum value, bool isnull, bool *hasprimary);
+extern void *getLocatorResults(Locator *self);
+extern void *getLocatorNodeMap(Locator *self);
+extern int getLocatorNodeCount(Locator *self);
+#endif
/* Extern variables related to locations */
extern Oid primary_data_node;
extern Oid preferred_data_node[MAX_PREFERRED_NODES];
extern int num_preferred_data_nodes;
-/* Function for RelationLocInfo building and management */
-extern void RelationBuildLocator(Relation rel);
-extern RelationLocInfo *GetRelationLocInfo(Oid relid);
-extern RelationLocInfo *CopyRelationLocInfo(RelationLocInfo *srcInfo);
-extern void FreeRelationLocInfo(RelationLocInfo *relationLocInfo);
-extern char *GetRelationDistribColumn(RelationLocInfo *locInfo);
+extern void InitRelationLocInfo(void);
extern char GetLocatorType(Oid relid);
-extern List *GetPreferredReplicationNode(List *relNodes);
-extern bool IsTableDistOnPrimary(RelationLocInfo *locInfo);
-extern bool IsLocatorInfoEqual(RelationLocInfo *locInfo1,
- RelationLocInfo *locInfo2);
-extern int GetRoundRobinNode(Oid relid);
-extern bool IsTypeDistributable(Oid colType);
-extern bool IsDistribColumn(Oid relid, AttrNumber attNum);
-extern ExecNodes *GetRelationNodes(RelationLocInfo *rel_loc_info,
- Datum valueForDistCol,
- bool isValueNull,
- Oid typeOfValueForDistCol,
- RelationAccessType accessType);
-extern ExecNodes *GetRelationNodesByQuals(Oid reloid,
- Index varno,
- Node *quals,
- RelationAccessType relaccess);
-/* Global locator data */
-extern void FreeExecNodes(ExecNodes **exec_nodes);
+extern char ConvertToLocatorType(int disttype);
+
+extern char *GetRelationHashColumn(RelationLocInfo *rel_loc_info);
+extern RelationLocInfo *GetRelationLocInfo(Oid relid);
+extern RelationLocInfo *CopyRelationLocInfo(RelationLocInfo *src_info);
+extern char GetRelationLocType(Oid relid);
+extern bool IsTableDistOnPrimary(RelationLocInfo *rel_loc_info);
+extern bool IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info2);
+#ifndef XCP
+extern ExecNodes *GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol,
+ bool isValueNull, Oid typeOfValueForDistCol,
+ RelationAccessType accessType);
+extern ExecNodes *GetRelationNodesByQuals(Oid reloid, Index varno, Node *quals,
+ RelationAccessType relaccess);
+#endif
+extern bool IsHashColumn(RelationLocInfo *rel_loc_info, char *part_col_name);
+extern bool IsHashColumnForRelId(Oid relid, char *part_col_name);
+extern int GetRoundRobinNode(Oid relid);
+
+extern bool IsTypeHashDistributable(Oid col_type);
extern List *GetAllDataNodes(void);
extern List *GetAllCoordNodes(void);
+#ifdef XCP
+extern int GetAnyDataNode(Bitmapset *nodes);
+#else
+extern List *GetPreferredReplicationNode(List *relNodes);
+#endif
+extern void RelationBuildLocator(Relation rel);
+extern void FreeRelationLocInfo(RelationLocInfo *relationLocInfo);
+
+extern bool IsTypeModuloDistributable(Oid col_type);
+extern char *GetRelationModuloColumn(RelationLocInfo *rel_loc_info);
+extern bool IsModuloColumn(RelationLocInfo *rel_loc_info, char *part_col_name);
+extern bool IsModuloColumnForRelId(Oid relid, char *part_col_name);
+extern char *GetRelationDistColumn(RelationLocInfo *rel_loc_info);
+extern bool IsDistColumnForRelId(Oid relid, char *part_col_name);
+extern void FreeExecNodes(ExecNodes **exec_nodes);
#endif /* LOCATOR_H */
diff --git a/src/include/pgxc/pause.h b/src/include/pgxc/pause.h
new file mode 100644
index 0000000000..1ed26ac555
--- /dev/null
+++ b/src/include/pgxc/pause.h
@@ -0,0 +1,38 @@
+/*-------------------------------------------------------------------------
+ *
+ * pause.h
+ *
+ * Definitions for the Pause/Unpause Cluster handling
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef PAUSE_H
+#define PAUSE_H
+
+#include "storage/s_lock.h"
+
+/* Shared memory area for management of cluster pause/unpause */
+typedef struct {
+ int cl_holder_pid; /* pid of the process issuing CLUSTER PAUSE */
+ int cl_process_count; /* Number of processes undergoing txns */
+
+ slock_t cl_mutex; /* locks shared variables mentioned above */
+} ClusterLockInfo;
+
+extern ClusterLockInfo *ClustLinfo;
+
+extern bool cluster_lock_held;
+extern bool cluster_ex_lock_held;
+
+extern void ClusterLockShmemInit(void);
+extern Size ClusterLockShmemSize(void);
+extern void AcquireClusterLock(bool exclusive);
+extern void ReleaseClusterLock(bool exclusive);
+
+extern void RequestClusterPause(bool pause, char *completionTag);
+extern void PGXCCleanClusterLock(int code, Datum arg);
+#endif
diff --git a/src/include/pgxc/pgxc.h b/src/include/pgxc/pgxc.h
index 21cd9c6beb..60c0d138b9 100644
--- a/src/include/pgxc/pgxc.h
+++ b/src/include/pgxc/pgxc.h
@@ -4,6 +4,11 @@
* Postgres-XC flags and connection control information
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2011 PostgreSQL Global Development Group
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
@@ -15,9 +20,14 @@
#define PGXC_H
#include "storage/lwlock.h"
+#include "postgres.h"
extern bool isPGXCCoordinator;
extern bool isPGXCDataNode;
+extern bool isRestoreMode;
+#ifdef XCP
+extern char *parentPGXCNode;
+#endif
typedef enum
{
@@ -36,8 +46,14 @@ extern char *PGXCNodeName;
extern int PGXCNodeId;
extern uint32 PGXCNodeIdentifier;
+extern Datum xc_lockForBackupKey1;
+extern Datum xc_lockForBackupKey2;
+
#define IS_PGXC_COORDINATOR isPGXCCoordinator
#define IS_PGXC_DATANODE isPGXCDataNode
+#ifdef XCP
+#define PGXC_PARENT_NODE parentPGXCNode
+#endif
#define REMOTE_CONN_TYPE remoteConnType
#define IsConnFromApp() (remoteConnType == REMOTE_CONN_APP)
@@ -45,4 +61,9 @@ extern uint32 PGXCNodeIdentifier;
#define IsConnFromDatanode() (remoteConnType == REMOTE_CONN_DATANODE)
#define IsConnFromGtm() (remoteConnType == REMOTE_CONN_GTM)
#define IsConnFromGtmProxy() (remoteConnType == REMOTE_CONN_GTM_PROXY)
-#endif /* PGXC_H */
+
+/* key pair to be used as object id while using advisory lock for backup */
+#define XC_LOCK_FOR_BACKUP_KEY_1 0xFFFF
+#define XC_LOCK_FOR_BACKUP_KEY_2 0xFFFF
+
+#endif /* PGXC_H */
diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h
index 31f973cbca..621e4a9a45 100644
--- a/src/include/pgxc/pgxcnode.h
+++ b/src/include/pgxc/pgxcnode.h
@@ -5,6 +5,11 @@
* Utility functions to communicate to Datanodes and Coordinators
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group ?
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
@@ -24,7 +29,6 @@
#define NO_SOCKET -1
-
/* Connection to Datanode maintained by Pool Manager */
typedef struct PGconn NODE_CONNECTION;
typedef struct PGcancel NODE_CANCEL;
@@ -34,6 +38,7 @@ typedef enum
{
DN_CONNECTION_STATE_IDLE, /* idle, ready for query */
DN_CONNECTION_STATE_QUERY, /* query is sent, response expected */
+ DN_CONNECTION_STATE_CLOSE, /* close is sent, confirmation expected */
DN_CONNECTION_STATE_ERROR_FATAL, /* fatal error */
DN_CONNECTION_STATE_COPY_IN,
DN_CONNECTION_STATE_COPY_OUT
@@ -46,6 +51,7 @@ typedef enum
HANDLE_DEFAULT
} PGXCNode_HandleRequested;
+#ifndef XCP
/*
* Enumeration for two purposes
* 1. To indicate to the HandleCommandComplete function whether response checking is required or not
@@ -64,7 +70,7 @@ typedef enum
RESP_ROLLBACK_RECEIVED, /* Response is ROLLBACK */
RESP_ROLLBACK_NOT_RECEIVED /* Response is NOT ROLLBACK */
}RESP_ROLLBACK;
-
+#endif
#define DN_CONNECTION_STATE_ERROR(dnconn) \
((dnconn)->state == DN_CONNECTION_STATE_ERROR_FATAL \
@@ -83,7 +89,12 @@ struct pgxc_node_handle
/* Connection state */
char transaction_status;
DNConnectionState state;
+#ifdef XCP
+ bool read_only;
+ struct ResponseCombiner *combiner;
+#else
struct RemoteQueryState *combiner;
+#endif
#ifdef DN_CONNECTION_DEBUG
bool have_row_desc;
#endif
@@ -98,14 +109,17 @@ struct pgxc_node_handle
size_t inStart;
size_t inEnd;
size_t inCursor;
-
/*
* Have a variable to enable/disable response checking and
* if enable then read the result of response checking
*
* For details see comments of RESP_ROLLBACK
*/
+#ifdef XCP
+ bool ck_resp_rollback;
+#else
RESP_ROLLBACK ck_resp_rollback;
+#endif
};
typedef struct pgxc_node_handle PGXCNodeHandle;
@@ -122,27 +136,46 @@ typedef struct
extern void InitMultinodeExecutor(bool is_force);
/* Open/close connection routines (invoked from Pool Manager) */
+#ifdef XCP
+extern char *PGXCNodeConnStr(char *host, int port, char *dbname, char *user,
+ char *remote_type, char *parent_node);
+#else
extern char *PGXCNodeConnStr(char *host, int port, char *dbname, char *user,
char *pgoptions, char *remote_type);
+#endif
extern NODE_CONNECTION *PGXCNodeConnect(char *connstr);
+#ifndef XCP
extern int PGXCNodeSendSetQuery(NODE_CONNECTION *conn, const char *sql_command);
+#endif
extern void PGXCNodeClose(NODE_CONNECTION * conn);
extern int PGXCNodeConnected(NODE_CONNECTION * conn);
extern int PGXCNodeConnClean(NODE_CONNECTION * conn);
extern void PGXCNodeCleanAndRelease(int code, Datum arg);
+#ifdef XCP
+extern PGXCNodeHandle *get_any_handle(List *datanodelist);
+#endif
/* Look at information cached in node handles */
+#ifdef XCP
+extern int PGXCNodeGetNodeId(Oid nodeoid, char *node_type);
+extern int PGXCNodeGetNodeIdFromName(char *node_name, char *node_type);
+#else
extern int PGXCNodeGetNodeId(Oid nodeoid, char node_type);
-extern Oid PGXCNodeGetNodeOid(int nodeid, char node_type);
extern int PGXCNodeGetNodeIdFromName(char *node_name, char node_type);
+#endif
+extern Oid PGXCNodeGetNodeOid(int nodeid, char node_type);
extern PGXCNodeAllHandles *get_handles(List *datanodelist, List *coordlist, bool is_query_coord_only);
+#ifdef XCP
+extern PGXCNodeAllHandles *get_current_handles(void);
+#endif
extern void pfree_pgxc_all_handles(PGXCNodeAllHandles *handles);
extern void release_handles(void);
+#ifndef XCP
extern void cancel_query(void);
extern void clear_all_data(void);
-
+#endif
extern int get_transaction_nodes(PGXCNodeHandle ** connections,
char client_conn_type,
@@ -171,6 +204,11 @@ extern int pgxc_node_send_query_extended(PGXCNodeHandle *handle, const char *que
int num_params, Oid *param_types,
int paramlen, char *params,
bool send_describe, int fetch_size);
+#ifdef XCP
+extern int pgxc_node_send_plan(PGXCNodeHandle * handle, const char *statement,
+ const char *query, const char *planstr,
+ short num_params, Oid *param_types);
+#endif
extern int pgxc_node_send_gxid(PGXCNodeHandle * handle, GlobalTransactionId gxid);
extern int pgxc_node_send_cmd_id(PGXCNodeHandle *handle, CommandId cid);
extern int pgxc_node_send_snapshot(PGXCNodeHandle * handle, Snapshot snapshot);
@@ -185,8 +223,10 @@ extern int send_some(PGXCNodeHandle * handle, int len);
extern int pgxc_node_flush(PGXCNodeHandle *handle);
extern void pgxc_node_flush_read(PGXCNodeHandle *handle);
+#ifndef XCP
extern int pgxc_all_handles_send_gxid(PGXCNodeAllHandles *pgxc_handles, GlobalTransactionId gxid, bool stop_at_error);
extern int pgxc_all_handles_send_query(PGXCNodeAllHandles *pgxc_handles, const char *buffer, bool stop_at_error);
+#endif
extern char get_message(PGXCNodeHandle *conn, int *len, char **msg);
@@ -194,4 +234,13 @@ extern void add_error_message(PGXCNodeHandle * handle, const char *message);
extern Datum pgxc_execute_on_nodes(int numnodes, Oid *nodelist, char *query);
+#ifdef XCP
+extern void PGXCNodeSetParam(bool local, const char *name, const char *value);
+extern void PGXCNodeResetParams(bool only_local);
+extern char *PGXCNodeGetSessionParamStr(void);
+extern char *PGXCNodeGetTransactionParamStr(void);
+extern void pgxc_node_set_query(PGXCNodeHandle *handle, const char *set_query);
+extern void RequestInvalidateRemoteHandles(void);
+#endif
+
#endif /* PGXCNODE_H */
diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h
new file mode 100644
index 0000000000..6ee83fb0c0
--- /dev/null
+++ b/src/include/pgxc/planner.h
@@ -0,0 +1,236 @@
+/*-------------------------------------------------------------------------
+ *
+ * planner.h
+ *		Externally declared Postgres-XC query planner definitions
+ *
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ * src/include/pgxc/planner.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PGXCPLANNER_H
+#define PGXCPLANNER_H
+
+#include "fmgr.h"
+#include "lib/stringinfo.h"
+#include "nodes/params.h"
+#include "nodes/parsenodes.h"
+#include "nodes/plannodes.h"
+#include "nodes/primnodes.h"
+#include "pgxc/locator.h"
+#include "tcop/dest.h"
+#include "nodes/relation.h"
+
+
+typedef enum
+{
+ COMBINE_TYPE_NONE, /* it is known that no row count, do not parse */
+ COMBINE_TYPE_SUM, /* sum row counts (partitioned, round robin) */
+ COMBINE_TYPE_SAME /* expect all row counts to be the same (replicated write) */
+} CombineType;
+
+/* For sorting within RemoteQuery handling */
+/*
+ * It is pretty much like Sort, but without Plan. We may use Sort later.
+ */
+typedef struct
+{
+ NodeTag type;
+ int numCols; /* number of sort-key columns */
+ AttrNumber *sortColIdx; /* their indexes in the target list */
+ Oid *sortOperators; /* OIDs of operators to sort them by */
+ Oid *sortCollations;
+ bool *nullsFirst; /* NULLS FIRST/LAST directions */
+} SimpleSort;
+
+/*
+ * Determines if query has to be launched
+ * on Coordinators only (SEQUENCE DDL),
+ * on Datanodes (normal Remote Queries),
+ * or on all Postgres-XC nodes (Utilities and DDL).
+ */
+typedef enum
+{
+#ifdef XCP
+ EXEC_ON_CURRENT,
+#endif
+ EXEC_ON_DATANODES,
+ EXEC_ON_COORDS,
+ EXEC_ON_ALL_NODES,
+ EXEC_ON_NONE
+} RemoteQueryExecType;
+
+typedef enum
+{
+ EXEC_DIRECT_NONE,
+ EXEC_DIRECT_LOCAL,
+ EXEC_DIRECT_LOCAL_UTILITY,
+ EXEC_DIRECT_UTILITY,
+ EXEC_DIRECT_SELECT,
+ EXEC_DIRECT_INSERT,
+ EXEC_DIRECT_UPDATE,
+ EXEC_DIRECT_DELETE
+} ExecDirectType;
+
+/*
+ * Contains instructions on processing a step of a query.
+ * In the prototype this will be simple, but it will eventually
+ * evolve into a GridSQL-style QueryStep.
+ */
+typedef struct
+{
+ Scan scan;
+ ExecDirectType exec_direct_type; /* track if remote query is execute direct and what type it is */
+ char *sql_statement;
+ ExecNodes *exec_nodes; /* List of Datanodes where to launch query */
+ CombineType combine_type;
+ SimpleSort *sort;
+ bool read_only; /* do not use 2PC when committing read only steps */
+ bool force_autocommit; /* some commands like VACUUM require autocommit mode */
+ char *statement; /* if specified use it as a PreparedStatement name on Datanodes */
+ char *cursor; /* if specified use it as a Portal name on Datanodes */
+ int remote_num_params; /* number of parameters specified for Prepared remote statement */
+ Oid *remote_param_types; /* parameter types, this pointer is shared
+ * across all the RemoteQuery nodes in the
+ * plan. So, don't change this once set.
+ */
+ RemoteQueryExecType exec_type;
+#ifndef XCP
+ bool is_temp; /* determine if this remote node is based
+ * on a temporary objects (no 2PC) */
+#endif
+ int reduce_level; /* in case of reduced JOIN, it's level */
+ List *base_tlist; /* in case of isReduced, the base tlist */
+ char *outer_alias;
+ char *inner_alias;
+ int outer_reduce_level;
+ int inner_reduce_level;
+ Relids outer_relids;
+ Relids inner_relids;
+ char *inner_statement;
+ char *outer_statement;
+ char *join_condition;
+ bool has_row_marks; /* Did SELECT had FOR UPDATE/SHARE? */
+ bool has_ins_child_sel_parent; /* This node is part of an INSERT SELECT that
+ * inserts into child by selecting from its parent */
+} RemoteQuery;
+
+
+#ifdef XCP
+/*
+ * Going to be a RemoteQuery replacement.
+ * Submit left subplan to the nodes defined by the Distribution and combine
+ * results.
+ */
+typedef struct
+{
+ Scan scan;
+ char distributionType;
+ AttrNumber distributionKey;
+ List *distributionNodes;
+ List *distributionRestrict;
+ List *nodeList;
+ bool execOnAll;
+ SimpleSort *sort;
+ char *cursor;
+ int unique;
+} RemoteSubplan;
+#endif
+
+
+/*
+ * FQS_context
+ * This context structure is used by the Fast Query Shipping walker, to gather
+ * information during analysing query for Fast Query Shipping.
+ */
+typedef struct
+{
+ bool sc_for_expr; /* if false, the we are checking shippability
+ * of the Query, otherwise, we are checking
+ * shippability of a stand-alone expression.
+ */
+ Bitmapset *sc_shippability; /* The conditions for (un)shippability of the
+ * query.
+ */
+ Query *sc_query; /* the query being analysed for FQS */
+ int sc_query_level; /* level of the query */
+ int sc_max_varlevelsup; /* maximum upper level referred to by any
+ * variable reference in the query. If this
+ * value is greater than 0, the query is not
+ * shippable, if shipped alone.
+ */
+ ExecNodes *sc_exec_nodes; /* nodes where the query should be executed */
+ ExecNodes *sc_subquery_en; /* ExecNodes produced by merging the ExecNodes
+ * for individual subqueries. This gets
+ * ultimately merged with sc_exec_nodes.
+ */
+} Shippability_context;
+
+/* enum for reasons as to why a query/expression is not FQSable */
+typedef enum
+{
+ SS_UNSHIPPABLE_EXPR = 0, /* it has unshippable expression */
+ SS_NEED_SINGLENODE, /* Has expressions which can be evaluated when
+ * there is only a single node involved.
+									 * Although aggregates too fit in this class, we
+ * have a separate status to report aggregates,
+ * see below.
+ */
+ SS_NEEDS_COORD, /* the query needs Coordinator */
+ SS_VARLEVEL, /* one of its subqueries has a VAR
+ * referencing an upper level query
+ * relation
+ */
+ SS_NO_NODES, /* no suitable nodes can be found to ship
+ * the query
+ */
+ SS_UNSUPPORTED_EXPR, /* it has expressions currently unsupported
+ * by FQS, but such expressions might be
+ * supported by FQS in future
+ */
+ SS_HAS_AGG_EXPR /* it has aggregate expressions */
+} ShippabilityStat;
+
+#ifndef XCP
+/* global variable corresponding to the GUC with same name */
+extern bool enable_fast_query_shipping;
+/* forbid SQL if unsafe, useful to turn off for development */
+extern bool StrictStatementChecking;
+
+/* forbid SELECT even multi-node ORDER BY */
+extern bool StrictSelectChecking;
+
+extern PlannedStmt *pgxc_planner(Query *query, int cursorOptions,
+ ParamListInfo boundParams);
+extern bool IsHashDistributable(Oid col_type);
+
+extern ExecNodes *IsJoinReducible(RemoteQuery *innernode, RemoteQuery *outernode,
+ Relids in_relids, Relids out_relids,
+ Join *join, JoinPath *join_path, List *rtable);
+
+extern List *AddRemoteQueryNode(List *stmts, const char *queryString,
+ RemoteQueryExecType remoteExecType, bool is_temp);
+extern bool pgxc_query_contains_temp_tables(List *queries);
+extern Expr *pgxc_find_distcol_expr(Index varno, PartAttrNumber partAttrNum, Node *quals);
+extern bool pgxc_query_contains_utility(List *queries);
+#endif
+extern bool pgxc_shippability_walker(Node *node, Shippability_context *sc_context);
+extern bool pgxc_test_shippability_reason(Shippability_context *context,
+ ShippabilityStat reason);
+
+#ifdef XCP
+extern PlannedStmt *pgxc_direct_planner(Query *query, int cursorOptions,
+ ParamListInfo boundParams);
+extern List *AddRemoteQueryNode(List *stmts, const char *queryString,
+ RemoteQueryExecType remoteExecType);
+#endif
+
+#endif /* PGXCPLANNER_H */
diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h
index 7181968166..1fc04f2007 100644
--- a/src/include/pgxc/poolmgr.h
+++ b/src/include/pgxc/poolmgr.h
@@ -5,6 +5,11 @@
* Definitions for the Datanode connection pool.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
@@ -24,6 +29,7 @@
#define MAX_IDLE_TIME 60
+#ifndef XCP
/*
* List of flags related to pooler connection clean up when disconnecting
* a session or relaeasing handles.
@@ -57,11 +63,16 @@ typedef enum
POOL_CMD_LOCAL_SET, /* Local SET flag, current transaction block only */
POOL_CMD_GLOBAL_SET /* Global SET flag */
} PoolCommandType;
+#endif
/* Connection pool entry */
typedef struct
{
+#ifdef XCP
+ time_t released;
+#else
struct timeval released;
+#endif
NODE_CONNECTION *conn;
NODE_CANCEL *xc_cancelConn;
} PGXCNodePoolSlot;
@@ -81,11 +92,16 @@ typedef struct databasepool
{
char *database;
char *user_name;
+#ifndef XCP
char *pgoptions; /* Connection options */
+#endif
HTAB *nodePools; /* Hashtable of PGXCNodePool, one entry for each
* Coordinator or DataNode */
MemoryContext mcxt;
struct databasepool *next; /* Reference to next to organize linked list */
+#ifdef XCP
+ time_t oldest_idle;
+#endif
} DatabasePool;
/*
@@ -107,19 +123,28 @@ typedef struct
Oid *coord_conn_oids; /* one for each Coordinator */
PGXCNodePoolSlot **dn_connections; /* one for each Datanode */
PGXCNodePoolSlot **coord_connections; /* one for each Coordinator */
+#ifndef XCP
char *session_params;
char *local_params;
bool is_temp; /* Temporary objects used for this pool session? */
+#endif
} PoolAgent;
+#ifndef XCP
/* Handle to the pool manager (Session's side) */
typedef struct
{
/* communication channel */
PoolPort port;
} PoolHandle;
+#endif
+#ifdef XCP
+extern int PoolConnKeepAlive;
+extern int PoolMaintenanceTimeout;
+#else
extern int MinPoolSize;
+#endif
extern int MaxPoolSize;
extern int PoolerPort;
@@ -135,6 +160,7 @@ extern int PoolManagerInit(void);
/* Destroy internal structures */
extern int PoolManagerDestroy(void);
+#ifndef XCP
/*
* Get handle to pool manager. This function should be called just before
* forking off new session. It creates PoolHandle, PoolAgent and a pipe between
@@ -150,12 +176,14 @@ extern PoolHandle *GetPoolManagerHandle(void);
* free memory occupied by PoolHandler
*/
extern void PoolManagerCloseHandle(PoolHandle *handle);
+#endif
/*
* Gracefully close connection to the PoolManager
*/
extern void PoolManagerDisconnect(void);
+#ifndef XCP
extern char *session_options(void);
/*
@@ -166,6 +194,7 @@ extern char *session_options(void);
extern void PoolManagerConnect(PoolHandle *handle,
const char *database, const char *user_name,
char *pgoptions);
+#endif
/*
* Reconnect to pool manager
@@ -173,6 +202,8 @@ extern void PoolManagerConnect(PoolHandle *handle,
*/
extern void PoolManagerReconnect(void);
+
+#ifndef XCP
/*
* Save a SET command in Pooler.
* This command is run on existent agent connections
@@ -180,6 +211,7 @@ extern void PoolManagerReconnect(void);
* are requested.
*/
extern int PoolManagerSetCommand(PoolCommandType command_type, const char *set_command);
+#endif
/* Get pooled connections */
extern int *PoolManagerGetConnections(List *datanodelist, List *coordlist);
@@ -197,7 +229,11 @@ extern void PoolManagerReloadConnectionInfo(void);
extern int PoolManagerAbortTransactions(char *dbname, char *username, int **proc_pids);
/* Return connections back to the pool, for both Coordinator and Datanode connections */
+#ifdef XCP
+extern void PoolManagerReleaseConnections(bool destroy);
+#else
extern void PoolManagerReleaseConnections(void);
+#endif
/* Cancel a running query on Datanodes as well as on other Coordinators */
extern void PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list);
@@ -205,10 +241,12 @@ extern void PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int
/* Lock/unlock pool manager */
extern void PoolManagerLock(bool is_lock);
+#ifndef XCP
/* Check if pool has a handle */
extern bool IsPoolHandle(void);
/* Send commands to alter the behavior of current transaction */
extern int PoolManagerSendLocalCommand(int dn_count, int* dn_list, int co_count, int* co_list);
+#endif
#endif
diff --git a/src/include/pgxc/postgresql_fdw.h b/src/include/pgxc/postgresql_fdw.h
new file mode 100644
index 0000000000..57ab2b7d1d
--- /dev/null
+++ b/src/include/pgxc/postgresql_fdw.h
@@ -0,0 +1,23 @@
+/*-------------------------------------------------------------------------
+ *
+ * postgresql_fdw.h
+ *
+ * foreign-data wrapper for PostgreSQL
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 2010-2012, Postgres-XC Development Group
+ *
+ * src/include/pgxc/postgresql_fdw.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef POSTGRES_FDW_H
+#define POSTGRES_FDW_H
+
+#include "postgres.h"
+#include "pgxc/execRemote.h"
+
+bool is_immutable_func(Oid funcid);
+bool pgxc_is_expr_shippable(Expr *node, bool *has_aggs);
+#endif
diff --git a/src/include/pgxc/remotecopy.h b/src/include/pgxc/remotecopy.h
index 93368c0ada..6adb386306 100644
--- a/src/include/pgxc/remotecopy.h
+++ b/src/include/pgxc/remotecopy.h
@@ -16,6 +16,9 @@
#define REMOTECOPY_H
#include "nodes/parsenodes.h"
+#ifdef XCP
+#include "pgxc/locator.h"
+#endif
/*
* This contains the set of data necessary for remote COPY control.
@@ -32,15 +35,21 @@ typedef struct RemoteCopyData {
* as copy source or destination
*/
StringInfoData query_buf;
-
+#ifdef XCP
+ Locator *locator; /* the locator object */
+ Oid dist_type; /* data type of the distribution column */
+#else
/* Execution nodes for COPY */
ExecNodes *exec_nodes;
+#endif
/* Locator information */
RelationLocInfo *rel_loc; /* the locator key */
+#ifndef XCP
int idx_dist_by_col; /* index of the distributed by column */
PGXCNodeHandle **connections; /* Involved Datanode connections */
+#endif
} RemoteCopyData;
/*
diff --git a/src/include/pgxc/squeue.h b/src/include/pgxc/squeue.h
new file mode 100644
index 0000000000..4cac658fb4
--- /dev/null
+++ b/src/include/pgxc/squeue.h
@@ -0,0 +1,60 @@
+/*-------------------------------------------------------------------------
+ *
+ * squeue.h
+ *
+ * Definitions for the shared queue handling
+ *
+ *
+ * Copyright (c) 2012-2014, TransLattice, Inc.
+ *
+ * IDENTIFICATION
+ * $$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef SQUEUE_H
+#define SQUEUE_H
+
+#include "postgres.h"
+#include "executor/tuptable.h"
+#include "nodes/pg_list.h"
+#include "utils/tuplestore.h"
+
+extern PGDLLIMPORT int NSQueues;
+extern PGDLLIMPORT int SQueueSize;
+
+/* Fixed size of shared queue, maybe need to be GUC configurable */
+#define SQUEUE_SIZE ((long) SQueueSize * 1024L)
+/* Number of shared queues, maybe need to be GUC configurable */
+#define NUM_SQUEUES ((long) NSQueues)
+
+#define SQUEUE_KEYSIZE (64)
+
+#define SQ_CONS_SELF -1
+#define SQ_CONS_NONE -2
+
+typedef struct SQueueHeader *SharedQueue;
+
+extern Size SharedQueueShmemSize(void);
+extern void SharedQueuesInit(void);
+extern void SharedQueueAcquire(const char *sqname, int ncons);
+extern SharedQueue SharedQueueBind(const char *sqname, List *consNodes,
+ List *distNodes, int *myindex, int *consMap);
+extern void SharedQueueUnBind(SharedQueue squeue);
+extern void SharedQueueRelease(const char *sqname);
+extern void SharedQueuesCleanup(int code, Datum arg);
+
+extern int SharedQueueFinish(SharedQueue squeue, TupleDesc tupDesc,
+ Tuplestorestate **tuplestore);
+
+extern void SharedQueueWrite(SharedQueue squeue, int consumerIdx,
+ TupleTableSlot *slot, Tuplestorestate **tuplestore,
+ MemoryContext tmpcxt);
+extern bool SharedQueueRead(SharedQueue squeue, int consumerIdx,
+ TupleTableSlot *slot, bool canwait);
+extern void SharedQueueReset(SharedQueue squeue, int consumerIdx);
+extern int SharedQueueResetNotConnected(SharedQueue squeue);
+extern bool SharedQueueCanPause(SharedQueue squeue);
+
+#endif
diff --git a/src/include/storage/backendid.h b/src/include/storage/backendid.h
index 8879e2129e..6b951b1566 100644
--- a/src/include/storage/backendid.h
+++ b/src/include/storage/backendid.h
@@ -4,6 +4,11 @@
* POSTGRES backend id communication definitions
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -24,4 +29,19 @@ typedef int BackendId; /* unique currently active backend identifier */
extern PGDLLIMPORT BackendId MyBackendId; /* backend id of this backend */
+#ifdef XCP
+/*
+ * Two next variables make up distributed session id. Actual distributed
+ * session id is a string, which includes coordinator node name, but
+ * it is better to use Oid to store and compare with distributed session ids
+ * of other backends under the same postmaster.
+ */
+extern PGDLLIMPORT Oid MyCoordId;
+
+extern PGDLLIMPORT int MyCoordPid;
+
+/* BackendId of the first backend of the distributed session on the node */
+extern PGDLLIMPORT BackendId MyFirstBackendId;
+#endif
+
#endif /* BACKENDID_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 7c0fb01cb4..6634554232 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -4,6 +4,11 @@
* Lightweight lock manager
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -75,6 +80,9 @@ typedef enum LWLockId
BarrierLock,
NodeTableLock,
#endif
+#ifdef XCP
+ SQueuesLock,
+#endif
RelationMappingLock,
AsyncCtlLock,
AsyncQueueLock,
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 768e4f89df..8d861a6cfd 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -4,6 +4,11 @@
* per-process shared memory data structures
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -91,6 +96,12 @@ struct PGPROC
BackendId backendId; /* This backend's backend ID (if assigned) */
Oid databaseId; /* OID of database this backend is using */
Oid roleId; /* OID of role using this backend */
+#ifdef XCP
+ Oid coordId; /* Oid of originating coordinator */
+ int coordPid; /* Pid of the originating session */
+ BackendId firstBackendId; /* Backend ID of the first backend of
+ * the distributed session */
+#endif
/*
* While in hot standby mode, shows that a conflict signal has been sent
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index 73b3dabc9b..3bb98b0455 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -4,6 +4,11 @@
* POSTGRES process array definitions.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -81,5 +86,8 @@ extern bool CountOtherDBBackends(Oid databaseId,
extern void XidCacheRemoveRunningXids(TransactionId xid,
int nxids, const TransactionId *xids,
TransactionId latestXid);
-
+#ifdef XCP
+extern void GetGlobalSessionInfo(int pid, Oid *coordId, int *coordPid);
+extern int GetFirstBackendId(int *numBackends, int *backends);
+#endif /* XCP */
#endif /* PROCARRAY_H */
diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h
index a44659b064..c0127d4f25 100644
--- a/src/include/storage/procsignal.h
+++ b/src/include/storage/procsignal.h
@@ -4,6 +4,11 @@
* Routines for interprocess signalling
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index f8fc2b2d6e..18c8b98016 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -4,6 +4,11 @@
* storage manager switch public interface declarations.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -68,8 +73,14 @@ typedef struct SMgrRelationData
typedef SMgrRelationData *SMgrRelation;
+#ifdef XCP
+#define SmgrIsTemp(smgr) \
+ (!OidIsValid(MyCoordId) && \
+ ((smgr)->smgr_rnode.backend != InvalidBackendId))
+#else
#define SmgrIsTemp(smgr) \
((smgr)->smgr_rnode.backend != InvalidBackendId)
+#endif
extern void smgrinit(void);
extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend);
diff --git a/src/include/tcop/dest.h b/src/include/tcop/dest.h
index 1a8ff4a23b..cdeb8b4810 100644
--- a/src/include/tcop/dest.h
+++ b/src/include/tcop/dest.h
@@ -57,6 +57,11 @@
* calls in portal and cursor manipulations.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -94,6 +99,10 @@ typedef enum
DestIntoRel, /* results sent to relation (SELECT INTO) */
DestCopyOut, /* results sent to COPY TO code */
DestSQLFunction /* results sent to SQL-language func mgr */
+#ifdef XCP
+ ,
+ DestProducer /* results sent to a SharedQueue */
+#endif
} CommandDest;
/* ----------------
diff --git a/src/include/tcop/pquery.h b/src/include/tcop/pquery.h
index 22aad2e96c..d91c2a76a0 100644
--- a/src/include/tcop/pquery.h
+++ b/src/include/tcop/pquery.h
@@ -4,6 +4,11 @@
* prototypes for pquery.c.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -42,4 +47,9 @@ extern long PortalRunFetch(Portal portal,
long count,
DestReceiver *dest);
+#ifdef XCP
+extern int AdvanceProducingPortal(Portal portal, bool can_wait);
+extern void cleanupClosedProducers(void);
+#endif
+
#endif /* PQUERY_H */
diff --git a/src/include/tcop/utility.h b/src/include/tcop/utility.h
index 502406ce62..71554f8342 100644
--- a/src/include/tcop/utility.h
+++ b/src/include/tcop/utility.h
@@ -56,4 +56,8 @@ extern bool CommandIsReadOnly(Node *parsetree);
extern void CheckRelationOwnership(RangeVar *rel, bool noCatalogs);
+#ifdef PGXC
+extern bool pgxc_lock_for_utility_stmt(Node *parsetree);
+#endif
+
#endif /* UTILITY_H */
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index 571697eb87..87b3a1403a 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -4,6 +4,11 @@
* Declarations for operations on built-in types.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -541,6 +546,7 @@ extern Datum void_recv(PG_FUNCTION_ARGS);
extern Datum void_send(PG_FUNCTION_ARGS);
#ifdef PGXC
extern Datum pgxc_node_str (PG_FUNCTION_ARGS);
+extern Datum pgxc_lock_for_backup (PG_FUNCTION_ARGS);
#endif
extern Datum trigger_in(PG_FUNCTION_ARGS);
extern Datum trigger_out(PG_FUNCTION_ARGS);
@@ -661,9 +667,9 @@ extern Datum pg_get_function_result(PG_FUNCTION_ARGS);
extern char *deparse_expression(Node *expr, List *dpcontext,
bool forceprefix, bool showimplicit);
#ifdef PGXC
-extern void deparse_query(Query *query, StringInfo buf, List *parentnamespace,
- bool finalise_aggs, bool sortgroup_colno);
-extern void deparse_targetlist(Query *query, List *targetList, StringInfo buf);
+extern List *deparse_context_for_remotequery(Alias *aliasname, Oid relid);
+extern void get_query_def_from_valuesList(Query *query, StringInfo buf);
+extern void deparse_query(Query *query, StringInfo buf, List *parentnamespace);
#endif
extern List *deparse_context_for(const char *aliasname, Oid relid);
extern List *deparse_context_for_planstate(Node *planstate, List *ancestors,
@@ -809,8 +815,10 @@ extern Datum text_format_nv(PG_FUNCTION_ARGS);
/* version.c */
extern Datum pgsql_version(PG_FUNCTION_ARGS);
#ifdef PGXC
+#ifndef XCP
extern Datum pgxc_version(PG_FUNCTION_ARGS);
#endif
+#endif
/* xid.c */
extern Datum xidin(PG_FUNCTION_ARGS);
@@ -1182,6 +1190,11 @@ extern Datum pg_cursor(PG_FUNCTION_ARGS);
/* backend/pgxc/pool/poolutils.c */
extern Datum pgxc_pool_check(PG_FUNCTION_ARGS);
extern Datum pgxc_pool_reload(PG_FUNCTION_ARGS);
+
+#ifdef XCP
+/* backend/pgxc/cluster/stormutils.c */
+extern Datum stormdb_promote_standby(PG_FUNCTION_ARGS);
+#endif
#endif
/* backend/access/transam/transam.c */
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index 1a28efcc3f..94a8a174ac 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -4,6 +4,11 @@
* External declarations pertaining to backend/utils/misc/guc.c and
* backend/utils/misc/guc-file.l
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Copyright (c) 2000-2012, PostgreSQL Global Development Group
* Written by Peter Eisentraut <[email protected]>.
*
@@ -225,6 +230,10 @@ extern int tcp_keepalives_idle;
extern int tcp_keepalives_interval;
extern int tcp_keepalives_count;
+#ifdef XCP
+extern char *storm_catalog_remap_string;
+#endif
+
/*
* Functions exported by guc.c
*/
diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h
index 7709a3a088..471c2492b9 100644
--- a/src/include/utils/lsyscache.h
+++ b/src/include/utils/lsyscache.h
@@ -3,6 +3,11 @@
* lsyscache.h
* Convenience routines for common queries in the system catalog cache.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -69,6 +74,11 @@ extern int32 get_atttypmod(Oid relid, AttrNumber attnum);
extern void get_atttypetypmodcoll(Oid relid, AttrNumber attnum,
Oid *typid, int32 *typmod, Oid *collid);
extern char *get_collation_name(Oid colloid);
+#ifdef XCP
+extern Oid get_collation_namespace(Oid colloid);
+extern int32 get_collation_encoding(Oid colloid);
+extern Oid get_collid(const char *collname, int32 collencoding, Oid collnsp);
+#endif
extern char *get_constraint_name(Oid conoid);
extern Oid get_opclass_family(Oid opclass);
extern Oid get_opclass_input_type(Oid opclass);
@@ -166,6 +176,15 @@ extern void free_attstatsslot(Oid atttype,
Datum *values, int nvalues,
float4 *numbers, int nnumbers);
extern char *get_namespace_name(Oid nspid);
+#ifdef XCP
+extern Oid get_namespaceid(const char *nspname);
+extern char *get_typ_name(Oid typid);
+extern Oid get_typ_namespace(Oid typid);
+extern Oid get_typname_typid(const char *typname, Oid typnamespace);
+extern Oid get_funcid(const char *funcname, oidvector *argtypes, Oid funcnsp);
+extern Oid get_opnamespace(Oid opno);
+extern Oid get_operid(const char *oprname, Oid oprleft, Oid oprright, Oid oprnsp);
+#endif
extern Oid get_range_subtype(Oid rangeOid);
#define type_is_array(typid) (get_element_type(typid) != InvalidOid)
diff --git a/src/include/utils/plancache.h b/src/include/utils/plancache.h
index da66ac30b0..efe2a8e3ec 100644
--- a/src/include/utils/plancache.h
+++ b/src/include/utils/plancache.h
@@ -5,6 +5,11 @@
*
* See plancache.c for comments.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -161,5 +166,9 @@ extern CachedPlan *GetCachedPlan(CachedPlanSource *plansource,
ParamListInfo boundParams,
bool useResOwner);
extern void ReleaseCachedPlan(CachedPlan *plan, bool useResOwner);
+#ifdef XCP
+extern void SetRemoteSubplan(CachedPlanSource *plansource,
+ const char *plan_string);
+#endif
#endif /* PLANCACHE_H */
diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h
index 4833942654..5c883ace6b 100644
--- a/src/include/utils/portal.h
+++ b/src/include/utils/portal.h
@@ -36,6 +36,11 @@
* to look like NO SCROLL cursors.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -91,6 +96,10 @@ typedef enum PortalStrategy
PORTAL_ONE_MOD_WITH,
PORTAL_UTIL_SELECT,
PORTAL_MULTI_QUERY
+#ifdef XCP
+ ,
+ PORTAL_DISTRIBUTED
+#endif
} PortalStrategy;
/*
@@ -156,6 +165,9 @@ typedef struct PortalData
*/
Tuplestorestate *holdStore; /* store for holdable cursors */
MemoryContext holdContext; /* memory containing holdStore */
+#ifdef XCP
+ MemoryContext tmpContext; /* temporary memory */
+#endif
/*
* atStart, atEnd and portalPos indicate the current cursor position.
@@ -219,5 +231,12 @@ extern void PortalDefineQuery(Portal portal,
extern Node *PortalListGetPrimaryStmt(List *stmts);
extern void PortalCreateHoldStore(Portal portal);
extern void PortalHashTableDeleteAll(void);
+#ifdef XCP
+extern void PortalCreateProducerStore(Portal portal);
+extern List *getProducingPortals(void);
+extern void addProducingPortal(Portal portal);
+extern void removeProducingPortal(Portal portal);
+extern bool portalIsProducing(Portal portal);
+#endif
#endif /* PORTAL_H */
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index ff3eaec84d..3a5b6a6053 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -4,6 +4,11 @@
* POSTGRES relation descriptor (a/k/a relcache entry) definitions.
*
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
@@ -26,6 +31,9 @@
#endif
#include "rewrite/prs2lock.h"
#include "storage/block.h"
+#ifdef XCP
+#include "storage/proc.h"
+#endif
#include "storage/relfilenode.h"
#include "utils/relcache.h"
#include "utils/reltrigger.h"
@@ -366,15 +374,14 @@ typedef struct StdRdOptions
* RelationUsesLocalBuffers
* True if relation's pages are stored in local buffers.
*/
+#ifdef XCP
+#define RelationUsesLocalBuffers(relation) \
+	(!OidIsValid(MyCoordId) && \
+	 ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP))
+#else
#define RelationUsesLocalBuffers(relation) \
((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
-
-/*
- * RelationUsesTempNamespace
- * True if relation's catalog entries live in a private namespace.
- */
-#define RelationUsesTempNamespace(relation) \
- ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
+#endif
#ifdef PGXC
/*
@@ -385,15 +392,40 @@ typedef struct StdRdOptions
#endif
/*
+ * RelationUsesTempNamespace
+ * True if relation's catalog entries live in a private namespace.
+ */
+#define RelationUsesTempNamespace(relation) \
+ ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
+
+/*
* RELATION_IS_LOCAL
* If a rel is either temp or newly created in the current transaction,
* it can be assumed to be visible only to the current backend.
*
* Beware of multiple eval of argument
*/
+#ifdef XCP
+#define RELATION_IS_LOCAL(relation) \
+ ((!OidIsValid(MyCoordId) && (relation)->rd_backend == MyBackendId) || \
+ (OidIsValid(MyCoordId) && (relation)->rd_backend == MyFirstBackendId) || \
+ ((relation)->rd_backend == MyBackendId || \
+ (relation)->rd_createSubid != InvalidSubTransactionId))
+#else
#define RELATION_IS_LOCAL(relation) \
((relation)->rd_backend == MyBackendId || \
(relation)->rd_createSubid != InvalidSubTransactionId)
+#endif
+
+#ifdef XCP
+/*
+ * RelationGetLocatorType
+ * Returns the rel's locator type.
+ */
+#define RelationGetLocatorType(relation) \
+ ((relation)->rd_locator_info->locatorType)
+
+#endif
/*
* RELATION_IS_OTHER_TEMP
@@ -401,9 +433,17 @@ typedef struct StdRdOptions
*
* Beware of multiple eval of argument
*/
+#ifdef XCP
+#define RELATION_IS_OTHER_TEMP(relation) \
+ (((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP && \
+ (relation)->rd_backend != MyBackendId) && \
+ ((!OidIsValid(MyCoordId) && (relation)->rd_backend != MyBackendId) || \
+ (OidIsValid(MyCoordId) && (relation)->rd_backend != MyFirstBackendId)))
+#else
#define RELATION_IS_OTHER_TEMP(relation) \
((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP \
&& (relation)->rd_backend != MyBackendId)
+#endif
/* routines in utils/cache/relcache.c */
extern void RelationIncrementReferenceCount(Relation rel);
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index 249c2407e9..6c0d024cd0 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -10,6 +10,11 @@
* amounts are sorted using temporary files and a standard external sort
* algorithm.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -83,7 +88,11 @@ extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
extern Tuplesortstate *tuplesort_begin_merge(TupleDesc tupDesc,
int nkeys, AttrNumber *attNums,
Oid *sortOperators, Oid *sortCollations, bool *nullsFirstFlags,
+#ifdef XCP
+ ResponseCombiner *combiner,
+#else
RemoteQueryState *combiner,
+#endif
int workMem);
#endif
diff --git a/src/include/utils/tuplestore.h b/src/include/utils/tuplestore.h
index fd2ba4c75b..53c56ceea3 100644
--- a/src/include/utils/tuplestore.h
+++ b/src/include/utils/tuplestore.h
@@ -21,6 +21,11 @@
* Also, we have changed the API to return tuples in TupleTableSlots,
* so that there is a check to prevent attempted access to system columns.
*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Portions Copyright (c) 2012-2014, TransLattice, Inc.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
@@ -82,4 +87,14 @@ extern void tuplestore_clear(Tuplestorestate *state);
extern void tuplestore_end(Tuplestorestate *state);
+#ifdef XCP
+extern Tuplestorestate *tuplestore_begin_datarow(bool interXact, int maxKBytes,
+ MemoryContext tmpcxt);
+extern Tuplestorestate *tuplestore_begin_message(bool interXact, int maxKBytes);
+extern void tuplestore_putmessage(Tuplestorestate *state, int len, char* msg);
+extern char *tuplestore_getmessage(Tuplestorestate *state, int *len);
+#endif
+
+extern void tuplestore_collect_stat(Tuplestorestate *state, char *name);
+
#endif /* TUPLESTORE_H */
diff --git a/src/pl/plperl/expected/plperl_lc.out b/src/pl/plperl/expected/plperl_lc.out
new file mode 100644
index 0000000000..23c5fcb486
--- /dev/null
+++ b/src/pl/plperl/expected/plperl_lc.out
@@ -0,0 +1,23 @@
+CREATE OR REPLACE FUNCTION perl_0x80_in(text) RETURNS BOOL AS $$
+ return ($_[0] eq "abc\x80de" ? "true" : "false");
+$$ LANGUAGE plperl;
+SELECT perl_0x80_in(E'abc\x80de');
+ERROR: invalid byte sequence for encoding "UTF8": 0x80
+CREATE OR REPLACE FUNCTION perl_0x80_out() RETURNS TEXT AS $$
+ return "abc\x80de";
+$$ LANGUAGE plperl;
+SELECT perl_0x80_out() = E'abc\x80de';
+ERROR: invalid byte sequence for encoding "UTF8": 0x80
+CREATE OR REPLACE FUNCTION perl_utf_inout(text) RETURNS TEXT AS $$
+ $str = $_[0]; $code = "NotUTF8:"; $match = "ab\xe5\xb1\xb1cd";
+ if (utf8::is_utf8($str)) {
+ $code = "UTF8:"; utf8::decode($str); $match="ab\x{5c71}cd";
+ }
+ return ($str ne $match ? $code."DIFFER" : $code."ab\x{5ddd}cd");
+$$ LANGUAGE plperl;
+SELECT encode(perl_utf_inout(E'ab\xe5\xb1\xb1cd')::bytea, 'escape');
+ encode
+-----------------------
+ UTF8:ab\345\267\235cd
+(1 row)
+
diff --git a/src/pl/plperl/expected/plperl_lc_1.out b/src/pl/plperl/expected/plperl_lc_1.out
new file mode 100644
index 0000000000..ae873d4322
--- /dev/null
+++ b/src/pl/plperl/expected/plperl_lc_1.out
@@ -0,0 +1,31 @@
+CREATE OR REPLACE FUNCTION perl_0x80_in(text) RETURNS BOOL AS $$
+ return ($_[0] eq "abc\x80de" ? "true" : "false");
+$$ LANGUAGE plperl;
+SELECT perl_0x80_in(E'abc\x80de');
+ perl_0x80_in
+--------------
+ t
+(1 row)
+
+CREATE OR REPLACE FUNCTION perl_0x80_out() RETURNS TEXT AS $$
+ return "abc\x80de";
+$$ LANGUAGE plperl;
+SELECT perl_0x80_out() = E'abc\x80de';
+ ?column?
+----------
+ t
+(1 row)
+
+CREATE OR REPLACE FUNCTION perl_utf_inout(text) RETURNS TEXT AS $$
+ $str = $_[0]; $code = "NotUTF8:"; $match = "ab\xe5\xb1\xb1cd";
+ if (utf8::is_utf8($str)) {
+ $code = "UTF8:"; utf8::decode($str); $match="ab\x{5c71}cd";
+ }
+ return ($str ne $match ? $code."DIFFER" : $code."ab\x{5ddd}cd");
+$$ LANGUAGE plperl;
+SELECT encode(perl_utf_inout(E'ab\xe5\xb1\xb1cd')::bytea, 'escape');
+ encode
+--------------------------
+ NotUTF8:ab\345\267\235cd
+(1 row)
+
diff --git a/src/pl/plperl/sql/plperl_lc.sql b/src/pl/plperl/sql/plperl_lc.sql
new file mode 100644
index 0000000000..6c2026414e
--- /dev/null
+++ b/src/pl/plperl/sql/plperl_lc.sql
@@ -0,0 +1,16 @@
+CREATE OR REPLACE FUNCTION perl_0x80_in(text) RETURNS BOOL AS $$
+ return ($_[0] eq "abc\x80de" ? "true" : "false");
+$$ LANGUAGE plperl;
+SELECT perl_0x80_in(E'abc\x80de');
+CREATE OR REPLACE FUNCTION perl_0x80_out() RETURNS TEXT AS $$
+ return "abc\x80de";
+$$ LANGUAGE plperl;
+SELECT perl_0x80_out() = E'abc\x80de';
+CREATE OR REPLACE FUNCTION perl_utf_inout(text) RETURNS TEXT AS $$
+ $str = $_[0]; $code = "NotUTF8:"; $match = "ab\xe5\xb1\xb1cd";
+ if (utf8::is_utf8($str)) {
+ $code = "UTF8:"; utf8::decode($str); $match="ab\x{5c71}cd";
+ }
+ return ($str ne $match ? $code."DIFFER" : $code."ab\x{5ddd}cd");
+$$ LANGUAGE plperl;
+SELECT encode(perl_utf_inout(E'ab\xe5\xb1\xb1cd')::bytea, 'escape');
diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c
index 470586e3db..0a6ae59552 100644
--- a/src/pl/plpgsql/src/pl_exec.c
+++ b/src/pl/plpgsql/src/pl_exec.c
@@ -3048,6 +3048,17 @@ exec_stmt_execsql(PLpgSQL_execstate *estate,
q->commandType == CMD_UPDATE ||
q->commandType == CMD_DELETE)
stmt->mod_stmt = true;
+ /* PGXCTODO: Support a better parameter interface for XC with DMLs */
+ if (q->commandType == CMD_INSERT ||
+ q->commandType == CMD_UPDATE ||
+ q->commandType == CMD_DELETE)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+#ifdef XCP
+ errmsg("Postgres-XL does not support DML queries in PL/pgSQL")));
+#else
+ errmsg("Postgres-XC does not support DML queries in PL/pgSQL")));
+#endif
}
}
}