summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Paquier2012-07-24 07:13:55 +0000
committerMichael Paquier2012-07-24 07:35:37 +0000
commitd03ea805cef9375bee9b751e65d698c07c138bf5 (patch)
tree2e578af76c1ac515887ff0363f5224f57af64a92
parentbaa8c4a51cdd7de321169f12ebfb47b02fed3afc (diff)
Support for online data redistribution with ALTER TABLE
Online data redistribution is the possibility for a user to change the distribution strategy of a table. There are no restrictions in the modifications possible, meaning that all types of tables with all possible node subsets can be completely changed in one command. The SQL command used for redistribution is an extension of ALTER TABLE with those clauses specific to XC and already available in CREATE TABLE: DISTRIBUTE BY { REPLICATION | ROUND ROBIN | { [HASH | MODULO ] ( column_name ) } } TO { GROUP groupname | NODE ( nodename [, ... ] ) } ADD NODE ( nodename [, ... ] ) DELETE NODE ( nodename [, ... ] ) Those commands can be combined together without limitations. Several redistribution scenarios are implemented depending on the old and new distribution type of the table: - Default scenario: 1) Fetch the data of the table with a COPY TO and store it inside a tuplestore 2) Perform a TRUNCATE on the Datanodes 3) Perform a COPY TO with tuples inside tuplestore 4) REINDEX table if necessary This default scenario could also be managed by an external tool, however all the following optimizations need a node-level control to perform with highest efficiency possible. The performance of this scenario is equivalent to running a COPY TO/COPY FROM sequence on a table, so here performance is not bounded by the redistribution mechanism itself but by the COPY protocol used for data exchanged in network. - Replicated to replicated: In case of nodes removed from the set of nodes, those nodes are simply truncated, so this is really quick even on large sets of data. For new nodes, data is fetched on Coordinator from one Datanode with COPY TO, data is stored in a tuplestore, and then COPY FROM is launched only on the new nodes. - Replicated to distributed: If new nodes are added, a fallback to default scenario is made. If nodes are removed, those nodes are truncated. Finally, on the remaining nodes a DELETE query removing only the necessary tuples is launched to each remote node. 
In this case there is no data exchanged between nodes so performance is maximized. In order to support all those scenarios, a couple of new internal mechanisms have been added to XC: materialization on Coordinator of tuple slots and possibility to reuse them for redistribution purposes, externalization of a portion of PostgreSQL COPY code used by redistribution, reuse and extension of Postgres-XC APIs for remote COPY management. The tuplestore used to store tuples if necessary can have its allowed cache controlled with work_mem. The only thing to take care of is that the tuplestore data needs to be stored on Coordinator once so some additional disk space might be necessary on this server to perform redistribution correctly. Documentation, as well as a new set of regression tests have been added. Regression tests do checks on prepared statements, views, distribution types and subsets in a way completely transparent whatever the cluster configuration.
-rw-r--r--doc-xc/src/sgml/ref/alter_table.sgmlin224
-rw-r--r--src/backend/access/hash/hashfunc.c81
-rw-r--r--src/backend/catalog/heap.c28
-rw-r--r--src/backend/catalog/pgxc_class.c106
-rw-r--r--src/backend/commands/copy.c22
-rw-r--r--src/backend/commands/tablecmds.c531
-rw-r--r--src/backend/parser/gram.y34
-rw-r--r--src/backend/parser/parse_utilcmd.c4
-rw-r--r--src/backend/pgxc/copy/Makefile2
-rw-r--r--src/backend/pgxc/copy/copyops.c496
-rw-r--r--src/backend/pgxc/copy/remotecopy.c22
-rw-r--r--src/backend/pgxc/locator/Makefile4
-rw-r--r--src/backend/pgxc/locator/locator.c37
-rw-r--r--src/backend/pgxc/locator/redistrib.c871
-rw-r--r--src/backend/pgxc/nodemgr/nodemgr.c3
-rw-r--r--src/backend/pgxc/pool/execRemote.c133
-rw-r--r--src/include/access/hash.h1
-rw-r--r--src/include/catalog/pgxc_class.h25
-rw-r--r--src/include/nodes/parsenodes.h6
-rw-r--r--src/include/pgxc/copyops.h27
-rw-r--r--src/include/pgxc/execRemote.h17
-rw-r--r--src/include/pgxc/locator.h1
-rw-r--r--src/include/pgxc/redistrib.h80
-rw-r--r--src/include/pgxc/remotecopy.h1
-rw-r--r--src/include/utils/rel.h8
-rw-r--r--src/test/regress/expected/xc_alter_table.out408
-rw-r--r--src/test/regress/expected/xc_create_function.out84
-rw-r--r--src/test/regress/sql/xc_alter_table.sql133
-rw-r--r--src/test/regress/sql/xc_create_function.sql85
29 files changed, 3416 insertions, 58 deletions
diff --git a/doc-xc/src/sgml/ref/alter_table.sgmlin b/doc-xc/src/sgml/ref/alter_table.sgmlin
index 3a1f095e15..9116c8313e 100644
--- a/doc-xc/src/sgml/ref/alter_table.sgmlin
+++ b/doc-xc/src/sgml/ref/alter_table.sgmlin
@@ -67,6 +67,10 @@ ALTER TABLE <replaceable class="PARAMETER">name</replaceable>
NOT OF
OWNER TO <replaceable class="PARAMETER">new_owner</replaceable>
SET TABLESPACE <replaceable class="PARAMETER">new_tablespace</replaceable>
+ DISTRIBUTE BY { REPLICATION | ROUND ROBIN | { [HASH | MODULO ] ( <replaceable class="PARAMETER">column_name</replaceable> ) } }
+ TO { GROUP <replaceable class="PARAMETER">groupname</replaceable> | NODE ( <replaceable class="PARAMETER">nodename</replaceable> [, ... ] ) }
+ ADD NODE ( <replaceable class="PARAMETER">nodename</replaceable> [, ... ] )
+ DELETE NODE ( <replaceable class="PARAMETER">nodename</replaceable> [, ... ] )
<phrase>and <replaceable class="PARAMETER">table_constraint_using_index</replaceable> is:</phrase>
@@ -573,6 +577,111 @@ ALTER TABLE <replaceable class="PARAMETER">name</replaceable>
</listitem>
</varlistentry>
+<!## XC>
+ <varlistentry>
+ <term><literal>DISTRIBUTE BY</literal></term>
+ <listitem>
+&xconly;
+ <para>
+ This clause specifies how the table is distributed or replicated among Datanodes.
+ </para>
+
+ <variablelist>
+
+ <varlistentry>
+ <term><literal>REPLICATION</literal></term>
+ <listitem>
+ <para>
+ Each row of the table will be replicated into all the
+ Datanodes of the <productname>Postgres-XC</> database
+ cluster.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>ROUND ROBIN</literal></term>
+ <listitem>
+ <para>
+ Each row of the table will be placed in one of the Datanodes
+ in a round-robin manner. The value of the row is not
+ needed to determine which Datanode it goes to.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>HASH ( <replaceable class="PARAMETER">column_name</> )</literal></term>
+ <listitem>
+ <para>
+ Each row of the table will be placed based on the hash value
+ of the specified column. The following types are allowed as a
+ distribution column: INT8, INT2, OID, INT4, BOOL, INT2VECTOR,
+ OIDVECTOR, CHAR, NAME, TEXT, BPCHAR, BYTEA, VARCHAR, FLOAT4,
+ FLOAT8, NUMERIC, CASH, ABSTIME, RELTIME, DATE, TIME,
+ TIMESTAMP, TIMESTAMPTZ, INTERVAL, and TIMETZ.
+ </para>
+ <para>
+ Please note that floating point is not allowed as a basis of
+ the distribution column.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>MODULO ( <replaceable class="PARAMETER">column_name</> )</literal></term>
+ <listitem>
+ <para>
+ Each row of the table will be placed based on the modulo
+ of the specified column. The following types are allowed as a
+ distribution column: INT8, INT2, OID, INT4, BOOL, INT2VECTOR,
+ OIDVECTOR, CHAR, NAME, TEXT, BPCHAR, BYTEA, VARCHAR, FLOAT4,
+ FLOAT8, NUMERIC, CASH, ABSTIME, RELTIME, DATE, TIME,
+ TIMESTAMP, TIMESTAMPTZ, INTERVAL, and TIMETZ.
+ </para>
+ <para>
+ Please note that floating point is not allowed as a basis of
+ the distribution column.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>TO GROUP</literal></term>
+ <term><literal>TO NODE</literal></term>
+ <listitem>
+ <para>
+ This defines the list of nodes on which table data exists.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>ADD NODE</literal></term>
+ <listitem>
+ <para>
+ This adds a list of nodes where data of table is distributed
+ to the existing list. If the list of nodes added contains nodes
+ already used by table, an error is returned.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><literal>DELETE NODE</literal></term>
+ <listitem>
+ <para>
+ This deletes a list of nodes where data of table is distributed
+ from the existing list. If the list of nodes deleted contains nodes
+ not used by table, an error is returned.
+ </para>
+ </listitem>
+ </varlistentry>
+<!## end>
+
</variablelist>
</para>
@@ -789,7 +898,26 @@ ALTER TABLE <replaceable class="PARAMETER">name</replaceable>
</listitem>
</varlistentry>
+<!## XC>
+ <varlistentry>
+ <term><replaceable class="PARAMETER">nodename</replaceable></term>
+ <listitem>
+ <para>
+ It defines a <productname>Postgres-XC</productname> node of catalog pgxc_node.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><replaceable class="PARAMETER">groupname</replaceable></term>
+ <listitem>
+ <para>
+ It defines a <productname>Postgres-XC</productname> node group in catalog pgxc_group.
+ </para>
+ </listitem>
+ </varlistentry>
</variablelist>
+<!## end>
</refsect1>
<refsect1>
@@ -904,10 +1032,74 @@ ALTER TABLE <replaceable class="PARAMETER">name</replaceable>
<!## XC>
&xconly;
<para>
- Please note that except for the column name, you cannot alter
- attribute of table distribution as specified
- with <literal>DISTRIBUTE BY</> clause in <literal>CREATE TABLE</>
- statement.
+ <command>ALTER TABLE</> with clauses <literal>DISTRIBUTE BY</>, <literal>ADD NODE</>,
+ <literal>DELETE NODE</>, <literal>TO NODE</> or <literal>TO GROUP</> is used for data
+ redistribution among nodes specific to <productname>Postgres-XC</>. Those clauses cannot be
+ used with other commands.
+ </para>
+
+ <para>
+ Multiple redistribution scenarios are possible depending on modifications done:
+ <variablelist>
+ <varlistentry>
+ <term>Default redistribution:</term>
+ <listitem>
+ <para>
+ This is the slowest scenario possible. It is done in 3 or 4 steps. Data is firstly
+ saved on Coordinator by fetching all the data with <command>COPY TO</> command. At
+ this point all the tuples are saved using tuple store. The amount of cache allowed for
+ tuple store operation can be controlled with <varname>work_mem</>. Then the table is
+ truncated on all the nodes. Then catalogs are updated. Finally data inside tuple store
+ is redistributed using an internal <command>COPY FROM</> mechanism. <command>REINDEX</>
+ is issued if necessary. The overall performance of this scenario is close to the
+ time necessary to run consecutively <command>COPY TO</> and <command>COPY FROM</>.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>Redistribution from replicated to replicated table:</term>
+ <listitem>
+ <para>
+ The node list of a table can have new nodes as well as removed nodes.
+ If nodes are only removed, <command>TRUNCATE</> is launched to remote nodes that are
+ removed. If new nodes are added, then table data is fetched on Coordinator with <command>
+ COPY TO</> and stored inside a tuplestore controlled with <varname>work_mem</>, then
+ data stored is only sent to the new nodes using <command>COPY FROM</> with data stored
+ inside the tuplestore. <command>REINDEX</> is issued if necessary.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>Redistribution from replicated to distributed table:</term>
+ <listitem>
+ <para>
+ If the relation node list contains new nodes, the default redistribution
+ mechanism is used. However, if the node list of relation after redistribution is
+ included in node list of relation before redistribution, as all the tuples are already
+ located on remote nodes, it is not necessary to fetch any data on Coordinator. Hence,
+ <command>DELETE</> is used to remove on remote nodes only the necessary tuples. This
+ query selects tuples to remove with conditions based on the number of nodes in node
+ list of relation after redistribution, the <literal>HASH</> or <literal>MODULO</> value
+ used for new distribution and the remote node itself where <command>DELETE</> is launched.
+ <command>REINDEX</> is issued if necessary.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>Redistribution from distributed to replicated table:</term>
+ <listitem>
+ <para>
+ In this case the default redistribution mechanism is used.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ </para>
+
+ <para>
</para>
<!## end>
</refsect1>
@@ -1055,6 +1247,30 @@ ALTER TABLE distributors DROP CONSTRAINT distributors_pkey,
</programlisting>
</para>
+<!## XC>
+ <para>
+ To change the distribution type and the list of nodes where table data
+ is located:
+<programlisting>
+ALTER TABLE distributors TO NODE (dn1, dn7), DISTRIBUTE BY HASH(dist_id);
+</programlisting>
+ </para>
+
+ <para>
+ To add a node where data of table is distributed:
+<programlisting>
+ALTER TABLE distributors ADD NODE (dn9, dn14);
+</programlisting>
+ </para>
+
+ <para>
+ To remove a node where data of table is distributed:
+<programlisting>
+ALTER TABLE distributors DELETE NODE (dn4, dn0);
+</programlisting>
+ </para>
+<!## end>
+
</refsect1>
<refsect1>
diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c
index 9cb17eb4f7..f4a14e3229 100644
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -531,8 +531,8 @@ hash_uint32(uint32 k)
#ifdef PGXC
/*
- * compute_hash() -- Generaic hash function for all datatypes
- *
+ * compute_hash()
+ * Generic hash function for all datatypes
*/
Datum
compute_hash(Oid type, Datum value, char locator)
@@ -637,4 +637,81 @@ compute_hash(Oid type, Datum value, char locator)
return (Datum)0;
}
+
+/*
+ * get_compute_hash_function
+ * Get hash function name depending on the hash type.
+ * For some cases of hash or modulo distribution, a function might
+ * be required or not.
+ */
+char *
+get_compute_hash_function(Oid type, char locator)
+{
+ switch (type)
+ {
+ case INT8OID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashint8";
+ return NULL;
+ case INT2OID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashint2";
+ return NULL;
+ case OIDOID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashoid";
+ return NULL;
+ case DATEOID:
+ case INT4OID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashint4";
+ return NULL;
+ case BOOLOID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashchar";
+ return NULL;
+ case CHAROID:
+ return "hashchar";
+ case NAMEOID:
+ return "hashname";
+ case INT2VECTOROID:
+ return "hashint2vector";
+ case VARCHAROID:
+ case TEXTOID:
+ return "hashtext";
+ case OIDVECTOROID:
+ return "hashoidvector";
+ case FLOAT4OID:
+ return "hashfloat4";
+ case FLOAT8OID:
+ return "hashfloat8";
+ case RELTIMEOID:
+ case ABSTIMEOID:
+ if (locator == LOCATOR_TYPE_HASH)
+ return "hashint4";
+ return NULL;
+ case CASHOID:
+ return "hashint8";
+ case BPCHAROID:
+ return "hashbpchar";
+ case BYTEAOID:
+ return "hashvarlena";
+ case TIMEOID:
+ return "time_hash";
+ case TIMESTAMPOID:
+ case TIMESTAMPTZOID:
+ return "timestamp_hash";
+ case INTERVALOID:
+ return "interval_hash";
+ case TIMETZOID:
+ return "timetz_hash";
+ case NUMERICOID:
+ return "hash_numeric";
+ default:
+ ereport(ERROR,(errmsg("Unhandled datatype for modulo or hash distribution\n")));
+ }
+
+ /* Keep compiler quiet */
+ return NULL;
+}
#endif
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 18248f4193..f797a0b75f 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -937,13 +937,13 @@ cmp_nodes(const void *p1, const void *p2)
}
/* --------------------------------
- * AddRelationDistribution
+ * AddRelationDistribution
*
* Add to pgxc_class table
* --------------------------------
*/
-void
-AddRelationDistribution(Oid relid,
+void
+AddRelationDistribution(Oid relid,
DistributeBy *distributeby,
PGXCSubCluster *subcluster,
List *parentOids,
@@ -1007,7 +1007,7 @@ GetRelationDistributionItems(Oid relid,
if (!distributeby)
{
- /*
+ /*
* If no distribution was specified, and we have not chosen
* one based on primary key or foreign key, use first column with
* a supported data type.
@@ -1032,9 +1032,9 @@ GetRelationDistributionItems(Oid relid,
if (local_attnum == 0)
local_locatortype = LOCATOR_TYPE_RROBIN;
}
- else
+ else
{
- /*
+ /*
* User specified distribution type
*/
switch (distributeby->disttype)
@@ -1051,12 +1051,12 @@ GetRelationDistributionItems(Oid relid,
(errcode(ERRCODE_INVALID_TABLE_DEFINITION),
errmsg("Invalid distribution column specified")));
}
-
+
if (!IsTypeHashDistributable(descriptor->attrs[local_attnum - 1]->atttypid))
{
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
- errmsg("Column %s is not a hash distributable data type",
+ errmsg("Column %s is not a hash distributable data type",
distributeby->colname)));
}
local_locatortype = LOCATOR_TYPE_HASH;
@@ -1108,10 +1108,14 @@ GetRelationDistributionItems(Oid relid,
}
/* Save results */
- *attnum = local_attnum;
- *hashalgorithm = local_hashalgorithm;
- *hashbuckets = local_hashbuckets;
- *locatortype = local_locatortype;
+ if (attnum)
+ *attnum = local_attnum;
+ if (hashalgorithm)
+ *hashalgorithm = local_hashalgorithm;
+ if (hashbuckets)
+ *hashbuckets = local_hashbuckets;
+ if (locatortype)
+ *locatortype = local_locatortype;
}
diff --git a/src/backend/catalog/pgxc_class.c b/src/backend/catalog/pgxc_class.c
index 6d1cf0ed2a..1543a45342 100644
--- a/src/backend/catalog/pgxc_class.c
+++ b/src/backend/catalog/pgxc_class.c
@@ -23,9 +23,13 @@
#include "pgxc/locator.h"
#include "utils/array.h"
+/*
+ * PgxcClassCreate
+ * Create a pgxc_class entry
+ */
void
PgxcClassCreate(Oid pcrelid,
- char pclocatortype,
+ char pclocatortype,
int pcattnum,
int pchashalgorithm,
int pchashbuckets,
@@ -42,7 +46,7 @@ PgxcClassCreate(Oid pcrelid,
/* Build array of Oids to be inserted */
nodes_array = buildoidvector(nodes, numnodes);
- /* Iterate through edb_linkauth attributes initializing nulls and values */
+ /* Iterate through attributes initializing nulls and values */
for (i = 0; i < Natts_pgxc_class; i++)
{
nulls[i] = false;
@@ -81,6 +85,102 @@ PgxcClassCreate(Oid pcrelid,
heap_close(pgxcclassrel, RowExclusiveLock);
}
+
+/*
+ * PgxcClassAlter
+ * Modify a pgxc_class entry with given data
+ */
+void
+PgxcClassAlter(Oid pcrelid,
+ char pclocatortype,
+ int pcattnum,
+ int pchashalgorithm,
+ int pchashbuckets,
+ int numnodes,
+ Oid *nodes,
+ PgxcClassAlterType type)
+{
+ Relation rel;
+ HeapTuple oldtup, newtup;
+ oidvector *nodes_array;
+ Datum new_record[Natts_pgxc_class];
+ bool new_record_nulls[Natts_pgxc_class];
+ bool new_record_repl[Natts_pgxc_class];
+
+ Assert(OidIsValid(pcrelid));
+
+ rel = heap_open(PgxcClassRelationId, RowExclusiveLock);
+ oldtup = SearchSysCacheCopy1(PGXCCLASSRELID,
+ ObjectIdGetDatum(pcrelid));
+
+ if (!HeapTupleIsValid(oldtup)) /* should not happen */
+ elog(ERROR, "cache lookup failed for pgxc_class %u", pcrelid);
+
+ /* Build array of Oids to be inserted */
+ nodes_array = buildoidvector(nodes, numnodes);
+
+ /* Initialize fields */
+ MemSet(new_record, 0, sizeof(new_record));
+ MemSet(new_record_nulls, false, sizeof(new_record_nulls));
+ MemSet(new_record_repl, false, sizeof(new_record_repl));
+
+ /* Fields are updated depending on operation type */
+ switch (type)
+ {
+ case PGXC_CLASS_ALTER_DISTRIBUTION:
+ new_record_repl[Anum_pgxc_class_pclocatortype - 1] = true;
+ new_record_repl[Anum_pgxc_class_pcattnum - 1] = true;
+ new_record_repl[Anum_pgxc_class_pchashalgorithm - 1] = true;
+ new_record_repl[Anum_pgxc_class_pchashbuckets - 1] = true;
+ break;
+ case PGXC_CLASS_ALTER_NODES:
+ new_record_repl[Anum_pgxc_class_nodes - 1] = true;
+ break;
+ case PGXC_CLASS_ALTER_ALL:
+ default:
+ new_record_repl[Anum_pgxc_class_pcrelid - 1] = true;
+ new_record_repl[Anum_pgxc_class_pclocatortype - 1] = true;
+ new_record_repl[Anum_pgxc_class_pcattnum - 1] = true;
+ new_record_repl[Anum_pgxc_class_pchashalgorithm - 1] = true;
+ new_record_repl[Anum_pgxc_class_pchashbuckets - 1] = true;
+ new_record_repl[Anum_pgxc_class_nodes - 1] = true;
+ }
+
+ /* Set up new fields */
+ /* Relation Oid */
+ if (new_record_repl[Anum_pgxc_class_pcrelid - 1])
+ new_record[Anum_pgxc_class_pcrelid - 1] = ObjectIdGetDatum(pcrelid);
+
+ /* Locator type */
+ if (new_record_repl[Anum_pgxc_class_pclocatortype - 1])
+ new_record[Anum_pgxc_class_pclocatortype - 1] = CharGetDatum(pclocatortype);
+
+ /* Attribute number of distribution column */
+ if (new_record_repl[Anum_pgxc_class_pcattnum - 1])
+ new_record[Anum_pgxc_class_pcattnum - 1] = UInt16GetDatum(pcattnum);
+
+ /* Hash algorithm type */
+ if (new_record_repl[Anum_pgxc_class_pchashalgorithm - 1])
+ new_record[Anum_pgxc_class_pchashalgorithm - 1] = UInt16GetDatum(pchashalgorithm);
+
+ /* Hash buckets */
+ if (new_record_repl[Anum_pgxc_class_pchashbuckets - 1])
+ new_record[Anum_pgxc_class_pchashbuckets - 1] = UInt16GetDatum(pchashbuckets);
+
+ /* Node information */
+ if (new_record_repl[Anum_pgxc_class_nodes - 1])
+ new_record[Anum_pgxc_class_nodes - 1] = PointerGetDatum(nodes_array);
+
+ /* Update relation */
+ newtup = heap_modify_tuple(oldtup, RelationGetDescr(rel),
+ new_record,
+ new_record_nulls, new_record_repl);
+ simple_heap_update(rel, &oldtup->t_self, newtup);
+ CatalogUpdateIndexes(rel, newtup);
+
+ heap_close(rel, RowExclusiveLock);
+}
+
/*
* RemovePGXCClass():
* Remove extended PGXC information
@@ -108,5 +208,3 @@ RemovePgxcClass(Oid pcrelid)
heap_close(relation, RowExclusiveLock);
}
-
-
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 074bf09b39..41e77bc39c 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -1700,15 +1700,26 @@ CopyTo(CopyState cstate)
cstate->remoteCopyState->rel_loc)
{
RemoteCopyData *remoteCopyState = cstate->remoteCopyState;
+ RemoteCopyType remoteCopyType;
+
+ /* Set up remote COPY to correct operation */
+ if (cstate->copy_dest == COPY_FILE)
+ remoteCopyType = REMOTE_COPY_FILE;
+ else
+ remoteCopyType = REMOTE_COPY_STDOUT;
/*
* We don't know the value of the distribution column value, so need to
* read from all nodes. Hence indicate that the value is NULL.
*/
- processed = DataNodeCopyOut(
- GetRelationNodes(remoteCopyState->rel_loc, 0, true, UNKNOWNOID, RELATION_ACCESS_READ),
- remoteCopyState->connections,
- cstate->copy_file);
+ processed = DataNodeCopyOut(GetRelationNodes(remoteCopyState->rel_loc, 0,
+ true, UNKNOWNOID,
+ RELATION_ACCESS_READ),
+ remoteCopyState->connections,
+ NULL,
+ cstate->copy_file,
+ NULL,
+ remoteCopyType);
}
else
{
@@ -4289,9 +4300,8 @@ CreateCopyDestReceiver(void)
static RemoteCopyOptions *
GetRemoteCopyOptions(CopyState cstate)
{
- RemoteCopyOptions *res;
+ RemoteCopyOptions *res = makeRemoteCopyOptions();
Assert(cstate);
- res = (RemoteCopyOptions *) palloc0(sizeof(RemoteCopyOptions));
/* Then fill in structure */
res->rco_binary = cstate->binary;
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index b3aaa88541..2cf1ec71b2 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -89,8 +89,11 @@
#ifdef PGXC
#include "pgxc/pgxc.h"
#include "access/gtm.h"
+#include "catalog/pgxc_class.h"
+#include "catalog/pgxc_node.h"
#include "commands/sequence.h"
#include "pgxc/execRemote.h"
+#include "pgxc/redistrib.h"
#endif
/*
@@ -139,7 +142,12 @@ static List *on_commits = NIL;
#define AT_PASS_ADD_INDEX 6 /* ADD indexes */
#define AT_PASS_ADD_CONSTR 7 /* ADD constraints, defaults */
#define AT_PASS_MISC 8 /* other stuff */
+#ifdef PGXC
+#define AT_PASS_DISTRIB 9 /* Redistribution pass */
+#define AT_NUM_PASSES 10
+#else
#define AT_NUM_PASSES 9
+#endif
typedef struct AlteredTableInfo
{
@@ -375,7 +383,14 @@ static void ATExecAddOf(Relation rel, const TypeName *ofTypename, LOCKMODE lockm
static void ATExecDropOf(Relation rel, LOCKMODE lockmode);
static void ATExecGenericOptions(Relation rel, List *options);
#ifdef PGXC
+static void AtExecDistributeBy(Relation rel, DistributeBy *options);
+static void AtExecSubCluster(Relation rel, PGXCSubCluster *options);
+static void AtExecAddNode(Relation rel, List *options);
+static void AtExecDeleteNode(Relation rel, List *options);
static void ATCheckCmd(Relation rel, AlterTableCmd *cmd);
+static RedistribState *BuildRedistribCommands(Oid relid, List *subCmds);
+static Oid *delete_node_list(Oid *old_oids, int old_num, Oid *del_oids, int del_num, int *new_num);
+static Oid *add_node_list(Oid *old_oids, int old_num, Oid *add_oids, int add_num, int *new_num);
#endif
static void copy_relation_data(SMgrRelation rel, SMgrRelation dst,
@@ -620,7 +635,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId)
#ifdef PGXC
/*
* Add to pgxc_class.
- * we need to do this after CommandCounterIncrement
+ * we need to do this after CommandCounterIncrement
*/
if (IS_PGXC_COORDINATOR && relkind == RELKIND_RELATION)
{
@@ -2509,7 +2524,17 @@ CheckTableNotInUse(Relation rel, const char *stmt)
* lock level we want as we recurse may well be higher than required for
* that specific subcommand. So we pass down the overall lock requirement,
* rather than reassess it at lower levels.
+ *
+ */
+#ifdef PGXC
+/*
+ * In Postgres-XC, an extension is added to ALTER TABLE for modification
+ * of the data distribution. Depending on the old and new distribution type
+ * of the relation redistributed, a list of redistribution subcommands is built.
+ * Data redistribution cannot be done in parallel of operations that need
+ * the table to be rewritten like column addition/deletion.
*/
+#endif
void
AlterTable(AlterTableStmt *stmt)
{
@@ -2696,6 +2721,15 @@ AlterTableGetLockLevel(List *cmds)
cmd_lockmode = AccessExclusiveLock;
break;
+#ifdef PGXC
+ case AT_DistributeBy: /* Changes table distribution type */
+ case AT_SubCluster: /* Changes node list of distribution */
+ case AT_AddNodeList: /* Adds nodes in distribution */
+ case AT_DeleteNodeList: /* Deletes nodes in distribution */
+ cmd_lockmode = ExclusiveLock;
+ break;
+#endif
+
/*
* These subcommands affect write operations only.
*/
@@ -2819,6 +2853,9 @@ ATController(Relation rel, List *cmds, bool recurse, LOCKMODE lockmode)
{
List *wqueue = NIL;
ListCell *lcmd;
+#ifdef PGXC
+ RedistribState *redistribState = NULL;
+#endif
/* Phase 1: preliminary examination of commands, create work queue */
foreach(lcmd, cmds)
@@ -2833,12 +2870,82 @@ ATController(Relation rel, List *cmds, bool recurse, LOCKMODE lockmode)
ATPrepCmd(&wqueue, rel, cmd, recurse, false, lockmode);
}
+#ifdef PGXC
+ /* Only check that on local Coordinator */
+ if (IS_PGXC_COORDINATOR && !IsConnFromCoord())
+ {
+ ListCell *ltab;
+
+ /*
+ * Redistribution is only applied to the parent table and not subsequent
+ * children. It is also not applied in recursion. This needs to be done
+ * once all the commands have been treated.
+ */
+ foreach(ltab, wqueue)
+ {
+ AlteredTableInfo *tab = (AlteredTableInfo *) lfirst(ltab);
+
+ if (RelationGetRelid(rel) == tab->relid &&
+ list_length(tab->subcmds[AT_PASS_DISTRIB]) > 0)
+ {
+ /*
+ * Check if there are any commands incompatible
+ * with redistribution. For the time being no other commands
+ * are authorized.
+ */
+ if (list_length(tab->subcmds[AT_PASS_ADD_COL]) > 0 ||
+ list_length(tab->subcmds[AT_PASS_DROP]) > 0 ||
+ list_length(tab->subcmds[AT_PASS_ALTER_TYPE]) > 0 ||
+ list_length(tab->subcmds[AT_PASS_OLD_CONSTR]) > 0 ||
+ list_length(tab->subcmds[AT_PASS_COL_ATTRS]) > 0 ||
+ list_length(tab->subcmds[AT_PASS_ADD_COL]) > 0 ||
+ list_length(tab->subcmds[AT_PASS_ADD_INDEX]) > 0 ||
+ list_length(tab->subcmds[AT_PASS_ADD_CONSTR]) > 0 ||
+ list_length(tab->subcmds[AT_PASS_MISC]) > 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ errmsg("Incompatible operation with data redistribution")));
+
+
+ /* Scan redistribution commands and improve operation */
+ redistribState = BuildRedistribCommands(RelationGetRelid(rel),
+ tab->subcmds[AT_PASS_DISTRIB]);
+ break;
+ }
+ }
+ }
+#endif
+
/* Close the relation, but keep lock until commit */
relation_close(rel, NoLock);
+#ifdef PGXC
+ /* Perform pre-catalog-update redistribution operations */
+ PGXCRedistribTable(redistribState, CATALOG_UPDATE_BEFORE);
+#endif
+
/* Phase 2: update system catalogs */
ATRewriteCatalogs(&wqueue, lockmode);
+#ifdef PGXC
+ /* Invalidate cache for redistributed relation */
+ if (redistribState)
+ {
+ Relation rel2 = relation_open(redistribState->relid, NoLock);
+
+ /* Invalidate all entries related to this relation */
+ CacheInvalidateRelcache(rel2);
+
+ /* Make sure locator info is rebuilt */
+ RelationCacheInvalidateEntry(redistribState->relid);
+ relation_close(rel2, NoLock);
+ }
+
+ /* Perform post-catalog-update redistribution operations */
+ PGXCRedistribTable(redistribState, CATALOG_UPDATE_AFTER);
+ FreeRedistribState(redistribState);
+#endif
+
/* Phase 3: scan/rewrite tables as needed */
ATRewriteTables(&wqueue, lockmode);
}
@@ -3060,6 +3167,16 @@ ATPrepCmd(List **wqueue, Relation rel, AlterTableCmd *cmd,
/* No command-specific prep needed */
pass = AT_PASS_MISC;
break;
+#ifdef PGXC
+ case AT_DistributeBy:
+ case AT_SubCluster:
+ case AT_AddNodeList:
+ case AT_DeleteNodeList:
+ ATSimplePermissions(rel, ATT_TABLE);
+ /* No command-specific prep needed */
+ pass = AT_PASS_DISTRIB;
+ break;
+#endif
default: /* oops */
elog(ERROR, "unrecognized alter table type: %d",
(int) cmd->subtype);
@@ -3327,6 +3444,20 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, Relation rel,
case AT_GenericOptions:
ATExecGenericOptions(rel, (List *) cmd->def);
break;
+#ifdef PGXC
+ case AT_DistributeBy:
+ AtExecDistributeBy(rel, (DistributeBy *) cmd->def);
+ break;
+ case AT_SubCluster:
+ AtExecSubCluster(rel, (PGXCSubCluster *) cmd->def);
+ break;
+ case AT_AddNodeList:
+ AtExecAddNode(rel, (List *) cmd->def);
+ break;
+ case AT_DeleteNodeList:
+ AtExecDeleteNode(rel, (List *) cmd->def);
+ break;
+#endif
default: /* oops */
elog(ERROR, "unrecognized alter table type: %d",
(int) cmd->subtype);
@@ -3353,6 +3484,17 @@ ATRewriteTables(List **wqueue, LOCKMODE lockmode)
{
AlteredTableInfo *tab = (AlteredTableInfo *) lfirst(ltab);
+#ifdef PGXC
+ /* Forbid table rewrite operations with online data redistribution */
+ if (tab->rewrite &&
+ list_length(tab->subcmds[AT_PASS_DISTRIB]) > 0 &&
+ IS_PGXC_COORDINATOR &&
+ !IsConnFromCoord())
+ ereport(ERROR,
+ (errcode(ERRCODE_STATEMENT_TOO_COMPLEX),
+ errmsg("Incompatible operation with data redistribution")));
+#endif
+
/* Foreign tables have no storage. */
if (tab->relkind == RELKIND_FOREIGN_TABLE)
continue;
@@ -3464,7 +3606,7 @@ ATRewriteTables(List **wqueue, LOCKMODE lockmode)
}
#ifdef PGXC
- /*
+ /*
* In PGXC, do not check the FK constraints on the Coordinator, and just return
* That is because a SELECT is generated whose plan will try and use
* the Datanodes. We (currently) do not want to do that on the Coordinator,
@@ -9180,8 +9322,179 @@ ATExecGenericOptions(Relation rel, List *options)
#ifdef PGXC
/*
+ * ALTER TABLE <name> DISTRIBUTE BY ...
+ */
+static void
+AtExecDistributeBy(Relation rel, DistributeBy *options)
+{
+	Oid relid;
+	char locatortype;
+	int hashalgorithm, hashbuckets;
+	AttrNumber attnum;
+
+	/*
+	 * This routine only updates the catalog definition (pgxc_class) of the
+	 * distribution; the actual movement of rows is handled separately by
+	 * the redistribution pass (AT_PASS_DISTRIB) run on the Coordinator.
+	 */
+
+	/* Nothing to do on Datanodes */
+	if (IS_PGXC_DATANODE || options == NULL)
+		return;
+
+	relid = RelationGetRelid(rel);
+
+	/* Get necessary distribution information */
+	GetRelationDistributionItems(relid,
+								 options,
+								 RelationGetDescr(rel),
+								 &locatortype,
+								 &hashalgorithm,
+								 &hashbuckets,
+								 &attnum);
+
+	/*
+	 * It is not checked if the distribution type list is the same as the old one,
+	 * user might define a different sub-cluster at the same time.
+	 */
+
+	/* Update pgxc_class entry; only the distribution fields are altered here */
+	PgxcClassAlter(relid,
+				   locatortype,
+				   (int) attnum,
+				   hashalgorithm,
+				   hashbuckets,
+				   0,
+				   NULL,
+				   PGXC_CLASS_ALTER_DISTRIBUTION);
+
+	/* Make the additional catalog changes visible */
+	CommandCounterIncrement();
+}
+
+
+/*
+ * ALTER TABLE <name> TO [ NODE nodelist | GROUP groupname ]
+ *
+ * Replace the relation's node list in pgxc_class by the set of nodes
+ * given by the subcluster clause. Catalog-only change; data movement is
+ * handled by the redistribution pass.
+ */
+static void
+AtExecSubCluster(Relation rel, PGXCSubCluster *options)
+{
+	Oid *nodeoids;
+	int numnodes;
+
+	/* Nothing to do on Datanodes */
+	if (IS_PGXC_DATANODE || options == NULL)
+		return;
+
+	/*
+	 * It is not checked if the new subcluster list is the same as the old one,
+	 * user might define a different distribution type.
+	 */
+
+	/* Obtain new node information */
+	nodeoids = GetRelationDistributionNodes(options, &numnodes);
+
+	/*
+	 * Update pgxc_class entry. Only the node list is altered
+	 * (PGXC_CLASS_ALTER_NODES); '\0' and 0 are dummy values for the
+	 * distribution-related fields, which stay untouched.
+	 */
+	PgxcClassAlter(RelationGetRelid(rel),
+				   '\0',
+				   0,
+				   0,
+				   0,
+				   numnodes,
+				   nodeoids,
+				   PGXC_CLASS_ALTER_NODES);
+
+	/* Make the additional catalog changes visible */
+	CommandCounterIncrement();
+}
+
+
+/*
+ * ALTER TABLE <name> ADD NODE nodelist
+ *
+ * Extend the relation's node list in pgxc_class with the given nodes.
+ * Errors out (in add_node_list) if one of the nodes is already part of
+ * the relation's node list. Catalog-only change.
+ */
+static void
+AtExecAddNode(Relation rel, List *options)
+{
+	Oid *add_oids, *old_oids;
+	int add_num, old_num;
+
+	/* Nothing to do on Datanodes */
+	if (IS_PGXC_DATANODE || options == NIL)
+		return;
+
+	/*
+	 * Build a new array of sorted node Oids given the list of name nodes
+	 * to be added.
+	 */
+	add_oids = BuildRelationDistributionNodes(options, &add_num);
+
+	/*
+	 * Then check if nodes to be added are not in existing node
+	 * list and build updated list of nodes.
+	 */
+	old_num = get_pgxc_classnodes(RelationGetRelid(rel), &old_oids);
+
+	/* Add elements to array */
+	old_oids = add_node_list(old_oids, old_num, add_oids, add_num, &old_num);
+
+	/* Sort once again the newly-created array of node Oids to maintain consistency */
+	old_oids = SortRelationDistributionNodes(old_oids, old_num);
+
+	/* Update pgxc_class entry; only the node list is altered */
+	PgxcClassAlter(RelationGetRelid(rel),
+				   '\0',
+				   0,
+				   0,
+				   0,
+				   old_num,
+				   old_oids,
+				   PGXC_CLASS_ALTER_NODES);
+
+	/* Make the additional catalog changes visible */
+	CommandCounterIncrement();
+}
+
+
+/*
+ * ALTER TABLE <name> DELETE NODE nodelist
+ *
+ * Remove the given nodes from the relation's node list in pgxc_class.
+ * Errors out (in delete_node_list) if a node is not part of the list or
+ * if the deletion would leave the list empty. Catalog-only change.
+ */
+static void
+AtExecDeleteNode(Relation rel, List *options)
+{
+	Oid *del_oids, *old_oids;
+	int del_num, old_num;
+
+	/* Nothing to do on Datanodes */
+	if (IS_PGXC_DATANODE || options == NIL)
+		return;
+
+	/*
+	 * Build a new array of sorted node Oids given the list of name nodes
+	 * to be deleted.
+	 */
+	del_oids = BuildRelationDistributionNodes(options, &del_num);
+
+	/*
+	 * Check if nodes to be deleted are really included in existing
+	 * node list and get updated list of nodes.
+	 */
+	old_num = get_pgxc_classnodes(RelationGetRelid(rel), &old_oids);
+
+	/* Delete elements from array */
+	old_oids = delete_node_list(old_oids, old_num, del_oids, del_num, &old_num);
+
+	/* Update pgxc_class entry; only the node list is altered */
+	PgxcClassAlter(RelationGetRelid(rel),
+				   '\0',
+				   0,
+				   0,
+				   0,
+				   old_num,
+				   old_oids,
+				   PGXC_CLASS_ALTER_NODES);
+
+	/* Make the additional catalog changes visible */
+	CommandCounterIncrement();
+}
+
+
+/*
* ATCheckCmd
- *
+ *
* Check ALTER TABLE restrictions in Postgres-XC
*/
static void
@@ -9205,6 +9518,218 @@ ATCheckCmd(Relation rel, AlterTableCmd *cmd)
break;
}
}
+
+
+/*
+ * BuildRedistribCommands
+ * Evaluate new and old distribution and build the list of operations
+ * necessary to perform table redistribution.
+ *
+ * The subcommands in subCmds (AT_DistributeBy, AT_SubCluster,
+ * AT_AddNodeList, AT_DeleteNodeList) are replayed against a copy of the
+ * relation's current locator info to compute the target distribution;
+ * the resulting RedistribState holds the command list to execute.
+ */
+static RedistribState *
+BuildRedistribCommands(Oid relid, List *subCmds)
+{
+	RedistribState *redistribState = makeRedistribState(relid);
+	RelationLocInfo *oldLocInfo, *newLocInfo; /* Former locator info */
+	Relation rel;
+	Oid *new_oid_array; /* Modified list of Oids */
+	int new_num, i; /* Modified number of Oids */
+	ListCell *item;
+
+	/* Get necessary information about relation */
+	rel = relation_open(redistribState->relid, NoLock);
+	oldLocInfo = RelationGetLocInfo(rel);
+	Assert(oldLocInfo);
+
+	/*
+	 * Get a copy of the locator information that will be modified by
+	 * successive ALTER TABLE commands.
+	 */
+	newLocInfo = CopyRelationLocInfo(oldLocInfo);
+	/* The node list of this locator information will be rebuilt after command scan */
+	list_free(newLocInfo->nodeList);
+	newLocInfo->nodeList = NULL;
+
+	/* Get the list to be modified */
+	new_num = get_pgxc_classnodes(RelationGetRelid(rel), &new_oid_array);
+
+	foreach(item, subCmds)
+	{
+		AlterTableCmd *cmd = (AlterTableCmd *) lfirst(item);
+		switch (cmd->subtype)
+		{
+			case AT_DistributeBy:
+				/*
+				 * Get necessary distribution information and update to new
+				 * distribution type.
+				 */
+				GetRelationDistributionItems(redistribState->relid,
+											 (DistributeBy *) cmd->def,
+											 RelationGetDescr(rel),
+											 &(newLocInfo->locatorType),
+											 NULL,
+											 NULL,
+											 (AttrNumber *)&(newLocInfo->partAttrNum));
+				break;
+			case AT_SubCluster:
+				/*
+				 * Update new list of nodes.
+				 * NOTE(review): the array previously obtained from
+				 * get_pgxc_classnodes is abandoned here without pfree;
+				 * harmless in a short-lived memory context, but worth
+				 * confirming.
+				 */
+				new_oid_array = GetRelationDistributionNodes((PGXCSubCluster *) cmd->def, &new_num);
+				break;
+			case AT_AddNodeList:
+				{
+					Oid *add_oids;
+					int add_num;
+					add_oids = BuildRelationDistributionNodes((List *) cmd->def, &add_num);
+					/* Add elements to array */
+					new_oid_array = add_node_list(new_oid_array, new_num, add_oids, add_num, &new_num);
+				}
+				break;
+			case AT_DeleteNodeList:
+				{
+					Oid *del_oids;
+					int del_num;
+					del_oids = BuildRelationDistributionNodes((List *) cmd->def, &del_num);
+					/* Delete elements from array */
+					new_oid_array = delete_node_list(new_oid_array, new_num, del_oids, del_num, &new_num);
+				}
+				break;
+			default:
+				Assert(0); /* Should not happen */
+		}
+	}
+
+	/* Build relation node list for new locator info */
+	for (i = 0; i < new_num; i++)
+		newLocInfo->nodeList = lappend_int(newLocInfo->nodeList,
+										   PGXCNodeGetNodeId(new_oid_array[i],
+															 PGXC_NODE_DATANODE));
+
+	/* Build the command tree for table redistribution */
+	PGXCRedistribCreateCommandList(redistribState, newLocInfo);
+
+	/* Clean up */
+	FreeRelationLocInfo(newLocInfo);
+	pfree(new_oid_array);
+	relation_close(rel, NoLock);
+
+	return redistribState;
+}
+
+
+/*
+ * delete_node_list
+ * Delete from given Oid array old_oids the given oid list del_oids
+ * and build a new one.
+ *
+ * old_oids is modified in place (and possibly repalloc'ed), so the
+ * caller must use the returned pointer afterwards; *new_num receives
+ * the resulting number of elements. An error is raised if an element
+ * of del_oids is not present in old_oids (SQLSTATE: undefined object),
+ * or if the deletion would leave the relation with no node at all
+ * (SQLSTATE: invalid parameter value).
+ */
+Oid *
+delete_node_list(Oid *old_oids, int old_num, Oid *del_oids, int del_num, int *new_num)
+{
+	/* Work directly on the existing array and count */
+	Oid *new_oids = old_oids;
+	int loc_new_num = old_num;
+	int i;
+
+	/*
+	 * Delete from existing node Oid array the elements to be removed.
+	 * An error is returned if an element to be deleted is not in existing array.
+	 * It is not necessary to sort once again the result array of node Oids
+	 * as here only a deletion of elements is done.
+	 */
+	for (i = 0; i < del_num; i++)
+	{
+		Oid nodeoid = del_oids[i];
+		int j, position;
+		bool is_listed = false;
+		position = 0;
+
+		/* Locate the element to remove; node Oids are unique in the list */
+		for (j = 0; j < loc_new_num; j++)
+		{
+			if (nodeoid == new_oids[j])
+			{
+				is_listed = true;
+				position = j;
+				break;
+			}
+		}
+
+		/* Move all the elements from [position+1, n-1] to [position, n-2] */
+		if (is_listed)
+		{
+			for (j = position + 1; j < loc_new_num; j++)
+				new_oids[j - 1] = new_oids[j];
+
+			loc_new_num--;
+
+			/* Not possible to have an empty list */
+			if (loc_new_num == 0)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("Node list is empty: one node at least is mandatory")));
+
+			new_oids = (Oid *) repalloc(new_oids, loc_new_num * sizeof(Oid));
+		}
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_UNDEFINED_OBJECT),
+					 errmsg("PGXC Node %s: object not in relation node list",
+							get_pgxc_nodename(nodeoid))));
+	}
+
+	/* Save new number of nodes */
+	*new_num = loc_new_num;
+	return new_oids;
+}
+
+
+/*
+ * Add to given Oid array old_oids the given oid list add_oids
+ * and build a new one.
+ *
+ * old_oids may be repalloc'ed, so the caller must use the returned
+ * pointer afterwards; *new_num receives the resulting element count.
+ * The returned array is sorted.
+ */
+Oid *
+add_node_list(Oid *old_oids, int old_num, Oid *add_oids, int add_num, int *new_num)
+{
+	/* Start from the existing array; it is grown in place via repalloc */
+	Oid *new_oids = old_oids;
+	int loc_new_num = old_num;
+	int i;
+
+	/*
+	 * Build new Oid list, both addition and old list are already sorted.
+	 * The idea here is to go through the list of nodes to be added and
+	 * add the elements one-by-one on the existing list.
+	 * An error is returned if an element to be added already exists
+	 * in relation node array.
+	 * Here we do O(n^2) scan to avoid a dependency with the way
+	 * oids are sorted by heap APIs. They are sorted once again once
+	 * the addition operation is completed.
+	 */
+	for (i = 0; i < add_num; i++)
+	{
+		Oid nodeoid = add_oids[i];
+		int j;
+
+		/* Check if element is already a part of array */
+		for (j = 0; j < loc_new_num; j++)
+		{
+			/* Item is already in node list */
+			if (nodeoid == new_oids[j])
+				ereport(ERROR,
+						(errcode(ERRCODE_DUPLICATE_OBJECT),
+						 errmsg("PGXC Node %s: object already in relation node list",
+								get_pgxc_nodename(nodeoid))));
+		}
+
+		/* If we are here, element can be added safely in node array */
+		loc_new_num++;
+		new_oids = (Oid *) repalloc(new_oids, loc_new_num * sizeof(Oid));
+		new_oids[loc_new_num - 1] = nodeoid;
+	}
+
+	/* Sort once again the newly-created array of node Oids to maintain consistency */
+	new_oids = SortRelationDistributionNodes(new_oids, loc_new_num);
+
+	/* Save new number of nodes */
+	*new_num = loc_new_num;
+	return new_oids;
+}
#endif
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index c87fdbf3d9..7b6050e4f4 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -2038,6 +2038,40 @@ alter_table_cmd:
n->def = (Node *)$1;
$$ = (Node *) n;
}
+/* PGXC_BEGIN */
+ /* ALTER TABLE <name> DISTRIBUTE BY ... */
+ | OptDistributeByInternal
+ {
+ AlterTableCmd *n = makeNode(AlterTableCmd);
+ n->subtype = AT_DistributeBy;
+ n->def = (Node *)$1;
+ $$ = (Node *)n;
+ }
+ /* ALTER TABLE <name> TO [ NODE (nodelist) | GROUP groupname ] */
+ | OptSubClusterInternal
+ {
+ AlterTableCmd *n = makeNode(AlterTableCmd);
+ n->subtype = AT_SubCluster;
+ n->def = (Node *)$1;
+ $$ = (Node *)n;
+ }
+ /* ALTER TABLE <name> ADD NODE (nodelist) */
+ | ADD_P NODE pgxcnodes
+ {
+ AlterTableCmd *n = makeNode(AlterTableCmd);
+ n->subtype = AT_AddNodeList;
+ n->def = (Node *)$3;
+ $$ = (Node *)n;
+ }
+ /* ALTER TABLE <name> DELETE NODE (nodelist) */
+ | DELETE_P NODE pgxcnodes
+ {
+ AlterTableCmd *n = makeNode(AlterTableCmd);
+ n->subtype = AT_DeleteNodeList;
+ n->def = (Node *)$3;
+ $$ = (Node *)n;
+ }
+/* PGXC_END */
;
alter_column_default:
diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c
index edd4a104e0..f98e6ea59d 100644
--- a/src/backend/parser/parse_utilcmd.c
+++ b/src/backend/parser/parse_utilcmd.c
@@ -93,7 +93,8 @@ typedef struct
IndexStmt *pkey; /* PRIMARY KEY index, if any */
#ifdef PGXC
char *fallback_dist_col; /* suggested column to distribute on */
- DistributeBy *distributeby; /* original distribute by column in create table */
+ DistributeBy *distributeby; /* original distribute by column of CREATE TABLE */
+ PGXCSubCluster *subcluster; /* original subcluster option of CREATE TABLE */
#endif
} CreateStmtContext;
@@ -2415,6 +2416,7 @@ transformAlterTableStmt(AlterTableStmt *stmt, const char *queryString)
#ifdef PGXC
cxt.fallback_dist_col = NULL;
cxt.distributeby = NULL;
+ cxt.subcluster = NULL;
#endif
/*
diff --git a/src/backend/pgxc/copy/Makefile b/src/backend/pgxc/copy/Makefile
index a8cfbd86da..2ddcc904b3 100644
--- a/src/backend/pgxc/copy/Makefile
+++ b/src/backend/pgxc/copy/Makefile
@@ -14,6 +14,6 @@ subdir = src/backend/pgxc/copy
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = remotecopy.o
+OBJS = copyops.o remotecopy.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/copy/copyops.c b/src/backend/pgxc/copy/copyops.c
new file mode 100644
index 0000000000..a85a06cc09
--- /dev/null
+++ b/src/backend/pgxc/copy/copyops.c
@@ -0,0 +1,496 @@
+/*-------------------------------------------------------------------------
+ *
+ * copyops.c
+ * Functions related to remote COPY data manipulation and materialization
+ * of data redistribution
+ *
+ * Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/pgxc/copy/copyops.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "miscadmin.h"
+#include "fmgr.h"
+#include "lib/stringinfo.h"
+#include "mb/pg_wchar.h"
+#include "pgxc/copyops.h"
+#include "utils/lsyscache.h"
+
+/* NULL print marker */
+#define COPYOPS_NULL_PRINT "\\N"
+
+/* Some octal operations */
+#define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
+#define OCTVALUE(c) ((c) - '0')
+/* Send text representation of one attribute, with conversion and escaping */
+#define DUMPSOFAR() \
+ do { \
+ if (ptr > start) \
+ appendBinaryStringInfo(buf, (char *) start, ptr - start); \
+ } while (0)
+
+
+static int get_decimal_from_hex(char hex);
+static void attribute_out_text(StringInfo buf, char *string);
+
+/*
+ * Return decimal value for a hexadecimal digit
+ *
+ * 'hex' is assumed to be a valid hexadecimal digit; callers check it
+ * with isxdigit() before calling, so no validation is done here.
+ */
+static int
+get_decimal_from_hex(char hex)
+{
+	if (isdigit((unsigned char) hex))
+		return hex - '0';
+	else
+		return tolower((unsigned char) hex) - 'a' + 10;
+}
+
+
+/*
+ * Output an attribute to text
+ * This takes portions of the code of CopyAttributeOutText
+ *
+ * Appends to 'buf' the COPY-text-escaped form of 'string' (a server-encoded
+ * C string): control characters become C-style backslash sequences, and
+ * backslash and the field delimiter are backslash-escaped.
+ */
+static void
+attribute_out_text(StringInfo buf, char *string)
+{
+	char *ptr;
+	char c;
+	char *start;
+	char delimc = COPYOPS_DELIMITER;
+	bool need_transcoding, encoding_embeds_ascii;
+	int file_encoding = pg_get_client_encoding();
+
+	/* Transcode when client encoding differs or multibyte chars are possible */
+	need_transcoding = (file_encoding != GetDatabaseEncoding() ||
+						pg_database_encoding_max_length() > 1);
+	encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(file_encoding);
+
+	if (need_transcoding)
+		ptr = pg_server_to_any(string, strlen(string), file_encoding);
+	else
+		ptr = string;
+
+	/*
+	 * We have to grovel through the string searching for control characters
+	 * and instances of the delimiter character. In most cases, though, these
+	 * are infrequent. To avoid overhead from calling CopySendData once per
+	 * character, we dump out all characters between escaped characters in a
+	 * single call. The loop invariant is that the data from "start" to "ptr"
+	 * can be sent literally, but hasn't yet been.
+	 *
+	 * We can skip pg_encoding_mblen() overhead when encoding is safe, because
+	 * in valid backend encodings, extra bytes of a multibyte character never
+	 * look like ASCII. This loop is sufficiently performance-critical that
+	 * it's worth making two copies of it to get the IS_HIGHBIT_SET() test out
+	 * of the normal safe-encoding path.
+	 */
+	if (encoding_embeds_ascii)
+	{
+		start = ptr;
+		while ((c = *ptr) != '\0')
+		{
+			if ((unsigned char) c < (unsigned char) 0x20)
+			{
+				/*
+				 * \r and \n must be escaped, the others are traditional. We
+				 * prefer to dump these using the C-like notation, rather than
+				 * a backslash and the literal character, because it makes the
+				 * dump file a bit more proof against Microsoftish data
+				 * mangling.
+				 */
+				switch (c)
+				{
+					case '\b':
+						c = 'b';
+						break;
+					case '\f':
+						c = 'f';
+						break;
+					case '\n':
+						c = 'n';
+						break;
+					case '\r':
+						c = 'r';
+						break;
+					case '\t':
+						c = 't';
+						break;
+					case '\v':
+						c = 'v';
+						break;
+					default:
+						/* If it's the delimiter, must backslash it */
+						if (c == delimc)
+							break;
+						/* All ASCII control chars are length 1 */
+						ptr++;
+						continue; /* fall to end of loop */
+				}
+
+				/* if we get here, we need to convert the control char */
+				DUMPSOFAR();
+				appendStringInfoCharMacro(buf, '\\');
+				appendStringInfoCharMacro(buf, c);
+				start = ++ptr;
+			}
+			else if (c == '\\' || c == delimc)
+			{
+				DUMPSOFAR();
+				appendStringInfoCharMacro(buf, '\\');
+				start = ++ptr;
+			}
+			else if (IS_HIGHBIT_SET(c))
+				ptr += pg_encoding_mblen(file_encoding, ptr);
+			else
+				ptr++;
+		}
+	}
+	else
+	{
+		/* Safe encoding: multibyte continuation bytes never look like ASCII */
+		start = ptr;
+		while ((c = *ptr) != '\0')
+		{
+			if ((unsigned char) c < (unsigned char) 0x20)
+			{
+				/*
+				 * \r and \n must be escaped, the others are traditional. We
+				 * prefer to dump these using the C-like notation, rather than
+				 * a backslash and the literal character, because it makes the
+				 * dump file a bit more proof against Microsoftish data
+				 * mangling.
+				 */
+				switch (c)
+				{
+					case '\b':
+						c = 'b';
+						break;
+					case '\f':
+						c = 'f';
+						break;
+					case '\n':
+						c = 'n';
+						break;
+					case '\r':
+						c = 'r';
+						break;
+					case '\t':
+						c = 't';
+						break;
+					case '\v':
+						c = 'v';
+						break;
+					default:
+						/* If it's the delimiter, must backslash it */
+						if (c == delimc)
+							break;
+						/* All ASCII control chars are length 1 */
+						ptr++;
+						continue; /* fall to end of loop */
+				}
+				/* if we get here, we need to convert the control char */
+				DUMPSOFAR();
+				appendStringInfoCharMacro(buf, '\\');
+				appendStringInfoCharMacro(buf, c);
+				start = ++ptr;
+			}
+			else if (c == '\\' || c == delimc)
+			{
+				DUMPSOFAR();
+				appendStringInfoCharMacro(buf, '\\');
+				start = ++ptr;
+			}
+			else
+				ptr++;
+		}
+	}
+
+	/* Flush the trailing run of literal characters */
+	DUMPSOFAR();
+}
+
+
+/*
+ * CopyOps_RawDataToArrayField
+ * Convert the raw output of COPY TO to an array of fields.
+ * This is a simplified version of CopyReadAttributesText used for data
+ * redistribution and storage of tuple data into a tuple store.
+ *
+ * 'message' is one COPY text line of 'len' bytes (not assumed to be
+ * NUL-terminated). Returns a palloc'd array of de-escaped field strings;
+ * a NULL entry denotes a SQL NULL field.
+ */
+char **
+CopyOps_RawDataToArrayField(TupleDesc tupdesc, char *message, int len)
+{
+	char delimc = COPYOPS_DELIMITER;
+	int fieldno;
+	int null_print_len = strlen(COPYOPS_NULL_PRINT);
+	char *origin_ptr;
+	char *output_ptr;
+	char *cur_ptr;
+	char *line_end_ptr;
+	int fields = tupdesc->natts;
+	char **raw_fields;
+	Form_pg_attribute *attr = tupdesc->attrs;
+
+	/* Adjust number of fields depending on dropped attributes */
+	for (fieldno = 0; fieldno < tupdesc->natts; fieldno++)
+	{
+		if (attr[fieldno]->attisdropped)
+			fields--;
+	}
+
+	/* Then alloc necessary space */
+	raw_fields = (char **) palloc(fields * sizeof(char *));
+
+	/*
+	 * Take a copy of message to manipulate. Only len bytes are copied:
+	 * the input buffer is not guaranteed to contain a terminator, so
+	 * reading message[len] could run one byte past its end. The
+	 * terminator is added explicitly just below.
+	 */
+	origin_ptr = (char *) palloc0(sizeof(char) * (len + 1));
+	memcpy(origin_ptr, message, len);
+
+	/* Add clean separator '\0' at the end of message */
+	origin_ptr[len] = '\0';
+
+	/* Keep track of original pointer */
+	output_ptr = origin_ptr;
+
+	/* set pointer variables for loop */
+	cur_ptr = message;
+	line_end_ptr = message + len;
+
+	/* Outer loop iterates over fields */
+	fieldno = 0;
+	for (;;)
+	{
+		char *start_ptr;
+		char *end_ptr;
+		int input_len;
+		bool found_delim = false;
+		bool saw_non_ascii = false;
+
+		/* Make sure there is enough space for the next value */
+		if (fieldno >= fields)
+		{
+			fields *= 2;
+			raw_fields = repalloc(raw_fields, fields * sizeof(char *));
+		}
+
+		/* Remember start of field on both input and output sides */
+		start_ptr = cur_ptr;
+		raw_fields[fieldno] = output_ptr;
+
+		/* Scan data for field */
+		for (;;)
+		{
+			char c;
+
+			end_ptr = cur_ptr;
+			if (cur_ptr >= line_end_ptr)
+				break;
+			c = *cur_ptr++;
+			if (c == delimc)
+			{
+				found_delim = true;
+				break;
+			}
+			if (c == '\\')
+			{
+				if (cur_ptr >= line_end_ptr)
+					break;
+				c = *cur_ptr++;
+				switch (c)
+				{
+					case '0':
+					case '1':
+					case '2':
+					case '3':
+					case '4':
+					case '5':
+					case '6':
+					case '7':
+						{
+							/* handle \013 */
+							int val;
+
+							val = OCTVALUE(c);
+							if (cur_ptr < line_end_ptr)
+							{
+								c = *cur_ptr;
+								if (ISOCTAL(c))
+								{
+									cur_ptr++;
+									val = (val << 3) + OCTVALUE(c);
+									if (cur_ptr < line_end_ptr)
+									{
+										c = *cur_ptr;
+										if (ISOCTAL(c))
+										{
+											cur_ptr++;
+											val = (val << 3) + OCTVALUE(c);
+										}
+									}
+								}
+							}
+							c = val & 0377;
+							if (c == '\0' || IS_HIGHBIT_SET(c))
+								saw_non_ascii = true;
+						}
+						break;
+					case 'x':
+						/* Handle \x3F */
+						if (cur_ptr < line_end_ptr)
+						{
+							char hexchar = *cur_ptr;
+
+							if (isxdigit((unsigned char) hexchar))
+							{
+								int val = get_decimal_from_hex(hexchar);
+
+								cur_ptr++;
+								if (cur_ptr < line_end_ptr)
+								{
+									hexchar = *cur_ptr;
+									if (isxdigit((unsigned char) hexchar))
+									{
+										cur_ptr++;
+										val = (val << 4) + get_decimal_from_hex(hexchar);
+									}
+								}
+								c = val & 0xff;
+								if (c == '\0' || IS_HIGHBIT_SET(c))
+									saw_non_ascii = true;
+							}
+						}
+						break;
+					case 'b':
+						c = '\b';
+						break;
+					case 'f':
+						c = '\f';
+						break;
+					case 'n':
+						c = '\n';
+						break;
+					case 'r':
+						c = '\r';
+						break;
+					case 't':
+						c = '\t';
+						break;
+					case 'v':
+						c = '\v';
+						break;
+
+					/*
+					 * in all other cases, take the char after '\'
+					 * literally
+					 */
+				}
+			}
+
+			/* Add c to output string */
+			*output_ptr++ = c;
+		}
+
+		/* Terminate attribute value in output area */
+		*output_ptr++ = '\0';
+
+		/*
+		 * If we de-escaped a non-7-bit-ASCII char, make sure we still have
+		 * valid data for the db encoding. Avoid calling strlen here for the
+		 * sake of efficiency.
+		 */
+		if (saw_non_ascii)
+		{
+			char *fld = raw_fields[fieldno];
+
+			pg_verifymbstr(fld, output_ptr - (fld + 1), false);
+		}
+
+		/* Check whether raw input matched null marker */
+		input_len = end_ptr - start_ptr;
+		if (input_len == null_print_len &&
+			strncmp(start_ptr, COPYOPS_NULL_PRINT, input_len) == 0)
+			raw_fields[fieldno] = NULL;
+
+		fieldno++;
+		/* Done if we hit EOL instead of a delim */
+		if (!found_delim)
+			break;
+	}
+
+	/* Clean up state of attribute_buf */
+	output_ptr--;
+	Assert(*output_ptr == '\0');
+
+	return raw_fields;
+}
+
+/*
+ * CopyOps_BuildOneRowTo
+ * Build one row message to be sent to remote nodes through COPY protocol
+ *
+ * Returns a palloc'd, NUL-terminated string holding the row in COPY text
+ * format: fields separated by COPYOPS_DELIMITER, NULLs printed as
+ * COPYOPS_NULL_PRINT, dropped attributes skipped. *len receives the
+ * message length.
+ */
+char *
+CopyOps_BuildOneRowTo(TupleDesc tupdesc, Datum *values, bool *nulls, int *len)
+{
+	bool need_delim = false;
+	char *res;
+	int i;
+	FmgrInfo *out_functions;
+	Form_pg_attribute *attr = tupdesc->attrs;
+	StringInfo buf;
+
+	/* Get info about the columns we need to process. */
+	out_functions = (FmgrInfo *) palloc(tupdesc->natts * sizeof(FmgrInfo));
+	for (i = 0; i < tupdesc->natts; i++)
+	{
+		Oid out_func_oid;
+		bool isvarlena;
+
+		/* Do not need any information for dropped attributes */
+		if (attr[i]->attisdropped)
+			continue;
+
+		getTypeOutputInfo(attr[i]->atttypid,
+						  &out_func_oid,
+						  &isvarlena);
+		fmgr_info(out_func_oid, &out_functions[i]);
+	}
+
+	/* Initialize output buffer */
+	buf = makeStringInfo();
+
+	for (i = 0; i < tupdesc->natts; i++)
+	{
+		Datum value = values[i];
+		bool isnull = nulls[i];
+
+		/* Do not need any information for dropped attributes */
+		if (attr[i]->attisdropped)
+			continue;
+
+		if (need_delim)
+			appendStringInfoCharMacro(buf, COPYOPS_DELIMITER);
+		need_delim = true;
+
+		if (isnull)
+		{
+			/* Use the NULL marker defined for this module for consistency */
+			appendBinaryStringInfo(buf, COPYOPS_NULL_PRINT, strlen(COPYOPS_NULL_PRINT));
+		}
+		else
+		{
+			/* Run the type's output function, then escape for COPY text */
+			char *string;
+			string = OutputFunctionCall(&out_functions[i],
+										value);
+			attribute_out_text(buf, string);
+			pfree(string);
+		}
+	}
+
+	/* Record length of message */
+	*len = buf->len;
+	res = pstrdup(buf->data);
+	pfree(out_functions);
+	pfree(buf->data);
+	pfree(buf);
+	return res;
+}
diff --git a/src/backend/pgxc/copy/remotecopy.c b/src/backend/pgxc/copy/remotecopy.c
index 8c3eba0bff..5c0299dc64 100644
--- a/src/backend/pgxc/copy/remotecopy.c
+++ b/src/backend/pgxc/copy/remotecopy.c
@@ -167,7 +167,6 @@ RemoteCopy_BuildStatement(RemoteCopyData *state,
else
appendStringInfoString(&state->query_buf, " TO STDOUT");
-
if (options->rco_binary)
appendStringInfoString(&state->query_buf, " BINARY");
@@ -201,7 +200,6 @@ RemoteCopy_BuildStatement(RemoteCopyData *state,
* It is not necessary to send the HEADER part to Datanodes.
* Sending data is sufficient.
*/
-
if (options->rco_quote && options->rco_quote[0] != '"')
{
appendStringInfoString(&state->query_buf, " QUOTE AS ");
@@ -245,6 +243,26 @@ RemoteCopy_BuildStatement(RemoteCopyData *state,
/*
+ * Build a default set for RemoteCopyOptions
+ */
+RemoteCopyOptions *
+makeRemoteCopyOptions(void)
+{
+	RemoteCopyOptions *options;
+
+	/*
+	 * Build an option set carrying neutral defaults: text mode, no OIDs,
+	 * and no user-specified format options.
+	 */
+	options = (RemoteCopyOptions *) palloc(sizeof(RemoteCopyOptions));
+	options->rco_binary = false;
+	options->rco_csv_mode = false;
+	options->rco_oids = false;
+	options->rco_delim = NULL;
+	options->rco_quote = NULL;
+	options->rco_escape = NULL;
+	options->rco_null_print = NULL;
+	options->rco_force_quote = NIL;
+	options->rco_force_notnull = NIL;
+	return options;
+}
+
+
+/*
* FreeRemoteCopyOptions
* Free remote COPY options structure
*/
diff --git a/src/backend/pgxc/locator/Makefile b/src/backend/pgxc/locator/Makefile
index 107fe0f601..66c4c50d2d 100644
--- a/src/backend/pgxc/locator/Makefile
+++ b/src/backend/pgxc/locator/Makefile
@@ -1,7 +1,7 @@
#-------------------------------------------------------------------------
#
# Makefile--
-# Makefile for locator
+# Makefile for locator and data distribution
#
# Copyright(C) 2010-2012 Postgres-XC Development Group
#
@@ -14,6 +14,6 @@ subdir = src/backend/pgxc/locator
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = locator.o
+OBJS = locator.o redistrib.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c
index feab0a1f9e..b5b920a443 100644
--- a/src/backend/pgxc/locator/locator.c
+++ b/src/backend/pgxc/locator/locator.c
@@ -440,7 +440,6 @@ IsModuloColumn(RelationLocInfo *rel_loc_info, char *part_col_name)
/*
* IsModuloColumnForRelId - return whether or not column for relation is used for modulo distribution.
- *
*/
bool
IsModuloColumnForRelId(Oid relid, char *part_col_name)
@@ -502,6 +501,42 @@ IsTableDistOnPrimary(RelationLocInfo *rel_loc_info)
return false;
}
+
+/*
+ * IsLocatorInfoEqual
+ * Check equality of given locator information
+ *
+ * Two locator infos are equal when they describe the same relation with
+ * the same locator type, distribution column and node membership.
+ */
+bool
+IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info2)
+{
+	Assert(rel_loc_info1 && rel_loc_info2);
+
+	/* Relation, locator type and distribution column must all match */
+	if (rel_loc_info1->relid != rel_loc_info2->relid ||
+		rel_loc_info1->locatorType != rel_loc_info2->locatorType ||
+		rel_loc_info1->partAttrNum != rel_loc_info2->partAttrNum)
+		return false;
+
+	/*
+	 * Node lists must contain the same members; ordering does not matter.
+	 * Both directions are checked since list_difference_int is asymmetric.
+	 */
+	if (list_difference_int(rel_loc_info1->nodeList, rel_loc_info2->nodeList) != NIL ||
+		list_difference_int(rel_loc_info2->nodeList, rel_loc_info1->nodeList) != NIL)
+		return false;
+
+	/* Everything is equal */
+	return true;
+}
+
+
/*
* GetRelationNodes
*
diff --git a/src/backend/pgxc/locator/redistrib.c b/src/backend/pgxc/locator/redistrib.c
new file mode 100644
index 0000000000..264f01b1d1
--- /dev/null
+++ b/src/backend/pgxc/locator/redistrib.c
@@ -0,0 +1,871 @@
+/*-------------------------------------------------------------------------
+ *
+ * redistrib.c
+ * Routines related to online data redistribution
+ *
+ * Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/pgxc/locator/redistrib.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "miscadmin.h"
+
+#include "access/hash.h"
+#include "access/htup.h"
+#include "access/xact.h"
+#include "catalog/pg_type.h"
+#include "catalog/pgxc_node.h"
+#include "commands/tablecmds.h"
+#include "pgxc/copyops.h"
+#include "pgxc/execRemote.h"
+#include "pgxc/pgxc.h"
+#include "pgxc/redistrib.h"
+#include "pgxc/remotecopy.h"
+#include "utils/lsyscache.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+
+#define IsCommandTypePreUpdate(x) (x == CATALOG_UPDATE_BEFORE || \
+ x == CATALOG_UPDATE_BOTH)
+#define IsCommandTypePostUpdate(x) (x == CATALOG_UPDATE_AFTER || \
+ x == CATALOG_UPDATE_BOTH)
+
+/* Functions used for the execution of redistribution commands */
+static void distrib_execute_query(char *sql, bool is_temp, ExecNodes *exec_nodes);
+static void distrib_execute_command(RedistribState *distribState, RedistribCommand *command);
+static void distrib_copy_to(RedistribState *distribState);
+static void distrib_copy_from(RedistribState *distribState, ExecNodes *exec_nodes);
+static void distrib_truncate(RedistribState *distribState, ExecNodes *exec_nodes);
+static void distrib_reindex(RedistribState *distribState, ExecNodes *exec_nodes);
+static void distrib_delete_hash(RedistribState *distribState, ExecNodes *exec_nodes);
+
+/* Functions used to build the command list */
+static void pgxc_redist_build_entry(RedistribState *distribState,
+ RelationLocInfo *oldLocInfo,
+ RelationLocInfo *newLocInfo);
+static void pgxc_redist_build_replicate(RedistribState *distribState,
+ RelationLocInfo *oldLocInfo,
+ RelationLocInfo *newLocInfo);
+static void pgxc_redist_build_replicate_to_distrib(RedistribState *distribState,
+ RelationLocInfo *oldLocInfo,
+ RelationLocInfo *newLocInfo);
+
+static void pgxc_redist_build_default(RedistribState *distribState);
+static void pgxc_redist_add_reindex(RedistribState *distribState);
+
+
+/*
+ * PGXCRedistribTable
+ * Execute redistribution operations after catalog update
+ */
+void
+PGXCRedistribTable(RedistribState *distribState, RedistribCatalog type)
+{
+ ListCell *item;
+
+ /* Nothing to do if no redistribution operation */
+ if (!distribState)
+ return;
+
+ /* Nothing to do if on remote node */
+ if (IS_PGXC_DATANODE || IsConnFromCoord())
+ return;
+
+ /* Execute each command if necessary */
+ foreach(item, distribState->commands)
+ {
+ RedistribCommand *command = (RedistribCommand *)lfirst(item);
+
+ /* Check if command can be run */
+ if (!IsCommandTypePostUpdate(type) &&
+ IsCommandTypePostUpdate(command->updateState))
+ continue;
+ if (!IsCommandTypePreUpdate(type) &&
+ IsCommandTypePreUpdate(command->updateState))
+ continue;
+
+ /* Now enter in execution list */
+ distrib_execute_command(distribState, command);
+ }
+}
+
+
+/*
+ * PGXCRedistribCreateCommandList
+ * Look for the list of necessary commands to perform table redistribution.
+ */
+void
+PGXCRedistribCreateCommandList(RedistribState *distribState, RelationLocInfo *newLocInfo)
+{
+ Relation rel;
+ RelationLocInfo *oldLocInfo;
+
+ rel = relation_open(distribState->relid, NoLock);
+ oldLocInfo = RelationGetLocInfo(rel);
+
+ /* Build redistribution command list */
+ pgxc_redist_build_entry(distribState, oldLocInfo, newLocInfo);
+
+ relation_close(rel, NoLock);
+}
+
+
+/*
+ * pgxc_redist_build_entry
+ * Entry point for command list building
+ */
+static void
+pgxc_redist_build_entry(RedistribState *distribState,
+ RelationLocInfo *oldLocInfo,
+ RelationLocInfo *newLocInfo)
+{
+ /* If distribution has not changed at all, nothing to do */
+ if (IsLocatorInfoEqual(oldLocInfo, newLocInfo))
+ return;
+
+ /* Evaluate cases for replicated tables */
+ pgxc_redist_build_replicate(distribState, oldLocInfo, newLocInfo);
+
+ /* Evaluate cases for replicated to distributed tables */
+ pgxc_redist_build_replicate_to_distrib(distribState, oldLocInfo, newLocInfo);
+
+ /* PGXCTODO: perform more complex builds of command list */
+
+ /* Fallback to default */
+ pgxc_redist_build_default(distribState);
+}
+
+
+/*
+ * pgxc_redist_build_replicate_to_distrib
+ * Build redistribution command list from replicated to distributed
+ * table.
+ */
+static void
+pgxc_redist_build_replicate_to_distrib(RedistribState *distribState,
+ RelationLocInfo *oldLocInfo,
+ RelationLocInfo *newLocInfo)
+{
+ List *removedNodes;
+ List *newNodes;
+
+ /* If a command list has already been built, nothing to do */
+ if (list_length(distribState->commands) != 0)
+ return;
+
+ /* Redistribution is done from replication to distributed (with value) */
+ if (!IsLocatorReplicated(oldLocInfo->locatorType) ||
+ !IsLocatorDistributedByValue(newLocInfo->locatorType))
+ return;
+
+	/* Get the list of nodes that are removed from the relation */
+ removedNodes = list_difference_int(oldLocInfo->nodeList, newLocInfo->nodeList);
+
+	/* Get the list of nodes that are added to the relation */
+ newNodes = list_difference_int(newLocInfo->nodeList, oldLocInfo->nodeList);
+
+ /*
+ * If some nodes are added, turn back to default, we need to fetch data
+ * and then redistribute it properly.
+ */
+ if (newNodes != NIL)
+ return;
+
+	/* Nodes removed have to be truncated, so add a TRUNCATE command to removed nodes */
+ if (removedNodes != NIL)
+ {
+ ExecNodes *execNodes = makeNode(ExecNodes);
+ execNodes->nodeList = removedNodes;
+ /* Add TRUNCATE command */
+ distribState->commands = lappend(distribState->commands,
+ makeRedistribCommand(DISTRIB_TRUNCATE, CATALOG_UPDATE_BEFORE, execNodes));
+ }
+
+ /*
+ * If the table is redistributed to a single node, a TRUNCATE on removed nodes
+ * is sufficient so leave here.
+ */
+ if (list_length(newLocInfo->nodeList) == 1)
+ {
+ /* Add REINDEX command if necessary */
+ pgxc_redist_add_reindex(distribState);
+ return;
+ }
+
+ /*
+	 * If we are here we are sure that redistribution only requires deleting data on the
+	 * new subset of remote nodes. So launch on each remote node a DELETE command that only
+	 * eliminates the data not satisfying the new hashing condition.
+ */
+ if (newLocInfo->locatorType == LOCATOR_TYPE_HASH)
+ {
+ ExecNodes *execNodes = makeNode(ExecNodes);
+ execNodes->nodeList = newLocInfo->nodeList;
+ distribState->commands = lappend(distribState->commands,
+ makeRedistribCommand(DISTRIB_DELETE_HASH, CATALOG_UPDATE_AFTER, execNodes));
+ }
+ else if (newLocInfo->locatorType == LOCATOR_TYPE_MODULO)
+ {
+ ExecNodes *execNodes = makeNode(ExecNodes);
+ execNodes->nodeList = newLocInfo->nodeList;
+ distribState->commands = lappend(distribState->commands,
+ makeRedistribCommand(DISTRIB_DELETE_MODULO, CATALOG_UPDATE_AFTER, execNodes));
+ }
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("Incorrect redistribution operation")));
+
+ /* Add REINDEX command if necessary */
+ pgxc_redist_add_reindex(distribState);
+}
+
+
+/*
+ * pgxc_redist_build_replicate
+ * Build redistribution command list for replicated tables
+ */
+static void
+pgxc_redist_build_replicate(RedistribState *distribState,
+ RelationLocInfo *oldLocInfo,
+ RelationLocInfo *newLocInfo)
+{
+ List *removedNodes;
+ List *newNodes;
+
+ /* If a command list has already been built, nothing to do */
+ if (list_length(distribState->commands) != 0)
+ return;
+
+ /* Case of a replicated table whose set of nodes is changed */
+ if (!IsLocatorReplicated(newLocInfo->locatorType) ||
+ !IsLocatorReplicated(oldLocInfo->locatorType))
+ return;
+
+	/* Get the list of nodes that are removed from the relation */
+ removedNodes = list_difference_int(oldLocInfo->nodeList, newLocInfo->nodeList);
+
+	/* Get the list of nodes that are added to the relation */
+ newNodes = list_difference_int(newLocInfo->nodeList, oldLocInfo->nodeList);
+
+ /*
+ * If nodes have to be added, we need to fetch data for redistribution first.
+ * So add a COPY TO command to fetch data.
+ */
+ if (newNodes != NIL)
+ {
+ /* Add COPY TO command */
+ distribState->commands = lappend(distribState->commands,
+ makeRedistribCommand(DISTRIB_COPY_TO, CATALOG_UPDATE_BEFORE, NULL));
+ }
+
+	/* Nodes removed have to be truncated, so add a TRUNCATE command to removed nodes */
+ if (removedNodes != NIL)
+ {
+ ExecNodes *execNodes = makeNode(ExecNodes);
+ execNodes->nodeList = removedNodes;
+ /* Add TRUNCATE command */
+ distribState->commands = lappend(distribState->commands,
+ makeRedistribCommand(DISTRIB_TRUNCATE, CATALOG_UPDATE_BEFORE, execNodes));
+ }
+
+ /* If necessary, COPY the data obtained at first step to the new nodes. */
+ if (newNodes != NIL)
+ {
+ ExecNodes *execNodes = makeNode(ExecNodes);
+ execNodes->nodeList = newNodes;
+ /* Add COPY FROM command */
+ distribState->commands = lappend(distribState->commands,
+ makeRedistribCommand(DISTRIB_COPY_FROM, CATALOG_UPDATE_AFTER, execNodes));
+ }
+
+ /* Add REINDEX command if necessary */
+ pgxc_redist_add_reindex(distribState);
+}
+
+
+/*
+ * pgxc_redist_build_default
+ * Build a default list consisting of
+ * COPY TO -> TRUNCATE -> COPY FROM ( -> REINDEX )
+ */
+static void
+pgxc_redist_build_default(RedistribState *distribState)
+{
+ /* If a command list has already been built, nothing to do */
+ if (list_length(distribState->commands) != 0)
+ return;
+
+ /* COPY TO command */
+ distribState->commands = lappend(distribState->commands,
+ makeRedistribCommand(DISTRIB_COPY_TO, CATALOG_UPDATE_BEFORE, NULL));
+ /* TRUNCATE command */
+ distribState->commands = lappend(distribState->commands,
+ makeRedistribCommand(DISTRIB_TRUNCATE, CATALOG_UPDATE_BEFORE, NULL));
+ /* COPY FROM command */
+ distribState->commands = lappend(distribState->commands,
+ makeRedistribCommand(DISTRIB_COPY_FROM, CATALOG_UPDATE_AFTER, NULL));
+
+ /* REINDEX command */
+ pgxc_redist_add_reindex(distribState);
+}
+
+
+/*
+ * pgxc_redist_add_reindex
+ * Add a reindex command if necessary
+ */
+static void
+pgxc_redist_add_reindex(RedistribState *distribState)
+{
+ Relation rel;
+
+ rel = relation_open(distribState->relid, NoLock);
+
+ /* Build REINDEX command if necessary */
+ if (RelationGetIndexList(rel) != NIL)
+ {
+ distribState->commands = lappend(distribState->commands,
+ makeRedistribCommand(DISTRIB_REINDEX, CATALOG_UPDATE_AFTER, NULL));
+ }
+
+ relation_close(rel, NoLock);
+}
+
+
+/*
+ * distrib_execute_command
+ * Execute a redistribution operation
+ */
+static void
+distrib_execute_command(RedistribState *distribState, RedistribCommand *command)
+{
+ /* Execute redistribution command */
+ switch (command->type)
+ {
+ case DISTRIB_COPY_TO:
+ distrib_copy_to(distribState);
+ break;
+ case DISTRIB_COPY_FROM:
+ distrib_copy_from(distribState, command->execNodes);
+ break;
+ case DISTRIB_TRUNCATE:
+ distrib_truncate(distribState, command->execNodes);
+ break;
+ case DISTRIB_REINDEX:
+ distrib_reindex(distribState, command->execNodes);
+ break;
+ case DISTRIB_DELETE_HASH:
+ case DISTRIB_DELETE_MODULO:
+ distrib_delete_hash(distribState, command->execNodes);
+ break;
+ case DISTRIB_NONE:
+ default:
+ Assert(0); /* Should not happen */
+ }
+}
+
+
+/*
+ * distrib_copy_to
+ * Copy all the data of table to be distributed.
+ * This data is saved in a tuplestore saved in distribution state.
+ * A COPY TO operation is always done on nodes determined by the locator data
+ * in catalogs, explaining why this cannot be done on a subset of nodes. It also
+ * ensures that no read operations are done on nodes where data is not yet located.
+ */
+static void
+distrib_copy_to(RedistribState *distribState)
+{
+ Oid relOid = distribState->relid;
+ Relation rel;
+ RemoteCopyOptions *options;
+ RemoteCopyData *copyState;
+ Tuplestorestate *store; /* Storage of redistributed data */
+
+ /* Fetch necessary data to prepare for the table data acquisition */
+ options = makeRemoteCopyOptions();
+
+ /* All the fields are separated by tabs in redistribution */
+ options->rco_delim = palloc(2);
+ options->rco_delim[0] = COPYOPS_DELIMITER;
+ options->rco_delim[1] = '\0';
+
+ copyState = (RemoteCopyData *) palloc0(sizeof(RemoteCopyData));
+ copyState->is_from = false;
+
+ /* A sufficient lock level needs to be taken at a higher level */
+ rel = relation_open(relOid, NoLock);
+ RemoteCopy_GetRelationLoc(copyState, rel, NIL);
+ RemoteCopy_BuildStatement(copyState, rel, options, NIL, NIL);
+
+ /* Inform client of operation being done */
+ ereport(DEBUG1,
+ (errmsg("Copying data for relation \"%s.%s\"",
+ get_namespace_name(RelationGetNamespace(rel)),
+ RelationGetRelationName(rel))));
+
+ /* Begin the COPY process */
+ copyState->connections = DataNodeCopyBegin(copyState->query_buf.data,
+ copyState->exec_nodes->nodeList,
+ GetActiveSnapshot());
+
+ /* Create tuplestore storage */
+ store = tuplestore_begin_heap(true, false, work_mem);
+
+ /* Then get rows and copy them to the tuplestore used for redistribution */
+ DataNodeCopyOut(copyState->exec_nodes,
+ copyState->connections,
+ RelationGetDescr(rel), /* Need also to set up the tuple descriptor */
+ NULL,
+ store, /* Tuplestore used for redistribution */
+ REMOTE_COPY_TUPLESTORE);
+
+ /* Do necessary clean-up */
+ FreeRemoteCopyOptions(options);
+
+ /* Lock is maintained until transaction commits */
+ relation_close(rel, NoLock);
+
+ /* Save results */
+ distribState->store = store;
+}
+
+
+/*
+ * distrib_copy_from
+ * Execute commands related to COPY FROM
+ * Redistribute all the data of table with a COPY FROM from given tuplestore.
+ */
+static void
+distrib_copy_from(RedistribState *distribState, ExecNodes *exec_nodes)
+{
+ Oid relOid = distribState->relid;
+ Tuplestorestate *store = distribState->store;
+ Relation rel;
+ RemoteCopyOptions *options;
+ RemoteCopyData *copyState;
+ bool replicated, contains_tuple = true;
+ TupleDesc tupdesc;
+
+ /* Nothing to do if on remote node */
+ if (IS_PGXC_DATANODE || IsConnFromCoord())
+ return;
+
+ /* Fetch necessary data to prepare for the table data acquisition */
+ options = makeRemoteCopyOptions();
+ /* All the fields are separated by tabs in redistribution */
+ options->rco_delim = palloc(2);
+ options->rco_delim[0] = COPYOPS_DELIMITER;
+ options->rco_delim[1] = '\0';
+
+ copyState = (RemoteCopyData *) palloc0(sizeof(RemoteCopyData));
+ copyState->is_from = true;
+
+ /* A sufficient lock level needs to be taken at a higher level */
+ rel = relation_open(relOid, NoLock);
+ RemoteCopy_GetRelationLoc(copyState, rel, NIL);
+ RemoteCopy_BuildStatement(copyState, rel, options, NIL, NIL);
+
+ /*
+ * When building COPY FROM command in redistribution list,
+ * use the list of nodes that has been calculated there.
+ * It might be possible that this COPY is done only on a portion of nodes.
+ */
+ if (exec_nodes && exec_nodes->nodeList != NIL)
+ {
+ copyState->exec_nodes->nodeList = exec_nodes->nodeList;
+ copyState->rel_loc->nodeList = exec_nodes->nodeList;
+ }
+
+ tupdesc = RelationGetDescr(rel);
+
+ /* Inform client of operation being done */
+ ereport(DEBUG1,
+ (errmsg("Redistributing data for relation \"%s.%s\"",
+ get_namespace_name(RelationGetNamespace(rel)),
+ RelationGetRelationName(rel))));
+
+ /* Begin redistribution on remote nodes */
+ copyState->connections = DataNodeCopyBegin(copyState->query_buf.data,
+ copyState->exec_nodes->nodeList,
+ GetActiveSnapshot());
+
+ /* Transform each tuple stored into a COPY message and send it to remote nodes */
+ while (contains_tuple)
+ {
+ char *data;
+ int len;
+ Form_pg_attribute *attr = tupdesc->attrs;
+ Datum dist_col_value = (Datum) 0;
+ bool dist_col_is_null = true;
+ Oid dist_col_type = UNKNOWNOID;
+ TupleTableSlot *slot;
+ ExecNodes *local_execnodes;
+
+ /* Build table slot for this relation */
+ slot = MakeSingleTupleTableSlot(tupdesc);
+
+ /* Get tuple slot from the tuplestore */
+ contains_tuple = tuplestore_gettupleslot(store, true, false, slot);
+ if (!contains_tuple)
+ {
+ ExecDropSingleTupleTableSlot(slot);
+ break;
+ }
+
+ /* Make sure the tuple is fully deconstructed */
+ slot_getallattrs(slot);
+
+ /* Find value of distribution column if necessary */
+ if (copyState->idx_dist_by_col >= 0)
+ {
+ dist_col_value = slot->tts_values[copyState->idx_dist_by_col];
+ dist_col_is_null = slot->tts_isnull[copyState->idx_dist_by_col];
+ dist_col_type = attr[copyState->idx_dist_by_col]->atttypid;
+ }
+
+ /* Build message to be sent to Datanodes */
+ data = CopyOps_BuildOneRowTo(tupdesc, slot->tts_values, slot->tts_isnull, &len);
+
+ /* Build relation node list */
+ local_execnodes = GetRelationNodes(copyState->rel_loc,
+ dist_col_value,
+ dist_col_is_null,
+ dist_col_type,
+ RELATION_ACCESS_INSERT);
+ /* Take a copy of the node lists so as not to interfere with locator info */
+ local_execnodes->primarynodelist = list_copy(local_execnodes->primarynodelist);
+ local_execnodes->nodeList = list_copy(local_execnodes->nodeList);
+
+ /* Process data to Datanodes */
+ DataNodeCopyIn(data,
+ len,
+ local_execnodes,
+ copyState->connections);
+
+ /* Clean up */
+ pfree(data);
+ FreeExecNodes(&local_execnodes);
+ ExecClearTuple(slot);
+ ExecDropSingleTupleTableSlot(slot);
+ }
+
+ /* Finish the redistribution process */
+ replicated = copyState->rel_loc->locatorType == LOCATOR_TYPE_REPLICATED;
+ DataNodeCopyFinish(copyState->connections,
+ replicated ? PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE) : -1,
+ replicated ? COMBINE_TYPE_SAME : COMBINE_TYPE_SUM);
+
+ /* Lock is maintained until transaction commits */
+ relation_close(rel, NoLock);
+}
+
+
+/*
+ * distrib_truncate
+ * Truncate all the data of specified table.
+ * This is used as a second step of online data redistribution.
+ */
+static void
+distrib_truncate(RedistribState *distribState, ExecNodes *exec_nodes)
+{
+ Relation rel;
+ StringInfo buf;
+ Oid relOid = distribState->relid;
+
+ /* Nothing to do if on remote node */
+ if (IS_PGXC_DATANODE || IsConnFromCoord())
+ return;
+
+ /* A sufficient lock level needs to be taken at a higher level */
+ rel = relation_open(relOid, NoLock);
+
+ /* Inform client of operation being done */
+ ereport(DEBUG1,
+ (errmsg("Truncating data for relation \"%s.%s\"",
+ get_namespace_name(RelationGetNamespace(rel)),
+ RelationGetRelationName(rel))));
+
+ /* Initialize buffer */
+ buf = makeStringInfo();
+
+ /* Build query to clean up table before redistribution */
+ appendStringInfo(buf, "TRUNCATE %s.%s",
+ get_namespace_name(RelationGetNamespace(rel)),
+ RelationGetRelationName(rel));
+
+ /*
+ * Lock is maintained until transaction commits,
+ * relation needs also to be closed before effectively launching the query.
+ */
+ relation_close(rel, NoLock);
+
+ /* Execute the query */
+ distrib_execute_query(buf->data, IsTempTable(relOid), exec_nodes);
+
+ /* Clean buffers */
+ pfree(buf->data);
+ pfree(buf);
+}
+
+
+/*
+ * distrib_reindex
+ * Reindex the table that has been redistributed
+ */
+static void
+distrib_reindex(RedistribState *distribState, ExecNodes *exec_nodes)
+{
+ Relation rel;
+ StringInfo buf;
+ Oid relOid = distribState->relid;
+
+ /* Nothing to do if on remote node */
+ if (IS_PGXC_DATANODE || IsConnFromCoord())
+ return;
+
+ /* A sufficient lock level needs to be taken at a higher level */
+ rel = relation_open(relOid, NoLock);
+
+ /* Inform client of operation being done */
+ ereport(DEBUG1,
+ (errmsg("Reindexing relation \"%s.%s\"",
+ get_namespace_name(RelationGetNamespace(rel)),
+ RelationGetRelationName(rel))));
+
+ /* Initialize buffer */
+ buf = makeStringInfo();
+
+ /* Generate the query */
+ appendStringInfo(buf, "REINDEX TABLE %s.%s",
+ get_namespace_name(RelationGetNamespace(rel)),
+ RelationGetRelationName(rel));
+
+ /* Execute the query */
+ distrib_execute_query(buf->data, IsTempTable(relOid), exec_nodes);
+
+ /* Clean buffers */
+ pfree(buf->data);
+ pfree(buf);
+
+ /* Lock is maintained until transaction commits */
+ relation_close(rel, NoLock);
+}
+
+
+/*
+ * distrib_delete_hash
+ * Perform a partial deletion of the remote tuples that do not satisfy the new hash
+ * condition. The new distribution condition is set up in exec_nodes when building
+ * the command list.
+ */
+static void
+distrib_delete_hash(RedistribState *distribState, ExecNodes *exec_nodes)
+{
+ Relation rel;
+ StringInfo buf;
+ Oid relOid = distribState->relid;
+ ListCell *item;
+
+ /* Nothing to do if on remote node */
+ if (IS_PGXC_DATANODE || IsConnFromCoord())
+ return;
+
+ /* A sufficient lock level needs to be taken at a higher level */
+ rel = relation_open(relOid, NoLock);
+
+ /* Inform client of operation being done */
+ ereport(DEBUG1,
+ (errmsg("Deleting necessary tuples \"%s.%s\"",
+ get_namespace_name(RelationGetNamespace(rel)),
+ RelationGetRelationName(rel))));
+
+ /* Initialize buffer */
+ buf = makeStringInfo();
+
+ /* Build query to clean up table before redistribution */
+ appendStringInfo(buf, "DELETE FROM %s.%s",
+ get_namespace_name(RelationGetNamespace(rel)),
+ RelationGetRelationName(rel));
+
+ /*
+ * Launch the DELETE query to each node as the DELETE depends on
+ * local conditions for each node.
+ */
+ foreach(item, exec_nodes->nodeList)
+ {
+ StringInfo buf2;
+ char *hashfuncname, *colname;
+ Oid hashtype;
+ RelationLocInfo *locinfo = RelationGetLocInfo(rel);
+ int nodenum = lfirst_int(item);
+ int nodepos = 0;
+ ExecNodes *local_exec_nodes = makeNode(ExecNodes);
+ TupleDesc tupDesc = RelationGetDescr(rel);
+ Form_pg_attribute *attr = tupDesc->attrs;
+ ListCell *item2;
+
+ /* Here the query is launched to a unique node */
+ local_exec_nodes->nodeList = lappend_int(NIL, nodenum);
+
+ /* Get the hash type of relation */
+ hashtype = attr[locinfo->partAttrNum - 1]->atttypid;
+
+ /* Get function hash name */
+ hashfuncname = get_compute_hash_function(hashtype, locinfo->locatorType);
+
+ /* Get distribution column name */
+ if (locinfo->locatorType == LOCATOR_TYPE_HASH)
+ colname = GetRelationHashColumn(locinfo);
+ else if (locinfo->locatorType == LOCATOR_TYPE_MODULO)
+ colname = GetRelationModuloColumn(locinfo);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_WRONG_OBJECT_TYPE),
+ errmsg("Incorrect redistribution operation")));
+
+ /*
+ * Find the correct node position in node list of locator information.
+ * So scan the node list and fetch the position of node.
+ */
+ foreach(item2, locinfo->nodeList)
+ {
+ int loc = lfirst_int(item2);
+ if (loc == nodenum)
+ break;
+ nodepos++;
+ }
+
+ /*
+ * Then build the WHERE clause for deletion.
+ * The condition that allows to keep the tuples on remote nodes
+ * is of the type "RemoteNodeNumber != abs(hash_func(dis_col)) % NumDatanodes".
+ * the remote Datanode has no knowledge of its position in cluster so this
+ * number needs to be compiled locally on Coordinator.
+ * Taking the absolute value is necessary as hash may return a negative value.
+ * For hash distributions a condition with correct hash function is used.
+ * For modulo distribution, well we might need a hash function call but not
+	 * all the time, this is determined implicitly by get_compute_hash_function.
+ */
+ buf2 = makeStringInfo();
+ if (hashfuncname)
+ appendStringInfo(buf2, "%s WHERE abs(%s(%s)) %% %d != %d",
+ buf->data, hashfuncname, colname,
+ list_length(locinfo->nodeList), nodepos);
+ else
+ appendStringInfo(buf2, "%s WHERE abs(%s) %% %d != %d", buf->data, colname,
+ list_length(locinfo->nodeList), nodepos);
+
+ /* Then launch this single query */
+ distrib_execute_query(buf2->data, IsTempTable(relOid), local_exec_nodes);
+
+ FreeExecNodes(&local_exec_nodes);
+ pfree(buf2->data);
+ pfree(buf2);
+ }
+
+ relation_close(rel, NoLock);
+
+ /* Clean buffers */
+ pfree(buf->data);
+ pfree(buf);
+}
+
+
+/*
+ * makeRedistribState
+ * Build a distribution state operator
+ */
+RedistribState *
+makeRedistribState(Oid relOid)
+{
+ RedistribState *res = (RedistribState *) palloc(sizeof(RedistribState));
+ res->relid = relOid;
+ res->commands = NIL;
+ res->store = NULL;
+ return res;
+}
+
+
+/*
+ * FreeRedistribState
+ * Free given distribution state
+ */
+void
+FreeRedistribState(RedistribState *state)
+{
+ ListCell *item;
+
+ /* Leave if nothing to do */
+ if (!state)
+ return;
+
+ foreach(item, state->commands)
+ FreeRedistribCommand((RedistribCommand *) lfirst(item));
+ if (list_length(state->commands) > 0)
+ list_free(state->commands);
+ if (state->store)
+ tuplestore_clear(state->store);
+}
+
+/*
+ * makeRedistribCommand
+ * Build a distribution command
+ */
+RedistribCommand *
+makeRedistribCommand(RedistribOperation type, RedistribCatalog updateState, ExecNodes *nodes)
+{
+ RedistribCommand *res = (RedistribCommand *) palloc0(sizeof(RedistribCommand));
+ res->type = type;
+ res->updateState = updateState;
+ res->execNodes = nodes;
+ return res;
+}
+
+/*
+ * FreeRedistribCommand
+ * Free given distribution command
+ */
+void
+FreeRedistribCommand(RedistribCommand *command)
+{
+ ExecNodes *nodes;
+ /* Leave if nothing to do */
+ if (!command)
+ return;
+ nodes = command->execNodes;
+
+ if (nodes)
+ FreeExecNodes(&nodes);
+ pfree(command);
+}
+
+/*
+ * distrib_execute_query
+ * Execute single raw query on given list of nodes
+ */
+static void
+distrib_execute_query(char *sql, bool is_temp, ExecNodes *exec_nodes)
+{
+ RemoteQuery *step = makeNode(RemoteQuery);
+ step->combine_type = COMBINE_TYPE_SAME;
+ step->exec_nodes = exec_nodes;
+ step->sql_statement = pstrdup(sql);
+ step->force_autocommit = false;
+
+ /* Redistribution operations only concern Datanodes */
+ step->exec_type = EXEC_ON_DATANODES;
+ step->is_temp = is_temp;
+ ExecRemoteUtility(step);
+ pfree(step->sql_statement);
+ pfree(step);
+
+ /* Be sure to advance the command counter after the last command */
+ CommandCounterIncrement();
+}
diff --git a/src/backend/pgxc/nodemgr/nodemgr.c b/src/backend/pgxc/nodemgr/nodemgr.c
index 8a28486b5c..68b3e91500 100644
--- a/src/backend/pgxc/nodemgr/nodemgr.c
+++ b/src/backend/pgxc/nodemgr/nodemgr.c
@@ -4,8 +4,7 @@
* Routines to support manipulation of the pgxc_node catalog
* Support concerns CREATE/ALTER/DROP on NODE object.
*
- * Copyright (c) 1996-2010, PostgreSQL Global Development Group
- * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
+ * Copyright (c) 2010-2012 Postgres-XC Development Group
*
*-------------------------------------------------------------------------
*/
diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c
index 9284d9c99f..7fcbebc30d 100644
--- a/src/backend/pgxc/pool/execRemote.c
+++ b/src/backend/pgxc/pool/execRemote.c
@@ -34,6 +34,7 @@
#include "nodes/nodes.h"
#include "nodes/nodeFuncs.h"
#include "optimizer/var.h"
+#include "pgxc/copyops.h"
#include "pgxc/nodemgr.h"
#include "pgxc/poolmgr.h"
#include "storage/ipc.h"
@@ -60,7 +61,7 @@ typedef enum RemoteXactNodeStatus
RXACT_NODE_NONE, /* Initial state */
RXACT_NODE_PREPARE_SENT, /* PREPARE request sent */
RXACT_NODE_PREPARE_FAILED, /* PREPARE failed on the node */
- RXACT_NODE_PREPARED, /* PREARED successfully on the node */
+ RXACT_NODE_PREPARED, /* PREPARED successfully on the node */
RXACT_NODE_COMMIT_SENT, /* COMMIT sent successfully */
RXACT_NODE_COMMIT_FAILED, /* failed to COMMIT on the node */
RXACT_NODE_COMMITTED, /* COMMITTed successfully on the node */
@@ -293,6 +294,7 @@ CreateResponseCombiner(int node_count, CombineType combine_type)
combiner->rowBuffer = NIL;
combiner->tapenodes = NULL;
combiner->initAggregates = true;
+ combiner->remoteCopyType = REMOTE_COPY_NONE;
combiner->copy_file = NULL;
combiner->rqs_cmd_id = FirstCommandId;
@@ -576,12 +578,98 @@ HandleCopyDataRow(RemoteQueryState *combiner, char *msg_body, size_t len)
/* count the row */
combiner->processed++;
- /* If there is a copy file, data has to be sent to the local file */
- if (combiner->copy_file)
- /* write data to the copy file */
- fwrite(msg_body, 1, len, combiner->copy_file);
- else
- pq_putmessage('d', msg_body, len);
+ /* Output remote COPY operation to correct location */
+ switch (combiner->remoteCopyType)
+ {
+ case REMOTE_COPY_FILE:
+ /* Write data directly to file */
+ fwrite(msg_body, 1, len, combiner->copy_file);
+ break;
+ case REMOTE_COPY_STDOUT:
+ /* Send back data to client */
+ pq_putmessage('d', msg_body, len);
+ break;
+ case REMOTE_COPY_TUPLESTORE:
+ {
+ Datum *values;
+ bool *nulls;
+ TupleDesc tupdesc = combiner->tuple_desc;
+ int i, dropped;
+ Form_pg_attribute *attr = tupdesc->attrs;
+ FmgrInfo *in_functions;
+ Oid *typioparams;
+ char **fields;
+
+ values = (Datum *) palloc(tupdesc->natts * sizeof(Datum));
+ nulls = (bool *) palloc(tupdesc->natts * sizeof(bool));
+ in_functions = (FmgrInfo *) palloc(tupdesc->natts * sizeof(FmgrInfo));
+ typioparams = (Oid *) palloc(tupdesc->natts * sizeof(Oid));
+
+ /* Calculate the Oids of input functions */
+ for (i = 0; i < tupdesc->natts; i++)
+ {
+ Oid in_func_oid;
+
+ /* Do not need any information for dropped attributes */
+ if (attr[i]->attisdropped)
+ continue;
+
+ getTypeInputInfo(attr[i]->atttypid,
+ &in_func_oid, &typioparams[i]);
+ fmgr_info(in_func_oid, &in_functions[i]);
+ }
+
+ /*
+ * Convert message into an array of fields.
+ * Last \n is not included in converted message.
+ */
+ fields = CopyOps_RawDataToArrayField(tupdesc, msg_body, len - 1);
+
+ /* Fill in the array values */
+ dropped = 0;
+ for (i = 0; i < tupdesc->natts; i++)
+ {
+ char *string = fields[i - dropped];
+ /* Do not need any information for dropped attributes */
+ if (attr[i]->attisdropped)
+ {
+ dropped++;
+ nulls[i] = true; /* Consider dropped parameter as NULL */
+ continue;
+ }
+
+ /* Find value */
+ values[i] = InputFunctionCall(&in_functions[i],
+ string,
+ typioparams[i],
+ attr[i]->atttypmod);
+ /* Setup value with NULL flag if necessary */
+ if (string == NULL)
+ nulls[i] = true;
+ else
+ nulls[i] = false;
+ }
+
+ /* Then insert the values into tuplestore */
+ tuplestore_putvalues(combiner->tuplestorestate,
+ combiner->tuple_desc,
+ values,
+ nulls);
+
+ /* Clean up everything */
+ if (*fields)
+ pfree(*fields);
+ pfree(fields);
+ pfree(values);
+ pfree(nulls);
+ pfree(in_functions);
+ pfree(typioparams);
+ }
+ break;
+ case REMOTE_COPY_NONE:
+ default:
+ Assert(0); /* Should not happen */
+ }
}
/*
@@ -852,7 +940,15 @@ CloseCombiner(RemoteQueryState *combiner)
if (combiner->connections)
pfree(combiner->connections);
if (combiner->tuple_desc)
- FreeTupleDesc(combiner->tuple_desc);
+ {
+ /*
+ * In the case of a remote COPY with tuplestore, combiner is not
+ * responsible for freeing the tuple store. This is done at an upper
+ * level once data redistribution is completed.
+ */
+ if (combiner->remoteCopyType != REMOTE_COPY_TUPLESTORE)
+ FreeTupleDesc(combiner->tuple_desc);
+ }
if (combiner->errorMessage)
pfree(combiner->errorMessage);
if (combiner->errorDetail)
@@ -2343,7 +2439,12 @@ DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle**
}
uint64
-DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, FILE* copy_file)
+DataNodeCopyOut(ExecNodes *exec_nodes,
+ PGXCNodeHandle** copy_connections,
+ TupleDesc tupleDesc,
+ FILE* copy_file,
+ Tuplestorestate *store,
+ RemoteCopyType remoteCopyType)
{
RemoteQueryState *combiner;
int conn_count = list_length(exec_nodes->nodeList) == 0 ? NumDataNodes : list_length(exec_nodes->nodeList);
@@ -2352,9 +2453,19 @@ DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, FILE*
combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_SUM);
combiner->processed = 0;
- /* If there is an existing file where to copy data, pass it to combiner */
- if (copy_file)
+ combiner->remoteCopyType = remoteCopyType;
+
+ /*
+ * If there is an existing file where to copy data,
+ * pass it to combiner when remote COPY output is sent back to file.
+ */
+ if (copy_file && remoteCopyType == REMOTE_COPY_FILE)
combiner->copy_file = copy_file;
+ if (store && remoteCopyType == REMOTE_COPY_TUPLESTORE)
+ {
+ combiner->tuplestorestate = store;
+ combiner->tuple_desc = tupleDesc;
+ }
foreach(nodeitem, exec_nodes->nodeList)
{
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index bc7006dfb3..777a9369aa 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -358,6 +358,7 @@ extern void hash_desc(StringInfo buf, uint8 xl_info, char *rec);
#ifdef PGXC
extern Datum compute_hash(Oid type, Datum value, char locator);
+extern char *get_compute_hash_function(Oid type, char locator);
#endif
#endif /* HASH_H */
diff --git a/src/include/catalog/pgxc_class.h b/src/include/catalog/pgxc_class.h
index 5a0cd597d3..cb540bf584 100644
--- a/src/include/catalog/pgxc_class.h
+++ b/src/include/catalog/pgxc_class.h
@@ -22,22 +22,37 @@ CATALOG(pgxc_class,9001) BKI_WITHOUT_OIDS
typedef FormData_pgxc_class *Form_pgxc_class;
-#define Natts_pgxc_class 6
+#define Natts_pgxc_class 6
-#define Anum_pgxc_class_pcrelid 1
+#define Anum_pgxc_class_pcrelid 1
#define Anum_pgxc_class_pclocatortype 2
-#define Anum_pgxc_class_pcattnum 3
+#define Anum_pgxc_class_pcattnum 3
#define Anum_pgxc_class_pchashalgorithm 4
#define Anum_pgxc_class_pchashbuckets 5
-#define Anum_pgxc_class_nodes 6
+#define Anum_pgxc_class_nodes 6
+
+typedef enum PgxcClassAlterType
+{
+ PGXC_CLASS_ALTER_DISTRIBUTION,
+ PGXC_CLASS_ALTER_NODES,
+ PGXC_CLASS_ALTER_ALL
+} PgxcClassAlterType;
extern void PgxcClassCreate(Oid pcrelid,
- char pclocatortype,
+ char pclocatortype,
int pcattnum,
int pchashalgorithm,
int pchashbuckets,
int numnodes,
Oid *nodes);
+extern void PgxcClassAlter(Oid pcrelid,
+ char pclocatortype,
+ int pcattnum,
+ int pchashalgorithm,
+ int pchashbuckets,
+ int numnodes,
+ Oid *nodes,
+ PgxcClassAlterType type);
extern void RemovePgxcClass(Oid pcrelid);
#endif /* PGXC_CLASS_H */
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index e8f2317c1b..8a837b39d5 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -1244,6 +1244,12 @@ typedef enum AlterTableType
AT_DropInherit, /* NO INHERIT parent */
AT_AddOf, /* OF <type_name> */
AT_DropOf, /* NOT OF */
+#ifdef PGXC
+ AT_DistributeBy, /* DISTRIBUTE BY ... */
+ AT_SubCluster, /* TO [ NODE nodelist | GROUP groupname ] */
+ AT_AddNodeList, /* ADD NODE nodelist */
+ AT_DeleteNodeList, /* DELETE NODE nodelist */
+#endif
AT_GenericOptions /* OPTIONS (...) */
} AlterTableType;
diff --git a/src/include/pgxc/copyops.h b/src/include/pgxc/copyops.h
new file mode 100644
index 0000000000..862dbbd299
--- /dev/null
+++ b/src/include/pgxc/copyops.h
@@ -0,0 +1,27 @@
+/*--------------------------------------------------------------------------
+ *
+ * copyops.h
+ * Routines for manipulation of remote COPY data
+ *
+ *
+ * Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/include/pgxc/copyops.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef COPYOPS_H
+#define COPYOPS_H
+
+#include "access/tupdesc.h"
+
+/* Type of data delimiter used for data redistribution using remote COPY */
+#define COPYOPS_DELIMITER '\t'
+
+extern char **CopyOps_RawDataToArrayField(TupleDesc tupdesc, char *message, int len);
+extern char *CopyOps_BuildOneRowTo(TupleDesc tupdesc, Datum *values, bool *nulls, int *len);
+
+#endif
diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h
index 32a88ecca4..5e26850d1c 100644
--- a/src/include/pgxc/execRemote.h
+++ b/src/include/pgxc/execRemote.h
@@ -48,6 +48,17 @@ typedef enum
REQUEST_TYPE_COPY_OUT /* Copy Out response */
} RequestType;
+/*
+ * Type of requests associated with a remote COPY OUT
+ */
+typedef enum
+{
+ REMOTE_COPY_NONE, /* Not defined yet */
+ REMOTE_COPY_STDOUT, /* Send back to client */
+ REMOTE_COPY_FILE, /* Write in file */
+ REMOTE_COPY_TUPLESTORE /* Store data in tuplestore */
+} RemoteCopyType;
+
/* Combines results of INSERT statements using multiple values */
typedef struct CombineTag
{
@@ -107,7 +118,8 @@ typedef struct RemoteQueryState
/* Simple DISTINCT support */
FmgrInfo *eqfunctions; /* functions to compare tuples */
MemoryContext tmp_ctx; /* separate context is needed to compare tuples */
- FILE *copy_file; /* used if copy_dest == COPY_FILE */
+ RemoteCopyType remoteCopyType; /* Type of remote COPY operation */
+ FILE *copy_file; /* used if remoteCopyType == REMOTE_COPY_FILE */
uint64 processed; /* count of data rows when running CopyOut */
/* cursor support */
char *cursor; /* cursor name */
@@ -136,7 +148,8 @@ extern void PGXCNodeCommitPrepared(char *gid);
/* Copy command just involves Datanodes */
extern PGXCNodeHandle** DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot);
extern int DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections);
-extern uint64 DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, FILE* copy_file);
+extern uint64 DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, TupleDesc tupleDesc,
+ FILE* copy_file, Tuplestorestate *store, RemoteCopyType remoteCopyType);
extern void DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_dn_index, CombineType combine_type);
extern bool DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error);
extern int DataNodeCopyInBinaryForAll(char *msg_buf, int len, PGXCNodeHandle** copy_connections);
diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h
index bd719911ea..78ce3cff00 100644
--- a/src/include/pgxc/locator.h
+++ b/src/include/pgxc/locator.h
@@ -99,6 +99,7 @@ extern RelationLocInfo *GetRelationLocInfo(Oid relid);
extern RelationLocInfo *CopyRelationLocInfo(RelationLocInfo *src_info);
extern char GetRelationLocType(Oid relid);
extern bool IsTableDistOnPrimary(RelationLocInfo *rel_loc_info);
+extern bool IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info2);
extern ExecNodes *GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol,
bool isValueNull, Oid typeOfValueForDistCol,
RelationAccessType accessType);
diff --git a/src/include/pgxc/redistrib.h b/src/include/pgxc/redistrib.h
new file mode 100644
index 0000000000..ee94523dbb
--- /dev/null
+++ b/src/include/pgxc/redistrib.h
@@ -0,0 +1,80 @@
+/*-------------------------------------------------------------------------
+ *
+ * redistrib.h
+ * Routines related to online data redistribution
+ *
+ * Copyright (c) 2010-2012 Postgres-XC Development Group
+ *
+ *
+ * IDENTIFICATION
+ * src/include/pgxc/redistrib.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef REDISTRIB_H
+#define REDISTRIB_H
+
+#include "nodes/parsenodes.h"
+#include "utils/tuplestore.h"
+
+/*
+ * Type of data redistribution operations.
+ * Online data redistribution is made of one or more of those operations.
+ */
+typedef enum RedistribOperation {
+ DISTRIB_NONE, /* Default operation */
+ DISTRIB_DELETE_HASH, /* Perform a DELETE with hash value check */
+ DISTRIB_DELETE_MODULO, /* Perform a DELETE with modulo value check */
+ DISTRIB_COPY_TO, /* Perform a COPY TO */
+ DISTRIB_COPY_FROM, /* Perform a COPY FROM */
+ DISTRIB_TRUNCATE, /* Truncate relation */
+ DISTRIB_REINDEX /* Reindex relation */
+} RedistribOperation;
+
+/*
+ * Determines whether an operation can be done before or after
+ * the catalog update on the local node.
+ */
+typedef enum RedistribCatalog {
+ CATALOG_UPDATE_NONE, /* Default state */
+ CATALOG_UPDATE_AFTER, /* After catalog update */
+ CATALOG_UPDATE_BEFORE, /* Before catalog update */
+ CATALOG_UPDATE_BOTH /* Before and after catalog update */
+} RedistribCatalog;
+
+/*
+ * Redistribution command
+ * This contains the tools necessary to perform a redistribution operation.
+ */
+typedef struct RedistribCommand {
+ RedistribOperation type; /* Operation type */
+ ExecNodes *execNodes; /* List of nodes where to perform operation */
+ RedistribCatalog updateState; /* Flag to determine if operation can be done
+ * before or after catalog update */
+} RedistribCommand;
+
+/*
+ * Redistribution operation state
+ * Maintains the redistribution state, holding the list of commands
+ * to be performed during redistribution.
+ * The commands are kept in an ordered list rather than an unordered set
+ * because operations may need to be executed in a specific order.
+ */
+typedef struct RedistribState {
+ Oid relid; /* Oid of relation redistributed */
+ List *commands; /* List of commands */
+ Tuplestorestate *store; /* Tuple store used for temporary data storage */
+} RedistribState;
+
+extern void PGXCRedistribTable(RedistribState *distribState, RedistribCatalog type);
+extern void PGXCRedistribCreateCommandList(RedistribState *distribState,
+ RelationLocInfo *newLocInfo);
+extern RedistribCommand *makeRedistribCommand(RedistribOperation type,
+ RedistribCatalog updateState,
+ ExecNodes *nodes);
+extern RedistribState *makeRedistribState(Oid relOid);
+extern void FreeRedistribState(RedistribState *state);
+extern void FreeRedistribCommand(RedistribCommand *command);
+
+#endif /* REDISTRIB_H */
diff --git a/src/include/pgxc/remotecopy.h b/src/include/pgxc/remotecopy.h
index 77134e71f9..93368c0ada 100644
--- a/src/include/pgxc/remotecopy.h
+++ b/src/include/pgxc/remotecopy.h
@@ -70,6 +70,7 @@ extern void RemoteCopy_BuildStatement(RemoteCopyData *state,
extern void RemoteCopy_GetRelationLoc(RemoteCopyData *state,
Relation rel,
List *attnums);
+extern RemoteCopyOptions *makeRemoteCopyOptions(void);
extern void FreeRemoteCopyData(RemoteCopyData *state);
extern void FreeRemoteCopyOptions(RemoteCopyOptions *options);
#endif
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index fde1467185..4eaabe6592 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -365,6 +365,14 @@ typedef struct StdRdOptions
#define RelationUsesTempNamespace(relation) \
((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
+#ifdef PGXC
+/*
+ * RelationGetLocInfo
+ * Return the location info of relation
+ */
+#define RelationGetLocInfo(relation) ((relation)->rd_locator_info)
+#endif
+
/*
* RELATION_IS_LOCAL
* If a rel is either temp or newly created in the current transaction,
diff --git a/src/test/regress/expected/xc_alter_table.out b/src/test/regress/expected/xc_alter_table.out
index a798e2f8a8..ca50a710bc 100644
--- a/src/test/regress/expected/xc_alter_table.out
+++ b/src/test/regress/expected/xc_alter_table.out
@@ -211,3 +211,411 @@ SELECT a, a2, b, c FROM xc_alter_table_2 ORDER BY b;
(5 rows)
DROP TABLE xc_alter_table_2;
+-- Tests for ALTER TABLE redistribution
+-- In the following test, a table is redistributed in all the ways possible
+-- and effects of redistribution is checked on all the dependent objects
+-- Table with integers
+CREATE TABLE xc_alter_table_3 (a int, b varchar(10)) DISTRIBUTE BY HASH(a);
+INSERT INTO xc_alter_table_3 VALUES (0, NULL);
+INSERT INTO xc_alter_table_3 VALUES (1, 'a');
+INSERT INTO xc_alter_table_3 VALUES (2, 'aa');
+INSERT INTO xc_alter_table_3 VALUES (3, 'aaa');
+INSERT INTO xc_alter_table_3 VALUES (4, 'aaaa');
+INSERT INTO xc_alter_table_3 VALUES (5, 'aaaaa');
+INSERT INTO xc_alter_table_3 VALUES (6, 'aaaaaa');
+INSERT INTO xc_alter_table_3 VALUES (7, 'aaaaaaa');
+INSERT INTO xc_alter_table_3 VALUES (8, 'aaaaaaaa');
+INSERT INTO xc_alter_table_3 VALUES (9, 'aaaaaaaaa');
+INSERT INTO xc_alter_table_3 VALUES (10, 'aaaaaaaaaa');
+-- Create some objects to check the effect of redistribution
+CREATE VIEW xc_alter_table_3_v AS SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3;
+CREATE RULE xc_alter_table_3_insert AS ON UPDATE TO xc_alter_table_3 WHERE OLD.a = 11 DO INSERT INTO xc_alter_table_3 VALUES (OLD.a + 1, 'nnn');
+PREPARE xc_alter_table_insert AS INSERT INTO xc_alter_table_3 VALUES ($1, $2);
+PREPARE xc_alter_table_delete AS DELETE FROM xc_alter_table_3 WHERE a = $1;
+PREPARE xc_alter_table_update AS UPDATE xc_alter_table_3 SET b = $2 WHERE a = $1;
+-- Now begin the tests
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+SELECT * FROM xc_alter_table_3_v;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+EXECUTE xc_alter_table_insert(11, 'b');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+ b
+---
+ b
+(1 row)
+
+EXECUTE xc_alter_table_update(11, 'bb');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+ b
+----
+ bb
+(1 row)
+
+EXECUTE xc_alter_table_delete(11);
+SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12;
+ b
+-----
+ nnn
+(1 row)
+
+EXECUTE xc_alter_table_delete(12);
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(b);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+SELECT * FROM xc_alter_table_3_v;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+EXECUTE xc_alter_table_insert(11, 'b');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+ b
+---
+ b
+(1 row)
+
+EXECUTE xc_alter_table_update(11, 'bb');
+ERROR: Partition column can't be updated in current version
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+ b
+---
+ b
+(1 row)
+
+EXECUTE xc_alter_table_delete(11);
+SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12;
+ b
+---
+(0 rows)
+
+EXECUTE xc_alter_table_delete(12);
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY ROUND ROBIN;
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+SELECT * FROM xc_alter_table_3_v;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+EXECUTE xc_alter_table_insert(11, 'b');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+ b
+---
+ b
+(1 row)
+
+EXECUTE xc_alter_table_update(11, 'bb');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+ b
+----
+ bb
+(1 row)
+
+EXECUTE xc_alter_table_delete(11);
+SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12;
+ b
+-----
+ nnn
+(1 row)
+
+EXECUTE xc_alter_table_delete(12);
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY MODULO(a);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+SELECT * FROM xc_alter_table_3_v;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+EXECUTE xc_alter_table_insert(11, 'b');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+ b
+---
+ b
+(1 row)
+
+EXECUTE xc_alter_table_update(11, 'bb');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+ b
+----
+ bb
+(1 row)
+
+EXECUTE xc_alter_table_delete(11);
+SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12;
+ b
+-----
+ nnn
+(1 row)
+
+EXECUTE xc_alter_table_delete(12);
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY MODULO(b);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+SELECT * FROM xc_alter_table_3_v;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+EXECUTE xc_alter_table_insert(11, 'b');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+ b
+---
+ b
+(1 row)
+
+EXECUTE xc_alter_table_update(11, 'bb');
+ERROR: Partition column can't be updated in current version
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+ b
+---
+ b
+(1 row)
+
+EXECUTE xc_alter_table_delete(11);
+SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12;
+ b
+---
+(0 rows)
+
+EXECUTE xc_alter_table_delete(12);
+-- Index and redistribution
+CREATE INDEX xc_alter_table_3_index ON xc_alter_table_3(a);
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+SELECT * FROM xc_alter_table_3_v;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+EXECUTE xc_alter_table_insert(11, 'b');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+ b
+---
+ b
+(1 row)
+
+EXECUTE xc_alter_table_update(11, 'bb');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+ b
+----
+ bb
+(1 row)
+
+EXECUTE xc_alter_table_delete(11);
+SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12;
+ b
+-----
+ nnn
+(1 row)
+
+EXECUTE xc_alter_table_delete(12);
+-- Add column on table
+ALTER TABLE xc_alter_table_3 ADD COLUMN c int DEFAULT 4;
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY REPLICATION;
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+SELECT * FROM xc_alter_table_3_v;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+-- Drop column on table
+ALTER TABLE xc_alter_table_3 DROP COLUMN b;
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+SELECT * FROM xc_alter_table_3_v;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+-- Remanipulate table once again and distribute on old column
+ALTER TABLE xc_alter_table_3 DROP COLUMN c;
+ALTER TABLE xc_alter_table_3 ADD COLUMN b varchar(3) default 'aaa';
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+SELECT * FROM xc_alter_table_3_v;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+-- Change the node list
+SELECT alter_table_change_nodes('xc_alter_table_3', '{1}', 'to', NULL);
+ alter_table_change_nodes
+--------------------------
+ t
+(1 row)
+
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+SELECT * FROM xc_alter_table_3_v;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+-- Add some nodes on it
+SELECT alter_table_change_nodes('xc_alter_table_3', '{2,4,5}', 'add', NULL);
+ alter_table_change_nodes
+--------------------------
+ t
+(1 row)
+
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check in tuple presence
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+SELECT * FROM xc_alter_table_3_v;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+-- Remove some nodes on it
+SELECT alter_table_change_nodes('xc_alter_table_3', '{3}', 'add', NULL);
+ alter_table_change_nodes
+--------------------------
+ t
+(1 row)
+
+SELECT alter_table_change_nodes('xc_alter_table_3', '{2,3,5}', 'delete', NULL);
+ alter_table_change_nodes
+--------------------------
+ t
+(1 row)
+
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+SELECT * FROM xc_alter_table_3_v;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+-- Multiple operations with replication
+SELECT alter_table_change_nodes('xc_alter_table_3', '{1,3,4,5}', 'to', 'replication');
+ alter_table_change_nodes
+--------------------------
+ t
+(1 row)
+
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+SELECT * FROM xc_alter_table_3_v;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+-- Manipulate number of nodes to include and remove nodes on a replicated table
+-- On removed nodes data is deleted and on new nodes data is added
+SELECT alter_table_change_nodes('xc_alter_table_3', '{2,3,5}', 'to', NULL);
+ alter_table_change_nodes
+--------------------------
+ t
+(1 row)
+
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+SELECT * FROM xc_alter_table_3_v;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+-- Re-do a double operation with hash this time
+SELECT alter_table_change_nodes('xc_alter_table_3', '{2}', 'delete', 'hash(a)');
+ alter_table_change_nodes
+--------------------------
+ t
+(1 row)
+
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+SELECT * FROM xc_alter_table_3_v;
+ count | sum | avg
+-------+-----+--------------------
+ 11 | 55 | 5.0000000000000000
+(1 row)
+
+-- Error checks
+ALTER TABLE xc_alter_table_3 ADD COLUMN b int, DISTRIBUTE BY HASH(a);
+ERROR: Incompatible operation with data redistribution
+-- Clean up
+DROP TABLE xc_alter_table_3 CASCADE;
+NOTICE: drop cascades to view xc_alter_table_3_v
diff --git a/src/test/regress/expected/xc_create_function.out b/src/test/regress/expected/xc_create_function.out
index 41bbcdd256..64d7198513 100644
--- a/src/test/regress/expected/xc_create_function.out
+++ b/src/test/regress/expected/xc_create_function.out
@@ -43,6 +43,90 @@ begin
execute cr_command;
end;
$$;
+-- Add/Delete/change node list of a table
+CREATE OR REPLACE FUNCTION alter_table_change_nodes(tab_schema varchar, nodenums int[], command varchar, distribution varchar)
+RETURNS BOOLEAN LANGUAGE plpgsql as $$
+declare
+ cr_command varchar;
+ nodes varchar[];
+ nodename varchar;
+ nodenames_query varchar;
+ nodenames varchar;
+ sep varchar;
+ nodenum_new int[];
+ nodenum_res int[];
+ tmp_node int;
+ num_nodes int;
+ node int;
+ check_num boolean;
+ enforce_to boolean;
+BEGIN
+ -- Check the command type, only delete/add/to are allowed
+ IF command != 'delete' AND command != 'add' AND command != 'to' THEN
+ RETURN FALSE;
+ END IF;
+ nodenames_query := 'SELECT node_name FROM pgxc_node WHERE node_type = ''D''';
+ FOR nodename IN EXECUTE nodenames_query LOOP
+ nodes := array_append(nodes, nodename);
+ END LOOP;
+ nodenames := '(';
+ sep := '';
+ num_nodes := array_length(nodes, 1);
+ enforce_to := FALSE;
+
+ -- Adjust node array according to total number of nodes
+ FOREACH node IN ARRAY nodenums LOOP
+ tmp_node := node;
+ IF (node < 1 OR node > num_nodes) THEN
+ -- Enforce the usage of TO here, only safe method
+ enforce_to := TRUE;
+ tmp_node := node % num_nodes;
+ nodenum_new := array_append(nodenum_new, tmp_node);
+ END IF;
+ nodenum_new := array_append(nodenum_new, tmp_node);
+ END LOOP;
+ -- Eliminate duplicates
+ nodenum_res := array_append(nodenum_res, nodenum_new[1]);
+ FOREACH node IN ARRAY nodenum_new LOOP
+ check_num := TRUE;
+ FOREACH tmp_node IN ARRAY nodenum_res LOOP
+ IF (tmp_node = node) THEN
+ check_num := FALSE;
+ END IF;
+ END LOOP;
+ -- Fill in result array only if not replicated
+ IF check_num THEN
+ nodenum_res := array_append(nodenum_res, node);
+ END IF;
+ END LOOP;
+
+ -- If there is a unique Datanode in cluster, enforce the use of 'TO NODE'
+ -- This will avoid any consistency problems
+ IF (num_nodes = 1 OR enforce_to) THEN
+ command := 'TO';
+ END IF;
+
+ -- Finally build query
+ cr_command := 'ALTER TABLE ' || tab_schema || ' ' || command || ' NODE ';
+ FOREACH node IN ARRAY nodenum_res LOOP
+ IF (node > 0 AND node <= num_nodes) THEN
+ nodenames := nodenames || sep || nodes[node];
+ sep := ', ';
+ END IF;
+ END LOOP;
+ nodenames := nodenames || ')';
+ cr_command := cr_command || nodenames;
+
+ -- Add distribution if necessary
+ IF (distribution IS NOT NULL) then
+ cr_command := cr_command || ', DISTRIBUTE BY ' || distribution;
+ END IF;
+
+ -- Launch it
+ EXECUTE cr_command;
+ RETURN TRUE;
+END;
+$$;
-- A function to return data node name given a node number
CREATE OR REPLACE FUNCTION get_xc_node_name(node_num int) RETURNS varchar LANGUAGE plpgsql AS $$
DECLARE
diff --git a/src/test/regress/sql/xc_alter_table.sql b/src/test/regress/sql/xc_alter_table.sql
index bfa76fc848..5f78deba77 100644
--- a/src/test/regress/sql/xc_alter_table.sql
+++ b/src/test/regress/sql/xc_alter_table.sql
@@ -57,3 +57,136 @@ EXPLAIN (VERBOSE true, COSTS false, NODES false) UPDATE xc_alter_table_2 SET a =
UPDATE xc_alter_table_2 SET a = 200, a2 = 'CTO' WHERE b = 'John';
SELECT a, a2, b, c FROM xc_alter_table_2 ORDER BY b;
DROP TABLE xc_alter_table_2;
+
+-- Tests for ALTER TABLE redistribution
+-- In the following test, a table is redistributed in all the ways possible
+-- and effects of redistribution is checked on all the dependent objects
+-- Table with integers
+CREATE TABLE xc_alter_table_3 (a int, b varchar(10)) DISTRIBUTE BY HASH(a);
+INSERT INTO xc_alter_table_3 VALUES (0, NULL);
+INSERT INTO xc_alter_table_3 VALUES (1, 'a');
+INSERT INTO xc_alter_table_3 VALUES (2, 'aa');
+INSERT INTO xc_alter_table_3 VALUES (3, 'aaa');
+INSERT INTO xc_alter_table_3 VALUES (4, 'aaaa');
+INSERT INTO xc_alter_table_3 VALUES (5, 'aaaaa');
+INSERT INTO xc_alter_table_3 VALUES (6, 'aaaaaa');
+INSERT INTO xc_alter_table_3 VALUES (7, 'aaaaaaa');
+INSERT INTO xc_alter_table_3 VALUES (8, 'aaaaaaaa');
+INSERT INTO xc_alter_table_3 VALUES (9, 'aaaaaaaaa');
+INSERT INTO xc_alter_table_3 VALUES (10, 'aaaaaaaaaa');
+-- Create some objects to check the effect of redistribution
+CREATE VIEW xc_alter_table_3_v AS SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3;
+CREATE RULE xc_alter_table_3_insert AS ON UPDATE TO xc_alter_table_3 WHERE OLD.a = 11 DO INSERT INTO xc_alter_table_3 VALUES (OLD.a + 1, 'nnn');
+PREPARE xc_alter_table_insert AS INSERT INTO xc_alter_table_3 VALUES ($1, $2);
+PREPARE xc_alter_table_delete AS DELETE FROM xc_alter_table_3 WHERE a = $1;
+PREPARE xc_alter_table_update AS UPDATE xc_alter_table_3 SET b = $2 WHERE a = $1;
+
+-- Now begin the tests
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+SELECT * FROM xc_alter_table_3_v;
+EXECUTE xc_alter_table_insert(11, 'b');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+EXECUTE xc_alter_table_update(11, 'bb');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+EXECUTE xc_alter_table_delete(11);
+SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12;
+EXECUTE xc_alter_table_delete(12);
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(b);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+SELECT * FROM xc_alter_table_3_v;
+EXECUTE xc_alter_table_insert(11, 'b');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+EXECUTE xc_alter_table_update(11, 'bb');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+EXECUTE xc_alter_table_delete(11);
+SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12;
+EXECUTE xc_alter_table_delete(12);
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY ROUND ROBIN;
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+SELECT * FROM xc_alter_table_3_v;
+EXECUTE xc_alter_table_insert(11, 'b');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+EXECUTE xc_alter_table_update(11, 'bb');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+EXECUTE xc_alter_table_delete(11);
+SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12;
+EXECUTE xc_alter_table_delete(12);
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY MODULO(a);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+SELECT * FROM xc_alter_table_3_v;
+EXECUTE xc_alter_table_insert(11, 'b');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+EXECUTE xc_alter_table_update(11, 'bb');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+EXECUTE xc_alter_table_delete(11);
+SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12;
+EXECUTE xc_alter_table_delete(12);
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY MODULO(b);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+SELECT * FROM xc_alter_table_3_v;
+EXECUTE xc_alter_table_insert(11, 'b');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+EXECUTE xc_alter_table_update(11, 'bb');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+EXECUTE xc_alter_table_delete(11);
+SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12;
+EXECUTE xc_alter_table_delete(12);
+-- Index and redistribution
+CREATE INDEX xc_alter_table_3_index ON xc_alter_table_3(a);
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+SELECT * FROM xc_alter_table_3_v;
+EXECUTE xc_alter_table_insert(11, 'b');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+EXECUTE xc_alter_table_update(11, 'bb');
+SELECT b FROM xc_alter_table_3 WHERE a = 11;
+EXECUTE xc_alter_table_delete(11);
+SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12;
+EXECUTE xc_alter_table_delete(12);
+-- Add column on table
+ALTER TABLE xc_alter_table_3 ADD COLUMN c int DEFAULT 4;
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY REPLICATION;
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3;
+SELECT * FROM xc_alter_table_3_v;
+-- Drop column on table
+ALTER TABLE xc_alter_table_3 DROP COLUMN b;
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3;
+SELECT * FROM xc_alter_table_3_v;
+-- Remanipulate table once again and distribute on old column
+ALTER TABLE xc_alter_table_3 DROP COLUMN c;
+ALTER TABLE xc_alter_table_3 ADD COLUMN b varchar(3) default 'aaa';
+ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+SELECT * FROM xc_alter_table_3_v;
+-- Change the node list
+SELECT alter_table_change_nodes('xc_alter_table_3', '{1}', 'to', NULL);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+SELECT * FROM xc_alter_table_3_v;
+-- Add some nodes on it
+SELECT alter_table_change_nodes('xc_alter_table_3', '{2,4,5}', 'add', NULL);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check in tuple presence
+SELECT * FROM xc_alter_table_3_v;
+-- Remove some nodes on it
+SELECT alter_table_change_nodes('xc_alter_table_3', '{3}', 'add', NULL);
+SELECT alter_table_change_nodes('xc_alter_table_3', '{2,3,5}', 'delete', NULL);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+SELECT * FROM xc_alter_table_3_v;
+-- Multiple operations with replication
+SELECT alter_table_change_nodes('xc_alter_table_3', '{1,3,4,5}', 'to', 'replication');
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+SELECT * FROM xc_alter_table_3_v;
+-- Manipulate number of nodes to include and remove nodes on a replicated table
+-- On removed nodes data is deleted and on new nodes data is added
+SELECT alter_table_change_nodes('xc_alter_table_3', '{2,3,5}', 'to', NULL);
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+SELECT * FROM xc_alter_table_3_v;
+-- Re-do a double operation with hash this time
+SELECT alter_table_change_nodes('xc_alter_table_3', '{2}', 'delete', 'hash(a)');
+SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence
+SELECT * FROM xc_alter_table_3_v;
+-- Error checks
+ALTER TABLE xc_alter_table_3 ADD COLUMN b int, DISTRIBUTE BY HASH(a);
+-- Clean up
+DROP TABLE xc_alter_table_3 CASCADE;
diff --git a/src/test/regress/sql/xc_create_function.sql b/src/test/regress/sql/xc_create_function.sql
index 1c8e2350eb..bd7ad3c8b8 100644
--- a/src/test/regress/sql/xc_create_function.sql
+++ b/src/test/regress/sql/xc_create_function.sql
@@ -45,6 +45,91 @@ begin
end;
$$;
+-- Add/Delete/change node list of a table
+-- Wrapper around "ALTER TABLE ... { ADD | DELETE | TO } NODE (...)" that maps
+-- logical node numbers (1..N) to the actual Datanode names found in pgxc_node,
+-- keeping the regression test independent of the cluster configuration.
+--   tab_schema   - target table name, interpolated as-is into the query
+--                  (test helper only: no quote_ident, do not feed untrusted input)
+--   nodenums     - logical node numbers; out-of-range values are wrapped with
+--                  modulo and force the safe 'to' command
+--   command      - one of 'delete', 'add', 'to'
+--   distribution - optional DISTRIBUTE BY clause body (e.g. 'hash(a)'), or NULL
+-- Returns TRUE once the ALTER TABLE has been issued, FALSE for a bad command.
+CREATE OR REPLACE FUNCTION alter_table_change_nodes(tab_schema varchar, nodenums int[], command varchar, distribution varchar)
+RETURNS BOOLEAN LANGUAGE plpgsql as $$
+declare
+	cr_command	varchar;
+	nodes		varchar[];
+	nodename	varchar;
+	nodenames_query	varchar;
+	nodenames	varchar;
+	sep		varchar;
+	nodenum_new	int[];
+	nodenum_res	int[];
+	tmp_node	int;
+	num_nodes	int;
+	node		int;
+	check_num	boolean;
+	enforce_to	boolean;
+BEGIN
+	-- Check the command type, only delete/add/to are allowed
+	IF command != 'delete' AND command != 'add' AND command != 'to' THEN
+		RETURN FALSE;
+	END IF;
+	-- Collect the names of all Datanodes ('D') present in the cluster
+	nodenames_query := 'SELECT node_name FROM pgxc_node WHERE node_type = ''D''';
+	FOR nodename IN EXECUTE nodenames_query LOOP
+		nodes := array_append(nodes, nodename);
+	END LOOP;
+	nodenames := '(';
+	sep := '';
+	num_nodes := array_length(nodes, 1);
+	enforce_to := FALSE;
+
+	-- Adjust node array according to total number of nodes
+	FOREACH node IN ARRAY nodenums LOOP
+		tmp_node := node;
+		IF (node < 1 OR node > num_nodes) THEN
+			-- Enforce the usage of TO here, only safe method
+			enforce_to := TRUE;
+			-- NOTE(review): node % num_nodes can be 0; such entries are
+			-- silently discarded when the name list is built below
+			tmp_node := node % num_nodes;
+		END IF;
+		nodenum_new := array_append(nodenum_new, tmp_node);
+	END LOOP;
+	-- Eliminate duplicates
+	nodenum_res := array_append(nodenum_res, nodenum_new[1]);
+	FOREACH node IN ARRAY nodenum_new LOOP
+		check_num := TRUE;
+		FOREACH tmp_node IN ARRAY nodenum_res LOOP
+			IF (tmp_node = node) THEN
+				check_num := FALSE;
+			END IF;
+		END LOOP;
+		-- Fill in result array only if not already present
+		IF check_num THEN
+			nodenum_res := array_append(nodenum_res, node);
+		END IF;
+	END LOOP;
+
+	-- If there is a unique Datanode in cluster, enforce the use of 'TO NODE'
+	-- This will avoid any consistency problems
+	IF (num_nodes = 1 OR enforce_to) THEN
+		command := 'TO';
+	END IF;
+
+	-- Finally build query
+	cr_command := 'ALTER TABLE ' || tab_schema || ' ' || command || ' NODE ';
+	FOREACH node IN ARRAY nodenum_res LOOP
+		IF (node > 0 AND node <= num_nodes) THEN
+			nodenames := nodenames || sep || nodes[node];
+			sep := ', ';
+		END IF;
+	END LOOP;
+	nodenames := nodenames || ')';
+	cr_command := cr_command || nodenames;
+
+	-- Add distribution if necessary
+	IF (distribution IS NOT NULL) then
+		cr_command := cr_command || ', DISTRIBUTE BY ' || distribution;
+	END IF;
+
+	-- Launch it
+	EXECUTE cr_command;
+	RETURN TRUE;
+END;
+$$;
+
-- A function to return data node name given a node number
CREATE OR REPLACE FUNCTION get_xc_node_name(node_num int) RETURNS varchar LANGUAGE plpgsql AS $$
DECLARE