diff options
29 files changed, 3416 insertions, 58 deletions
diff --git a/doc-xc/src/sgml/ref/alter_table.sgmlin b/doc-xc/src/sgml/ref/alter_table.sgmlin index 3a1f095e15..9116c8313e 100644 --- a/doc-xc/src/sgml/ref/alter_table.sgmlin +++ b/doc-xc/src/sgml/ref/alter_table.sgmlin @@ -67,6 +67,10 @@ ALTER TABLE <replaceable class="PARAMETER">name</replaceable> NOT OF OWNER TO <replaceable class="PARAMETER">new_owner</replaceable> SET TABLESPACE <replaceable class="PARAMETER">new_tablespace</replaceable> + DISTRIBUTE BY { REPLICATION | ROUND ROBIN | { [HASH | MODULO ] ( <replaceable class="PARAMETER">column_name</replaceable> ) } } + TO { GROUP <replaceable class="PARAMETER">groupname</replaceable> | NODE ( <replaceable class="PARAMETER">nodename</replaceable> [, ... ] ) } + ADD NODE ( <replaceable class="PARAMETER">nodename</replaceable> [, ... ] ) + DELETE NODE ( <replaceable class="PARAMETER">nodename</replaceable> [, ... ] ) <phrase>and <replaceable class="PARAMETER">table_constraint_using_index</replaceable> is:</phrase> @@ -573,6 +577,111 @@ ALTER TABLE <replaceable class="PARAMETER">name</replaceable> </listitem> </varlistentry> +<!## XC> + <varlistentry> + <term><literal>DISTRIBUTE BY</literal></term> + <listitem> +&xconly; + <para> + This clause specifies how the table is distributed or replicated among Datanodes. + </para> + + <variablelist> + + <varlistentry> + <term><literal>REPLICATION</literal></term> + <listitem> + <para> + Each row of the table will be replicated into all the + Datanode of the <productname>Postgres-XC</> database + cluster. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>ROUND ROBIN</literal></term> + <listitem> + <para> + Each row of the table will be placed in one of the Datanodes + by round-robin manner. The value of the row will not be + needed to determine what Datanode to go. 
+ </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>HASH ( <replaceable class="PARAMETER">column_name</> )</literal></term> + <listitem> + <para> + Each row of the table will be placed based on the hash value + of the specified column. Following type is allowed as + distribution column: INT8, INT2, OID, INT4, BOOL, INT2VECTOR, + OIDVECTOR, CHAR, NAME, TEXT, BPCHAR, BYTEA, VARCHAR, FLOAT4, + FLOAT8, NUMERIC, CASH, ABSTIME, RELTIME, DATE, TIME, + TIMESTAMP, TIMESTAMPTZ, INTERVAL, and TIMETZ. + </para> + <para> + Please note that floating point is not allowed as a basis of + the distribution column. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>MODULO ( <replaceable class="PARAMETER">column_name</> )</literal></term> + <listitem> + <para> + Each row of the table will be placed based on the modulo + of the specified column. Following type is allowed as + distribution column: INT8, INT2, OID, INT4, BOOL, INT2VECTOR, + OIDVECTOR, CHAR, NAME, TEXT, BPCHAR, BYTEA, VARCHAR, FLOAT4, + FLOAT8, NUMERIC, CASH, ABSTIME, RELTIME, DATE, TIME, + TIMESTAMP, TIMESTAMPTZ, INTERVAL, and TIMETZ. + </para> + <para> + Please note that floating point is not allowed as a basis of + the distribution column. + </para> + </listitem> + </varlistentry> + </variablelist> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>TO GROUP</literal></term> + <term><literal>TO NODE</literal></term> + <listitem> + <para> + This defines the list of nodes on which table data exists. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>ADD NODE</literal></term> + <listitem> + <para> + This adds a list of nodes where data of table is distributed + to the existing list. If the list of nodes added contains nodes + already used by table, an error is returned. 
+ </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>DELETE NODE</literal></term> + <listitem> + <para> + This deletes a list of nodes where data of table is distributed + to the existing list. If the list of nodes deleted contains nodes + not used by table, an error is returned. + </para> + </listitem> + </varlistentry> +<!## end> + </variablelist> </para> @@ -789,7 +898,26 @@ ALTER TABLE <replaceable class="PARAMETER">name</replaceable> </listitem> </varlistentry> +<!## XC> + <varlistentry> + <term><replaceable class="PARAMETER">nodename</replaceable></term> + <listitem> + <para> + It defines a <productname>Postgres-XC</productname> node of catalog pgxc_node. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="PARAMETER">groupname</replaceable></term> + <listitem> + <para> + It defines a <productname>Postgres-XC</productname> node group in catalog pgxc_group. + </para> + </listitem> + </varlistentry> </variablelist> +<!## end> </refsect1> <refsect1> @@ -904,10 +1032,74 @@ ALTER TABLE <replaceable class="PARAMETER">name</replaceable> <!## XC> &xconly; <para> - Please note that except for the column name, you cannot alter - attribute of table distribution as specified - with <literal>DISTRIBUTE BY</> clause in <literal>CREATE TABLE</> - statement. + <command>ALTER TABLE</> with clauses <literal>DISTRIBUTE BY</>, <literal>ADD NODE</>, + <literal>DELETE NODE</>, <literal>TO NODE</> or <literal>TO GROUP</> is used for data + redistribution among nodes specific to <productname>Postgres-XC</>. Those clauses cannot be + used with other commands. + </para> + + <para> + Multiple redistribution scenarios are possible depending on modifications done: + <variablelist> + <varlistentry> + <term>Default redistribution:</term> + <listitem> + <para> + This is the slowest scenario possible. It is done in 3 or 4 steps. Data is firstly + saved on Coordinator by fetching all the data with <command>COPY TO</> command. 
At + this point all the tuples are saved using tuple store. The amount of cache allowed for + tuple store operation can be controlled with <varname>work_mem</>. Then the table is + truncated on all the nodes. Then catalogs are updated. Finally data inside tuple store + is redistributed using an internal <command>COPY FROM</> mechanism. <command>REINDEX</> + is issued if necessary. The overall performance of this scenario is close to the + time necessary to run consecutively <command>COPY TO</> and <command>COPY FROM</>. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term>Redistribution from replicated to replicated table:</term> + <listitem> + <para> + The node list of a table can have new nodes as well as removed nodes. + If nodes are only removed, <command>TRUNCATE</> is launched to remote nodes that are + removed. If new nodes are added, then table data is fetch on Coordinator with <command> + COPY TO</> and stored inside a tuplestore controlled with <varname>work_mem</>, then + data stored is only sent to the new nodes using <command>COPY FROM</> with data stored + inside the tuplestore. <command>REINDEX</> is issued if necessary. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term>Redistribution from replicated to distributed table:</term> + <listitem> + <para> + If the relation node list contains new nodes, the default redistribution + mechanism is used. However, if the node list of relation after redistribution is + included in node list of relation after redistribution, as all the tuples are already + located on remote nodes, it is not necessary to fetch any data on Coordinator. Hence, + <command>DELETE</> is used to remove on remote nodes only the necessary tuples. 
This + query uses selects tuples to remove with conditions based on the number of nodes in node + list of relation after redistribution, the <literal>HASH</> or <literal>MODULO</> value + used for new distribution and the remote node itself where <command>DELETE</> is launched.. + <command>REINDEX</> is issued if necessary. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term>Redistribution from distributed to replicated table:</term> + <listitem> + <para> + In this case the default redistribution mechanism is used. + </para> + </listitem> + </varlistentry> + </variablelist> + <para> + + <para> </para> <!## end> </refsect1> @@ -1055,6 +1247,30 @@ ALTER TABLE distributors DROP CONSTRAINT distributors_pkey, </programlisting> </para> +<!## XC> + <para> + To change the distribution type and the list of nodes where table data + is located: +<programlisting> +ALTER TABLE distributors TO NODE (dn1, dn7), DISTRIBUTE BY HASH(dist_id); +</programlisting> + </para> + + <para> + To add a node where data of table is distributed: +<programlisting> +ALTER TABLE distributors ADD NODE (dn9, dn14); +</programlisting> + </para> + + <para> + To remove a node where data of table is distributed: +<programlisting> +ALTER TABLE distributors DELETE NODE (dn4, dn0); +</programlisting> + </para> +<!## end> + </refsect1> <refsect1> diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c index 9cb17eb4f7..f4a14e3229 100644 --- a/src/backend/access/hash/hashfunc.c +++ b/src/backend/access/hash/hashfunc.c @@ -531,8 +531,8 @@ hash_uint32(uint32 k) #ifdef PGXC /* - * compute_hash() -- Generaic hash function for all datatypes - * + * compute_hash() + * Generic hash function for all datatypes */ Datum compute_hash(Oid type, Datum value, char locator) @@ -637,4 +637,81 @@ compute_hash(Oid type, Datum value, char locator) return (Datum)0; } + +/* + * get_compute_hash_function + * Get hash function name depending on the hash type. 
+ * For some cases of hash or modulo distribution, a function might + * be required or not. + */ +char * +get_compute_hash_function(Oid type, char locator) +{ + switch (type) + { + case INT8OID: + if (locator == LOCATOR_TYPE_HASH) + return "hashint8"; + return NULL; + case INT2OID: + if (locator == LOCATOR_TYPE_HASH) + return "hashint2"; + return NULL; + case OIDOID: + if (locator == LOCATOR_TYPE_HASH) + return "hashoid"; + return NULL; + case DATEOID: + case INT4OID: + if (locator == LOCATOR_TYPE_HASH) + return "hashint4"; + return NULL; + case BOOLOID: + if (locator == LOCATOR_TYPE_HASH) + return "hashchar"; + return NULL; + case CHAROID: + return "hashchar"; + case NAMEOID: + return "hashname"; + case INT2VECTOROID: + return "hashint2vector"; + case VARCHAROID: + case TEXTOID: + return "hashtext"; + case OIDVECTOROID: + return "hashoidvector"; + case FLOAT4OID: + return "hashfloat4"; + case FLOAT8OID: + return "hashfloat8"; + case RELTIMEOID: + case ABSTIMEOID: + if (locator == LOCATOR_TYPE_HASH) + return "hashint4"; + return NULL; + case CASHOID: + return "hashint8"; + case BPCHAROID: + return "hashbpchar"; + case BYTEAOID: + return "hashvarlena"; + case TIMEOID: + return "time_hash"; + case TIMESTAMPOID: + case TIMESTAMPTZOID: + return "timestamp_hash"; + case INTERVALOID: + return "interval_hash"; + case TIMETZOID: + return "timetz_hash"; + case NUMERICOID: + return "hash_numeric"; + default: + ereport(ERROR,(errmsg("Unhandled datatype for modulo or hash distribution\n"))); + } + + /* Keep compiler quiet */ + return NULL; +} #endif diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 18248f4193..f797a0b75f 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -937,13 +937,13 @@ cmp_nodes(const void *p1, const void *p2) } /* -------------------------------- - * AddRelationDistribution + * AddRelationDistribution * * Add to pgxc_class table * -------------------------------- */ -void -AddRelationDistribution(Oid 
relid, +void +AddRelationDistribution(Oid relid, DistributeBy *distributeby, PGXCSubCluster *subcluster, List *parentOids, @@ -1007,7 +1007,7 @@ GetRelationDistributionItems(Oid relid, if (!distributeby) { - /* + /* * If no distribution was specified, and we have not chosen * one based on primary key or foreign key, use first column with * a supported data type. @@ -1032,9 +1032,9 @@ GetRelationDistributionItems(Oid relid, if (local_attnum == 0) local_locatortype = LOCATOR_TYPE_RROBIN; } - else + else { - /* + /* * User specified distribution type */ switch (distributeby->disttype) @@ -1051,12 +1051,12 @@ GetRelationDistributionItems(Oid relid, (errcode(ERRCODE_INVALID_TABLE_DEFINITION), errmsg("Invalid distribution column specified"))); } - + if (!IsTypeHashDistributable(descriptor->attrs[local_attnum - 1]->atttypid)) { ereport(ERROR, (errcode(ERRCODE_WRONG_OBJECT_TYPE), - errmsg("Column %s is not a hash distributable data type", + errmsg("Column %s is not a hash distributable data type", distributeby->colname))); } local_locatortype = LOCATOR_TYPE_HASH; @@ -1108,10 +1108,14 @@ GetRelationDistributionItems(Oid relid, } /* Save results */ - *attnum = local_attnum; - *hashalgorithm = local_hashalgorithm; - *hashbuckets = local_hashbuckets; - *locatortype = local_locatortype; + if (attnum) + *attnum = local_attnum; + if (hashalgorithm) + *hashalgorithm = local_hashalgorithm; + if (hashbuckets) + *hashbuckets = local_hashbuckets; + if (locatortype) + *locatortype = local_locatortype; } diff --git a/src/backend/catalog/pgxc_class.c b/src/backend/catalog/pgxc_class.c index 6d1cf0ed2a..1543a45342 100644 --- a/src/backend/catalog/pgxc_class.c +++ b/src/backend/catalog/pgxc_class.c @@ -23,9 +23,13 @@ #include "pgxc/locator.h" #include "utils/array.h" +/* + * PgxcClassCreate + * Create a pgxc_class entry + */ void PgxcClassCreate(Oid pcrelid, - char pclocatortype, + char pclocatortype, int pcattnum, int pchashalgorithm, int pchashbuckets, @@ -42,7 +46,7 @@ 
PgxcClassCreate(Oid pcrelid, /* Build array of Oids to be inserted */ nodes_array = buildoidvector(nodes, numnodes); - /* Iterate through edb_linkauth attributes initializing nulls and values */ + /* Iterate through attributes initializing nulls and values */ for (i = 0; i < Natts_pgxc_class; i++) { nulls[i] = false; @@ -81,6 +85,102 @@ PgxcClassCreate(Oid pcrelid, heap_close(pgxcclassrel, RowExclusiveLock); } + +/* + * PgxcClassAlter + * Modify a pgxc_class entry with given data + */ +void +PgxcClassAlter(Oid pcrelid, + char pclocatortype, + int pcattnum, + int pchashalgorithm, + int pchashbuckets, + int numnodes, + Oid *nodes, + PgxcClassAlterType type) +{ + Relation rel; + HeapTuple oldtup, newtup; + oidvector *nodes_array; + Datum new_record[Natts_pgxc_class]; + bool new_record_nulls[Natts_pgxc_class]; + bool new_record_repl[Natts_pgxc_class]; + + Assert(OidIsValid(pcrelid)); + + rel = heap_open(PgxcClassRelationId, RowExclusiveLock); + oldtup = SearchSysCacheCopy1(PGXCCLASSRELID, + ObjectIdGetDatum(pcrelid)); + + if (!HeapTupleIsValid(oldtup)) /* should not happen */ + elog(ERROR, "cache lookup failed for pgxc_class %u", pcrelid); + + /* Build array of Oids to be inserted */ + nodes_array = buildoidvector(nodes, numnodes); + + /* Initialize fields */ + MemSet(new_record, 0, sizeof(new_record)); + MemSet(new_record_nulls, false, sizeof(new_record_nulls)); + MemSet(new_record_repl, false, sizeof(new_record_repl)); + + /* Fields are updated depending on operation type */ + switch (type) + { + case PGXC_CLASS_ALTER_DISTRIBUTION: + new_record_repl[Anum_pgxc_class_pclocatortype - 1] = true; + new_record_repl[Anum_pgxc_class_pcattnum - 1] = true; + new_record_repl[Anum_pgxc_class_pchashalgorithm - 1] = true; + new_record_repl[Anum_pgxc_class_pchashbuckets - 1] = true; + break; + case PGXC_CLASS_ALTER_NODES: + new_record_repl[Anum_pgxc_class_nodes - 1] = true; + break; + case PGXC_CLASS_ALTER_ALL: + default: + new_record_repl[Anum_pgxc_class_pcrelid - 1] = true; + 
new_record_repl[Anum_pgxc_class_pclocatortype - 1] = true; + new_record_repl[Anum_pgxc_class_pcattnum - 1] = true; + new_record_repl[Anum_pgxc_class_pchashalgorithm - 1] = true; + new_record_repl[Anum_pgxc_class_pchashbuckets - 1] = true; + new_record_repl[Anum_pgxc_class_nodes - 1] = true; + } + + /* Set up new fields */ + /* Relation Oid */ + if (new_record_repl[Anum_pgxc_class_pcrelid - 1]) + new_record[Anum_pgxc_class_pcrelid - 1] = ObjectIdGetDatum(pcrelid); + + /* Locator type */ + if (new_record_repl[Anum_pgxc_class_pclocatortype - 1]) + new_record[Anum_pgxc_class_pclocatortype - 1] = CharGetDatum(pclocatortype); + + /* Attribute number of distribution column */ + if (new_record_repl[Anum_pgxc_class_pcattnum - 1]) + new_record[Anum_pgxc_class_pcattnum - 1] = UInt16GetDatum(pcattnum); + + /* Hash algorithm type */ + if (new_record_repl[Anum_pgxc_class_pchashalgorithm - 1]) + new_record[Anum_pgxc_class_pchashalgorithm - 1] = UInt16GetDatum(pchashalgorithm); + + /* Hash buckets */ + if (new_record_repl[Anum_pgxc_class_pchashbuckets - 1]) + new_record[Anum_pgxc_class_pchashbuckets - 1] = UInt16GetDatum(pchashbuckets); + + /* Node information */ + if (new_record_repl[Anum_pgxc_class_nodes - 1]) + new_record[Anum_pgxc_class_nodes - 1] = PointerGetDatum(nodes_array); + + /* Update relation */ + newtup = heap_modify_tuple(oldtup, RelationGetDescr(rel), + new_record, + new_record_nulls, new_record_repl); + simple_heap_update(rel, &oldtup->t_self, newtup); + CatalogUpdateIndexes(rel, newtup); + + heap_close(rel, RowExclusiveLock); +} + /* * RemovePGXCClass(): * Remove extended PGXC information @@ -108,5 +208,3 @@ RemovePgxcClass(Oid pcrelid) heap_close(relation, RowExclusiveLock); } - - diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 074bf09b39..41e77bc39c 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -1700,15 +1700,26 @@ CopyTo(CopyState cstate) cstate->remoteCopyState->rel_loc) { RemoteCopyData 
*remoteCopyState = cstate->remoteCopyState; + RemoteCopyType remoteCopyType; + + /* Set up remote COPY to correct operation */ + if (cstate->copy_dest == COPY_FILE) + remoteCopyType = REMOTE_COPY_FILE; + else + remoteCopyType = REMOTE_COPY_STDOUT; /* * We don't know the value of the distribution column value, so need to * read from all nodes. Hence indicate that the value is NULL. */ - processed = DataNodeCopyOut( - GetRelationNodes(remoteCopyState->rel_loc, 0, true, UNKNOWNOID, RELATION_ACCESS_READ), - remoteCopyState->connections, - cstate->copy_file); + processed = DataNodeCopyOut(GetRelationNodes(remoteCopyState->rel_loc, 0, + true, UNKNOWNOID, + RELATION_ACCESS_READ), + remoteCopyState->connections, + NULL, + cstate->copy_file, + NULL, + remoteCopyType); } else { @@ -4289,9 +4300,8 @@ CreateCopyDestReceiver(void) static RemoteCopyOptions * GetRemoteCopyOptions(CopyState cstate) { - RemoteCopyOptions *res; + RemoteCopyOptions *res = makeRemoteCopyOptions(); Assert(cstate); - res = (RemoteCopyOptions *) palloc0(sizeof(RemoteCopyOptions)); /* Then fill in structure */ res->rco_binary = cstate->binary; diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index b3aaa88541..2cf1ec71b2 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -89,8 +89,11 @@ #ifdef PGXC #include "pgxc/pgxc.h" #include "access/gtm.h" +#include "catalog/pgxc_class.h" +#include "catalog/pgxc_node.h" #include "commands/sequence.h" #include "pgxc/execRemote.h" +#include "pgxc/redistrib.h" #endif /* @@ -139,7 +142,12 @@ static List *on_commits = NIL; #define AT_PASS_ADD_INDEX 6 /* ADD indexes */ #define AT_PASS_ADD_CONSTR 7 /* ADD constraints, defaults */ #define AT_PASS_MISC 8 /* other stuff */ +#ifdef PGXC +#define AT_PASS_DISTRIB 9 /* Redistribution pass */ +#define AT_NUM_PASSES 10 +#else #define AT_NUM_PASSES 9 +#endif typedef struct AlteredTableInfo { @@ -375,7 +383,14 @@ static void ATExecAddOf(Relation rel, const TypeName 
*ofTypename, LOCKMODE lockm static void ATExecDropOf(Relation rel, LOCKMODE lockmode); static void ATExecGenericOptions(Relation rel, List *options); #ifdef PGXC +static void AtExecDistributeBy(Relation rel, DistributeBy *options); +static void AtExecSubCluster(Relation rel, PGXCSubCluster *options); +static void AtExecAddNode(Relation rel, List *options); +static void AtExecDeleteNode(Relation rel, List *options); static void ATCheckCmd(Relation rel, AlterTableCmd *cmd); +static RedistribState *BuildRedistribCommands(Oid relid, List *subCmds); +static Oid *delete_node_list(Oid *old_oids, int old_num, Oid *del_oids, int del_num, int *new_num); +static Oid *add_node_list(Oid *old_oids, int old_num, Oid *add_oids, int add_num, int *new_num); #endif static void copy_relation_data(SMgrRelation rel, SMgrRelation dst, @@ -620,7 +635,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId) #ifdef PGXC /* * Add to pgxc_class. - * we need to do this after CommandCounterIncrement + * we need to do this after CommandCounterIncrement */ if (IS_PGXC_COORDINATOR && relkind == RELKIND_RELATION) { @@ -2509,7 +2524,17 @@ CheckTableNotInUse(Relation rel, const char *stmt) * lock level we want as we recurse may well be higher than required for * that specific subcommand. So we pass down the overall lock requirement, * rather than reassess it at lower levels. + * + */ +#ifdef PGXC +/* + * In Postgres-XC, an extension is added to ALTER TABLE for modification + * of the data distribution. Depending on the old and new distribution type + * of the relation redistributed, a list of redistribution subcommands is built. + * Data redistribution cannot be done in parallel of operations that need + * the table to be rewritten like column addition/deletion. 
*/ +#endif void AlterTable(AlterTableStmt *stmt) { @@ -2696,6 +2721,15 @@ AlterTableGetLockLevel(List *cmds) cmd_lockmode = AccessExclusiveLock; break; +#ifdef PGXC + case AT_DistributeBy: /* Changes table distribution type */ + case AT_SubCluster: /* Changes node list of distribution */ + case AT_AddNodeList: /* Adds nodes in distribution */ + case AT_DeleteNodeList: /* Deletes nodes in distribution */ + cmd_lockmode = ExclusiveLock; + break; +#endif + /* * These subcommands affect write operations only. */ @@ -2819,6 +2853,9 @@ ATController(Relation rel, List *cmds, bool recurse, LOCKMODE lockmode) { List *wqueue = NIL; ListCell *lcmd; +#ifdef PGXC + RedistribState *redistribState = NULL; +#endif /* Phase 1: preliminary examination of commands, create work queue */ foreach(lcmd, cmds) @@ -2833,12 +2870,82 @@ ATController(Relation rel, List *cmds, bool recurse, LOCKMODE lockmode) ATPrepCmd(&wqueue, rel, cmd, recurse, false, lockmode); } +#ifdef PGXC + /* Only check that on local Coordinator */ + if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + { + ListCell *ltab; + + /* + * Redistribution is only applied to the parent table and not subsequent + * children. It is also not applied in recursion. This needs to be done + * once all the commands have been treated. + */ + foreach(ltab, wqueue) + { + AlteredTableInfo *tab = (AlteredTableInfo *) lfirst(ltab); + + if (RelationGetRelid(rel) == tab->relid && + list_length(tab->subcmds[AT_PASS_DISTRIB]) > 0) + { + /* + * Check if there are any commands incompatible + * with redistribution. For the time being no other commands + * are authorized. 
+ */ + if (list_length(tab->subcmds[AT_PASS_ADD_COL]) > 0 || + list_length(tab->subcmds[AT_PASS_DROP]) > 0 || + list_length(tab->subcmds[AT_PASS_ALTER_TYPE]) > 0 || + list_length(tab->subcmds[AT_PASS_OLD_CONSTR]) > 0 || + list_length(tab->subcmds[AT_PASS_COL_ATTRS]) > 0 || + list_length(tab->subcmds[AT_PASS_ADD_COL]) > 0 || + list_length(tab->subcmds[AT_PASS_ADD_INDEX]) > 0 || + list_length(tab->subcmds[AT_PASS_ADD_CONSTR]) > 0 || + list_length(tab->subcmds[AT_PASS_MISC]) > 0) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + errmsg("Incompatible operation with data redistribution"))); + + + /* Scan redistribution commands and improve operation */ + redistribState = BuildRedistribCommands(RelationGetRelid(rel), + tab->subcmds[AT_PASS_DISTRIB]); + break; + } + } + } +#endif + /* Close the relation, but keep lock until commit */ relation_close(rel, NoLock); +#ifdef PGXC + /* Perform pre-catalog-update redistribution operations */ + PGXCRedistribTable(redistribState, CATALOG_UPDATE_BEFORE); +#endif + /* Phase 2: update system catalogs */ ATRewriteCatalogs(&wqueue, lockmode); +#ifdef PGXC + /* Invalidate cache for redistributed relation */ + if (redistribState) + { + Relation rel2 = relation_open(redistribState->relid, NoLock); + + /* Invalidate all entries related to this relation */ + CacheInvalidateRelcache(rel2); + + /* Make sure locator info is rebuilt */ + RelationCacheInvalidateEntry(redistribState->relid); + relation_close(rel2, NoLock); + } + + /* Perform post-catalog-update redistribution operations */ + PGXCRedistribTable(redistribState, CATALOG_UPDATE_AFTER); + FreeRedistribState(redistribState); +#endif + /* Phase 3: scan/rewrite tables as needed */ ATRewriteTables(&wqueue, lockmode); } @@ -3060,6 +3167,16 @@ ATPrepCmd(List **wqueue, Relation rel, AlterTableCmd *cmd, /* No command-specific prep needed */ pass = AT_PASS_MISC; break; +#ifdef PGXC + case AT_DistributeBy: + case AT_SubCluster: + case AT_AddNodeList: + case AT_DeleteNodeList: + 
ATSimplePermissions(rel, ATT_TABLE); + /* No command-specific prep needed */ + pass = AT_PASS_DISTRIB; + break; +#endif default: /* oops */ elog(ERROR, "unrecognized alter table type: %d", (int) cmd->subtype); @@ -3327,6 +3444,20 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, Relation rel, case AT_GenericOptions: ATExecGenericOptions(rel, (List *) cmd->def); break; +#ifdef PGXC + case AT_DistributeBy: + AtExecDistributeBy(rel, (DistributeBy *) cmd->def); + break; + case AT_SubCluster: + AtExecSubCluster(rel, (PGXCSubCluster *) cmd->def); + break; + case AT_AddNodeList: + AtExecAddNode(rel, (List *) cmd->def); + break; + case AT_DeleteNodeList: + AtExecDeleteNode(rel, (List *) cmd->def); + break; +#endif default: /* oops */ elog(ERROR, "unrecognized alter table type: %d", (int) cmd->subtype); @@ -3353,6 +3484,17 @@ ATRewriteTables(List **wqueue, LOCKMODE lockmode) { AlteredTableInfo *tab = (AlteredTableInfo *) lfirst(ltab); +#ifdef PGXC + /* Forbid table rewrite operations with online data redistribution */ + if (tab->rewrite && + list_length(tab->subcmds[AT_PASS_DISTRIB]) > 0 && + IS_PGXC_COORDINATOR && + !IsConnFromCoord()) + ereport(ERROR, + (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), + errmsg("Incompatible operation with data redistribution"))); +#endif + /* Foreign tables have no storage. */ if (tab->relkind == RELKIND_FOREIGN_TABLE) continue; @@ -3464,7 +3606,7 @@ ATRewriteTables(List **wqueue, LOCKMODE lockmode) } #ifdef PGXC - /* + /* * In PGXC, do not check the FK constraints on the Coordinator, and just return * That is because a SELECT is generated whose plan will try and use * the Datanodes. We (currently) do not want to do that on the Coordinator, @@ -9180,8 +9322,179 @@ ATExecGenericOptions(Relation rel, List *options) #ifdef PGXC /* + * ALTER TABLE <name> DISTRIBUTE BY ... 
+ */ +static void +AtExecDistributeBy(Relation rel, DistributeBy *options) +{ + Oid relid; + char locatortype; + int hashalgorithm, hashbuckets; + AttrNumber attnum; + + /* Nothing to do on Datanodes */ + if (IS_PGXC_DATANODE || options == NULL) + return; + + relid = RelationGetRelid(rel); + + /* Get necessary distribution information */ + GetRelationDistributionItems(relid, + options, + RelationGetDescr(rel), + &locatortype, + &hashalgorithm, + &hashbuckets, + &attnum); + + /* + * It is not checked if the distribution type list is the same as the old one, + * user might define a different sub-cluster at the same time. + */ + + /* Update pgxc_class entry */ + PgxcClassAlter(relid, + locatortype, + (int) attnum, + hashalgorithm, + hashbuckets, + 0, + NULL, + PGXC_CLASS_ALTER_DISTRIBUTION); + + /* Make the additional catalog changes visible */ + CommandCounterIncrement(); +} + + +/* + * ALTER TABLE <name> TO [ NODE nodelist | GROUP groupname ] + */ +static void +AtExecSubCluster(Relation rel, PGXCSubCluster *options) +{ + Oid *nodeoids; + int numnodes; + + /* Nothing to do on Datanodes */ + if (IS_PGXC_DATANODE || options == NULL) + return; + + /* + * It is not checked if the new subcluster list is the same as the old one, + * user might define a different distribution type. + */ + + /* Obtain new node information */ + nodeoids = GetRelationDistributionNodes(options, &numnodes); + + /* Update pgxc_class entry */ + PgxcClassAlter(RelationGetRelid(rel), + '\0', + 0, + 0, + 0, + numnodes, + nodeoids, + PGXC_CLASS_ALTER_NODES); + + /* Make the additional catalog changes visible */ + CommandCounterIncrement(); +} + + +/* + * ALTER TABLE <name> ADD NODE nodelist + */ +static void +AtExecAddNode(Relation rel, List *options) +{ + Oid *add_oids, *old_oids; + int add_num, old_num; + + /* Nothing to do on Datanodes */ + if (IS_PGXC_DATANODE || options == NIL) + return; + + /* + * Build a new array of sorted node Oids given the list of name nodes + * to be added. 
+ */ + add_oids = BuildRelationDistributionNodes(options, &add_num); + + /* + * Then check if nodes to be added are not in existing node + * list and build updated list of nodes. + */ + old_num = get_pgxc_classnodes(RelationGetRelid(rel), &old_oids); + + /* Add elements to array */ + old_oids = add_node_list(old_oids, old_num, add_oids, add_num, &old_num); + + /* Sort once again the newly-created array of node Oids to maintain consistency */ + old_oids = SortRelationDistributionNodes(old_oids, old_num); + + /* Update pgxc_class entry */ + PgxcClassAlter(RelationGetRelid(rel), + '\0', + 0, + 0, + 0, + old_num, + old_oids, + PGXC_CLASS_ALTER_NODES); + + /* Make the additional catalog changes visible */ + CommandCounterIncrement(); +} + + +/* + * ALTER TABLE <name> DELETE NODE nodelist + */ +static void +AtExecDeleteNode(Relation rel, List *options) +{ + Oid *del_oids, *old_oids; + int del_num, old_num; + + /* Nothing to do on Datanodes */ + if (IS_PGXC_DATANODE || options == NIL) + return; + + /* + * Build a new array of sorted node Oids given the list of name nodes + * to be deleted. + */ + del_oids = BuildRelationDistributionNodes(options, &del_num); + + /* + * Check if nodes to be deleted are really included in existing + * node list and get updated list of nodes. 
+ */ + old_num = get_pgxc_classnodes(RelationGetRelid(rel), &old_oids); + + /* Delete elements on array */ + old_oids = delete_node_list(old_oids, old_num, del_oids, del_num, &old_num); + + /* Update pgxc_class entry */ + PgxcClassAlter(RelationGetRelid(rel), + '\0', + 0, + 0, + 0, + old_num, + old_oids, + PGXC_CLASS_ALTER_NODES); + + /* Make the additional catalog changes visible */ + CommandCounterIncrement(); +} + + +/* * ATCheckCmd - * + * * Check ALTER TABLE restrictions in Postgres-XC */ static void @@ -9205,6 +9518,218 @@ ATCheckCmd(Relation rel, AlterTableCmd *cmd) break; } } + + +/* + * BuildRedistribCommands + * Evaluate new and old distribution and build the list of operations + * necessary to perform table redistribution. + */ +static RedistribState * +BuildRedistribCommands(Oid relid, List *subCmds) +{ + RedistribState *redistribState = makeRedistribState(relid); + RelationLocInfo *oldLocInfo, *newLocInfo; /* Former locator info */ + Relation rel; + Oid *new_oid_array; /* Modified list of Oids */ + int new_num, i; /* Modified number of Oids */ + ListCell *item; + + /* Get necessary information about relation */ + rel = relation_open(redistribState->relid, NoLock); + oldLocInfo = RelationGetLocInfo(rel); + Assert(oldLocInfo); + + /* + * Get a copy of the locator information that will be modified by + * successive ALTER TABLE commands. + */ + newLocInfo = CopyRelationLocInfo(oldLocInfo); + /* The node list of this locator information will be rebuilt after command scan */ + list_free(newLocInfo->nodeList); + newLocInfo->nodeList = NULL; + + /* Get the list to be modified */ + new_num = get_pgxc_classnodes(RelationGetRelid(rel), &new_oid_array); + + foreach(item, subCmds) + { + AlterTableCmd *cmd = (AlterTableCmd *) lfirst(item); + switch (cmd->subtype) + { + case AT_DistributeBy: + /* + * Get necessary distribution information and update to new + * distribution type. 
+ */ + GetRelationDistributionItems(redistribState->relid, + (DistributeBy *) cmd->def, + RelationGetDescr(rel), + &(newLocInfo->locatorType), + NULL, + NULL, + (AttrNumber *)&(newLocInfo->partAttrNum)); + break; + case AT_SubCluster: + /* Update new list of nodes */ + new_oid_array = GetRelationDistributionNodes((PGXCSubCluster *) cmd->def, &new_num); + break; + case AT_AddNodeList: + { + Oid *add_oids; + int add_num; + add_oids = BuildRelationDistributionNodes((List *) cmd->def, &add_num); + /* Add elements to array */ + new_oid_array = add_node_list(new_oid_array, new_num, add_oids, add_num, &new_num); + } + break; + case AT_DeleteNodeList: + { + Oid *del_oids; + int del_num; + del_oids = BuildRelationDistributionNodes((List *) cmd->def, &del_num); + /* Delete elements from array */ + new_oid_array = delete_node_list(new_oid_array, new_num, del_oids, del_num, &new_num); + } + break; + default: + Assert(0); /* Should not happen */ + } + } + + /* Build relation node list for new locator info */ + for (i = 0; i < new_num; i++) + newLocInfo->nodeList = lappend_int(newLocInfo->nodeList, + PGXCNodeGetNodeId(new_oid_array[i], + PGXC_NODE_DATANODE)); + + /* Build the command tree for table redistribution */ + PGXCRedistribCreateCommandList(redistribState, newLocInfo); + + /* Clean up */ + FreeRelationLocInfo(newLocInfo); + pfree(new_oid_array); + relation_close(rel, NoLock); + + return redistribState; +} + + +/* + * Delete from given Oid array old_oids the given oid list del_oids + * and build a new one. + */ +Oid * +delete_node_list(Oid *old_oids, int old_num, Oid *del_oids, int del_num, int *new_num) +{ + /* Allocate former array and data */ + Oid *new_oids = old_oids; + int loc_new_num = old_num; + int i; + + /* + * Delete from existing node Oid array the elements to be removed. + * An error is returned if an element to be deleted is not in existing array. 
+ * It is not necessary to sort once again the result array of node Oids + * as here only a deletion of elements is done. + */ + for (i = 0; i < del_num; i++) + { + Oid nodeoid = del_oids[i]; + int j, position; + bool is_listed = false; + position = 0; + + for (j = 0; j < loc_new_num; j++) + { + /* Check if element can be removed */ + if (nodeoid == new_oids[j]) + { + is_listed = true; + position = j; + } + } + + /* Move all the elements from [j+1, n-1] to [j, n-2] */ + if (is_listed) + { + for (j = position + 1; j < loc_new_num; j++) + new_oids[j - 1] = new_oids[j]; + + loc_new_num--; + + /* Not possible to have an empty list */ + if (loc_new_num == 0) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("Node list is empty: one node at least is mandatory"))); + + new_oids = (Oid *) repalloc(new_oids, loc_new_num * sizeof(Oid)); + } + else + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("PGXC Node %s: object not in relation node list", + get_pgxc_nodename(nodeoid)))); + } + + /* Save new number of nodes */ + *new_num = loc_new_num; + return new_oids; +} + + +/* + * Add to given Oid array old_oids the given oid list add_oids + * and build a new one. + */ +Oid * +add_node_list(Oid *old_oids, int old_num, Oid *add_oids, int add_num, int *new_num) +{ + /* Allocate former array and data */ + Oid *new_oids = old_oids; + int loc_new_num = old_num; + int i; + + /* + * Build new Oid list, both addition and old list are already sorted. + * The idea here is to go through the list of nodes to be added and + * add the elements one-by-one on the existing list. + * An error is returned if an element to be added already exists + * in relation node array. + * Here we do O(n^2) scan to avoid a dependency with the way + * oids are sorted by heap APIs. They are sorted once again once + * the addition operation is completed. 
+ */ + for (i = 0; i < add_num; i++) + { + Oid nodeoid = add_oids[i]; + int j; + + /* Check if element is already a part of array */ + for (j = 0; j < loc_new_num; j++) + { + /* Item is already in node list */ + if (nodeoid == new_oids[j]) + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), + errmsg("PGXC Node %s: object already in relation node list", + get_pgxc_nodename(nodeoid)))); + } + + /* If we are here, element can be added safely in node array */ + loc_new_num++; + new_oids = (Oid *) repalloc(new_oids, loc_new_num * sizeof(Oid)); + new_oids[loc_new_num - 1] = nodeoid; + } + + /* Sort once again the newly-created array of node Oids to maintain consistency */ + new_oids = SortRelationDistributionNodes(new_oids, loc_new_num); + + /* Save new number of nodes */ + *new_num = loc_new_num; + return new_oids; +} #endif diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index c87fdbf3d9..7b6050e4f4 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -2038,6 +2038,40 @@ alter_table_cmd: n->def = (Node *)$1; $$ = (Node *) n; } +/* PGXC_BEGIN */ + /* ALTER TABLE <name> DISTRIBUTE BY ... 
*/ + | OptDistributeByInternal + { + AlterTableCmd *n = makeNode(AlterTableCmd); + n->subtype = AT_DistributeBy; + n->def = (Node *)$1; + $$ = (Node *)n; + } + /* ALTER TABLE <name> TO [ NODE (nodelist) | GROUP groupname ] */ + | OptSubClusterInternal + { + AlterTableCmd *n = makeNode(AlterTableCmd); + n->subtype = AT_SubCluster; + n->def = (Node *)$1; + $$ = (Node *)n; + } + /* ALTER TABLE <name> ADD NODE (nodelist) */ + | ADD_P NODE pgxcnodes + { + AlterTableCmd *n = makeNode(AlterTableCmd); + n->subtype = AT_AddNodeList; + n->def = (Node *)$3; + $$ = (Node *)n; + } + /* ALTER TABLE <name> DELETE NODE (nodelist) */ + | DELETE_P NODE pgxcnodes + { + AlterTableCmd *n = makeNode(AlterTableCmd); + n->subtype = AT_DeleteNodeList; + n->def = (Node *)$3; + $$ = (Node *)n; + } +/* PGXC_END */ ; alter_column_default: diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index edd4a104e0..f98e6ea59d 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -93,7 +93,8 @@ typedef struct IndexStmt *pkey; /* PRIMARY KEY index, if any */ #ifdef PGXC char *fallback_dist_col; /* suggested column to distribute on */ - DistributeBy *distributeby; /* original distribute by column in create table */ + DistributeBy *distributeby; /* original distribute by column of CREATE TABLE */ + PGXCSubCluster *subcluster; /* original subcluster option of CREATE TABLE */ #endif } CreateStmtContext; @@ -2415,6 +2416,7 @@ transformAlterTableStmt(AlterTableStmt *stmt, const char *queryString) #ifdef PGXC cxt.fallback_dist_col = NULL; cxt.distributeby = NULL; + cxt.subcluster = NULL; #endif /* diff --git a/src/backend/pgxc/copy/Makefile b/src/backend/pgxc/copy/Makefile index a8cfbd86da..2ddcc904b3 100644 --- a/src/backend/pgxc/copy/Makefile +++ b/src/backend/pgxc/copy/Makefile @@ -14,6 +14,6 @@ subdir = src/backend/pgxc/copy top_builddir = ../../../.. 
include $(top_builddir)/src/Makefile.global -OBJS = remotecopy.o +OBJS = copyops.o remotecopy.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/copy/copyops.c b/src/backend/pgxc/copy/copyops.c new file mode 100644 index 0000000000..a85a06cc09 --- /dev/null +++ b/src/backend/pgxc/copy/copyops.c @@ -0,0 +1,496 @@ +/*------------------------------------------------------------------------- + * + * copyops.c + * Functions related to remote COPY data manipulation and materialization + * of data redistribution + * + * Copyright (c) 2010-2012 Postgres-XC Development Group + * + * + * IDENTIFICATION + * src/backend/pgxc/copy/copyops.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "miscadmin.h" +#include "fmgr.h" +#include "lib/stringinfo.h" +#include "mb/pg_wchar.h" +#include "pgxc/copyops.h" +#include "utils/lsyscache.h" + +/* NULL print marker */ +#define COPYOPS_NULL_PRINT "\\N" + +/* Some octal operations */ +#define ISOCTAL(c) (((c) >= '0') && ((c) <= '7')) +#define OCTVALUE(c) ((c) - '0') +/* Send text representation of one attribute, with conversion and escaping */ +#define DUMPSOFAR() \ + do { \ + if (ptr > start) \ + appendBinaryStringInfo(buf, (char *) start, ptr - start); \ + } while (0) + + +static int get_decimal_from_hex(char hex); +static void attribute_out_text(StringInfo buf, char *string); + +/* + * Return decimal value for a hexadecimal digit + */ +static int +get_decimal_from_hex(char hex) +{ + if (isdigit((unsigned char) hex)) + return hex - '0'; + else + return tolower((unsigned char) hex) - 'a' + 10; +} + + +/* + * Output an attribute to text + * This takes portions of the code of CopyAttributeOutText + */ +static void +attribute_out_text(StringInfo buf, char *string) +{ + char *ptr; + char c; + char *start; + char delimc = COPYOPS_DELIMITER; + bool need_transcoding, encoding_embeds_ascii; + int file_encoding = pg_get_client_encoding(); + + 
need_transcoding = (file_encoding != GetDatabaseEncoding() || + pg_database_encoding_max_length() > 1); + encoding_embeds_ascii = PG_ENCODING_IS_CLIENT_ONLY(file_encoding); + + if (need_transcoding) + ptr = pg_server_to_any(string, strlen(string), file_encoding); + else + ptr = string; + + /* + * We have to grovel through the string searching for control characters + * and instances of the delimiter character. In most cases, though, these + * are infrequent. To avoid overhead from calling CopySendData once per + * character, we dump out all characters between escaped characters in a + * single call. The loop invariant is that the data from "start" to "ptr" + * can be sent literally, but hasn't yet been. + * + * We can skip pg_encoding_mblen() overhead when encoding is safe, because + * in valid backend encodings, extra bytes of a multibyte character never + * look like ASCII. This loop is sufficiently performance-critical that + * it's worth making two copies of it to get the IS_HIGHBIT_SET() test out + * of the normal safe-encoding path. + */ + if (encoding_embeds_ascii) + { + start = ptr; + while ((c = *ptr) != '\0') + { + if ((unsigned char) c < (unsigned char) 0x20) + { + /* + * \r and \n must be escaped, the others are traditional. We + * prefer to dump these using the C-like notation, rather than + * a backslash and the literal character, because it makes the + * dump file a bit more proof against Microsoftish data + * mangling. 
+ */ + switch (c) + { + case '\b': + c = 'b'; + break; + case '\f': + c = 'f'; + break; + case '\n': + c = 'n'; + break; + case '\r': + c = 'r'; + break; + case '\t': + c = 't'; + break; + case '\v': + c = 'v'; + break; + default: + /* If it's the delimiter, must backslash it */ + if (c == delimc) + break; + /* All ASCII control chars are length 1 */ + ptr++; + continue; /* fall to end of loop */ + } + + /* if we get here, we need to convert the control char */ + DUMPSOFAR(); + appendStringInfoCharMacro(buf, '\\'); + appendStringInfoCharMacro(buf, c); + start = ++ptr; + } + else if (c == '\\' || c == delimc) + { + DUMPSOFAR(); + appendStringInfoCharMacro(buf, '\\'); + start = ++ptr; + } + else if (IS_HIGHBIT_SET(c)) + ptr += pg_encoding_mblen(file_encoding, ptr); + else + ptr++; + } + } + else + { + start = ptr; + while ((c = *ptr) != '\0') + { + if ((unsigned char) c < (unsigned char) 0x20) + { + /* + * \r and \n must be escaped, the others are traditional. We + * prefer to dump these using the C-like notation, rather than + * a backslash and the literal character, because it makes the + * dump file a bit more proof against Microsoftish data + * mangling. 
+ */ + switch (c) + { + case '\b': + c = 'b'; + break; + case '\f': + c = 'f'; + break; + case '\n': + c = 'n'; + break; + case '\r': + c = 'r'; + break; + case '\t': + c = 't'; + break; + case '\v': + c = 'v'; + break; + default: + /* If it's the delimiter, must backslash it */ + if (c == delimc) + break; + /* All ASCII control chars are length 1 */ + ptr++; + continue; /* fall to end of loop */ + } + /* if we get here, we need to convert the control char */ + DUMPSOFAR(); + appendStringInfoCharMacro(buf, '\\'); + appendStringInfoCharMacro(buf, c); + start = ++ptr; + } + else if (c == '\\' || c == delimc) + { + DUMPSOFAR(); + appendStringInfoCharMacro(buf, '\\'); + start = ++ptr; + } + else + ptr++; + } + } + + DUMPSOFAR(); +} + + +/* + * CopyOps_RawDataToArrayField + * Convert the raw output of COPY TO to an array of fields. + * This is a simplified version of CopyReadAttributesText used for data + * redistribution and storage of tuple data into a tuple store. + */ +char ** +CopyOps_RawDataToArrayField(TupleDesc tupdesc, char *message, int len) +{ + char delimc = COPYOPS_DELIMITER; + int fieldno; + int null_print_len = strlen(COPYOPS_NULL_PRINT); + char *origin_ptr; + char *output_ptr; + char *cur_ptr; + char *line_end_ptr; + int fields = tupdesc->natts; + char **raw_fields; + Form_pg_attribute *attr = tupdesc->attrs; + + /* Adjust number of fields depending on dropped attributes */ + for (fieldno = 0; fieldno < tupdesc->natts; fieldno++) + { + if (attr[fieldno]->attisdropped) + fields--; + } + + /* Then alloc necessary space */ + raw_fields = (char **) palloc(fields * sizeof(char *)); + + /* Take a copy of message to manipulate */ + origin_ptr = (char *) palloc0(sizeof(char) * (len + 1)); + memcpy(origin_ptr, message, len + 1); + + /* Add clean separator '\0' at the end of message */ + origin_ptr[len] = '\0'; + + /* Keep track of original pointer */ + output_ptr = origin_ptr; + + /* set pointer variables for loop */ + cur_ptr = message; + line_end_ptr = message 
+ len; + + /* Outer loop iterates over fields */ + fieldno = 0; + for (;;) + { + char *start_ptr; + char *end_ptr; + int input_len; + bool found_delim = false; + bool saw_non_ascii = false; + + /* Make sure there is enough space for the next value */ + if (fieldno >= fields) + { + fields *= 2; + raw_fields = repalloc(raw_fields, fields * sizeof(char *)); + } + + /* Remember start of field on output side */ + start_ptr = cur_ptr; + raw_fields[fieldno] = output_ptr; + + /* Scan data for field */ + for (;;) + { + char c; + + end_ptr = cur_ptr; + if (cur_ptr >= line_end_ptr) + break; + c = *cur_ptr++; + if (c == delimc) + { + found_delim = true; + break; + } + if (c == '\\') + { + if (cur_ptr >= line_end_ptr) + break; + c = *cur_ptr++; + switch (c) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + { + /* handle \013 */ + int val; + + val = OCTVALUE(c); + if (cur_ptr < line_end_ptr) + { + c = *cur_ptr; + if (ISOCTAL(c)) + { + cur_ptr++; + val = (val << 3) + OCTVALUE(c); + if (cur_ptr < line_end_ptr) + { + c = *cur_ptr; + if (ISOCTAL(c)) + { + cur_ptr++; + val = (val << 3) + OCTVALUE(c); + } + } + } + } + c = val & 0377; + if (c == '\0' || IS_HIGHBIT_SET(c)) + saw_non_ascii = true; + } + break; + case 'x': + /* Handle \x3F */ + if (cur_ptr < line_end_ptr) + { + char hexchar = *cur_ptr; + + if (isxdigit((unsigned char) hexchar)) + { + int val = get_decimal_from_hex(hexchar); + + cur_ptr++; + if (cur_ptr < line_end_ptr) + { + hexchar = *cur_ptr; + if (isxdigit((unsigned char) hexchar)) + { + cur_ptr++; + val = (val << 4) + get_decimal_from_hex(hexchar); + } + } + c = val & 0xff; + if (c == '\0' || IS_HIGHBIT_SET(c)) + saw_non_ascii = true; + } + } + break; + case 'b': + c = '\b'; + break; + case 'f': + c = '\f'; + break; + case 'n': + c = '\n'; + break; + case 'r': + c = '\r'; + break; + case 't': + c = '\t'; + break; + case 'v': + c = '\v'; + break; + + /* + * in all other cases, take the char after '\' + * literally + 
*/ + } + } + + /* Add c to output string */ + *output_ptr++ = c; + } + + /* Terminate attribute value in output area */ + *output_ptr++ = '\0'; + + /* + * If we de-escaped a non-7-bit-ASCII char, make sure we still have + * valid data for the db encoding. Avoid calling strlen here for the + * sake of efficiency. + */ + if (saw_non_ascii) + { + char *fld = raw_fields[fieldno]; + + pg_verifymbstr(fld, output_ptr - (fld + 1), false); + } + + /* Check whether raw input matched null marker */ + input_len = end_ptr - start_ptr; + if (input_len == null_print_len && + strncmp(start_ptr, COPYOPS_NULL_PRINT, input_len) == 0) + raw_fields[fieldno] = NULL; + + fieldno++; + /* Done if we hit EOL instead of a delim */ + if (!found_delim) + break; + } + + /* Clean up state of attribute_buf */ + output_ptr--; + Assert(*output_ptr == '\0'); + + return raw_fields; +} + +/* + * CopyOps_BuildOneRowTo + * Build one row message to be sent to remote nodes through COPY protocol + */ +char * +CopyOps_BuildOneRowTo(TupleDesc tupdesc, Datum *values, bool *nulls, int *len) +{ + bool need_delim = false; + char *res; + int i; + FmgrInfo *out_functions; + Form_pg_attribute *attr = tupdesc->attrs; + StringInfo buf; + + /* Get info about the columns we need to process. 
*/ + out_functions = (FmgrInfo *) palloc(tupdesc->natts * sizeof(FmgrInfo)); + for (i = 0; i < tupdesc->natts; i++) + { + Oid out_func_oid; + bool isvarlena; + + /* Do not need any information for dropped attributes */ + if (attr[i]->attisdropped) + continue; + + getTypeOutputInfo(attr[i]->atttypid, + &out_func_oid, + &isvarlena); + fmgr_info(out_func_oid, &out_functions[i]); + } + + /* Initialize output buffer */ + buf = makeStringInfo(); + + for (i = 0; i < tupdesc->natts; i++) + { + Datum value = values[i]; + bool isnull = nulls[i]; + + /* Do not need any information for dropped attributes */ + if (attr[i]->attisdropped) + continue; + + if (need_delim) + appendStringInfoCharMacro(buf, COPYOPS_DELIMITER); + need_delim = true; + + if (isnull) + { + /* Null print value to client */ + appendBinaryStringInfo(buf, "\\N", strlen("\\N")); + } + else + { + char *string; + string = OutputFunctionCall(&out_functions[i], + value); + attribute_out_text(buf, string); + pfree(string); + } + } + + /* Record length of message */ + *len = buf->len; + res = pstrdup(buf->data); + pfree(out_functions); + pfree(buf->data); + pfree(buf); + return res; +} diff --git a/src/backend/pgxc/copy/remotecopy.c b/src/backend/pgxc/copy/remotecopy.c index 8c3eba0bff..5c0299dc64 100644 --- a/src/backend/pgxc/copy/remotecopy.c +++ b/src/backend/pgxc/copy/remotecopy.c @@ -167,7 +167,6 @@ RemoteCopy_BuildStatement(RemoteCopyData *state, else appendStringInfoString(&state->query_buf, " TO STDOUT"); - if (options->rco_binary) appendStringInfoString(&state->query_buf, " BINARY"); @@ -201,7 +200,6 @@ RemoteCopy_BuildStatement(RemoteCopyData *state, * It is not necessary to send the HEADER part to Datanodes. * Sending data is sufficient. 
*/ - if (options->rco_quote && options->rco_quote[0] != '"') { appendStringInfoString(&state->query_buf, " QUOTE AS "); @@ -245,6 +243,26 @@ RemoteCopy_BuildStatement(RemoteCopyData *state, /* + * Build a default set for RemoteCopyOptions + */ +RemoteCopyOptions * +makeRemoteCopyOptions(void) +{ + RemoteCopyOptions *res = (RemoteCopyOptions *) palloc(sizeof(RemoteCopyOptions)); + res->rco_binary = false; + res->rco_oids = false; + res->rco_csv_mode = false; + res->rco_delim = NULL; + res->rco_null_print = NULL; + res->rco_quote = NULL; + res->rco_escape = NULL; + res->rco_force_quote = NIL; + res->rco_force_notnull = NIL; + return res; +} + + +/* * FreeRemoteCopyOptions * Free remote COPY options structure */ diff --git a/src/backend/pgxc/locator/Makefile b/src/backend/pgxc/locator/Makefile index 107fe0f601..66c4c50d2d 100644 --- a/src/backend/pgxc/locator/Makefile +++ b/src/backend/pgxc/locator/Makefile @@ -1,7 +1,7 @@ #------------------------------------------------------------------------- # # Makefile-- -# Makefile for locator +# Makefile for locator and data distribution # # Copyright(C) 2010-2012 Postgres-XC Development Group # @@ -14,6 +14,6 @@ subdir = src/backend/pgxc/locator top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = locator.o +OBJS = locator.o redistrib.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index feab0a1f9e..b5b920a443 100644 --- a/src/backend/pgxc/locator/locator.c +++ b/src/backend/pgxc/locator/locator.c @@ -440,7 +440,6 @@ IsModuloColumn(RelationLocInfo *rel_loc_info, char *part_col_name) /* * IsModuloColumnForRelId - return whether or not column for relation is used for modulo distribution. 
- * */ bool IsModuloColumnForRelId(Oid relid, char *part_col_name) @@ -502,6 +501,42 @@ IsTableDistOnPrimary(RelationLocInfo *rel_loc_info) return false; } + +/* + * IsLocatorInfoEqual + * Check equality of given locator information + */ +bool +IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info2) +{ + List *nodeList1, *nodeList2; + Assert(rel_loc_info1 && rel_loc_info2); + + nodeList1 = rel_loc_info1->nodeList; + nodeList2 = rel_loc_info2->nodeList; + + /* Same relation? */ + if (rel_loc_info1->relid != rel_loc_info2->relid) + return false; + + /* Same locator type? */ + if (rel_loc_info1->locatorType != rel_loc_info2->locatorType) + return false; + + /* Same attribute number? */ + if (rel_loc_info1->partAttrNum != rel_loc_info2->partAttrNum) + return false; + + /* Same node list? */ + if (list_difference_int(nodeList1, nodeList2) != NIL || + list_difference_int(nodeList2, nodeList1) != NIL) + return false; + + /* Everything is equal */ + return true; +} + + /* * GetRelationNodes * diff --git a/src/backend/pgxc/locator/redistrib.c b/src/backend/pgxc/locator/redistrib.c new file mode 100644 index 0000000000..264f01b1d1 --- /dev/null +++ b/src/backend/pgxc/locator/redistrib.c @@ -0,0 +1,871 @@ +/*------------------------------------------------------------------------- + * + * redistrib.c + * Routines related to online data redistribution + * + * Copyright (c) 2010-2012 Postgres-XC Development Group + * + * + * IDENTIFICATION + * src/backend/pgxc/locator/redistrib.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "miscadmin.h" + +#include "access/hash.h" +#include "access/htup.h" +#include "access/xact.h" +#include "catalog/pg_type.h" +#include "catalog/pgxc_node.h" +#include "commands/tablecmds.h" +#include "pgxc/copyops.h" +#include "pgxc/execRemote.h" +#include "pgxc/pgxc.h" +#include "pgxc/redistrib.h" +#include "pgxc/remotecopy.h" +#include "utils/lsyscache.h" 
+#include "utils/rel.h" +#include "utils/snapmgr.h" + +#define IsCommandTypePreUpdate(x) (x == CATALOG_UPDATE_BEFORE || \ + x == CATALOG_UPDATE_BOTH) +#define IsCommandTypePostUpdate(x) (x == CATALOG_UPDATE_AFTER || \ + x == CATALOG_UPDATE_BOTH) + +/* Functions used for the execution of redistribution commands */ +static void distrib_execute_query(char *sql, bool is_temp, ExecNodes *exec_nodes); +static void distrib_execute_command(RedistribState *distribState, RedistribCommand *command); +static void distrib_copy_to(RedistribState *distribState); +static void distrib_copy_from(RedistribState *distribState, ExecNodes *exec_nodes); +static void distrib_truncate(RedistribState *distribState, ExecNodes *exec_nodes); +static void distrib_reindex(RedistribState *distribState, ExecNodes *exec_nodes); +static void distrib_delete_hash(RedistribState *distribState, ExecNodes *exec_nodes); + +/* Functions used to build the command list */ +static void pgxc_redist_build_entry(RedistribState *distribState, + RelationLocInfo *oldLocInfo, + RelationLocInfo *newLocInfo); +static void pgxc_redist_build_replicate(RedistribState *distribState, + RelationLocInfo *oldLocInfo, + RelationLocInfo *newLocInfo); +static void pgxc_redist_build_replicate_to_distrib(RedistribState *distribState, + RelationLocInfo *oldLocInfo, + RelationLocInfo *newLocInfo); + +static void pgxc_redist_build_default(RedistribState *distribState); +static void pgxc_redist_add_reindex(RedistribState *distribState); + + +/* + * PGXCRedistribTable + * Execute redistribution operations after catalog update + */ +void +PGXCRedistribTable(RedistribState *distribState, RedistribCatalog type) +{ + ListCell *item; + + /* Nothing to do if no redistribution operation */ + if (!distribState) + return; + + /* Nothing to do if on remote node */ + if (IS_PGXC_DATANODE || IsConnFromCoord()) + return; + + /* Execute each command if necessary */ + foreach(item, distribState->commands) + { + RedistribCommand *command = 
(RedistribCommand *)lfirst(item); + + /* Check if command can be run */ + if (!IsCommandTypePostUpdate(type) && + IsCommandTypePostUpdate(command->updateState)) + continue; + if (!IsCommandTypePreUpdate(type) && + IsCommandTypePreUpdate(command->updateState)) + continue; + + /* Now enter in execution list */ + distrib_execute_command(distribState, command); + } +} + + +/* + * PGXCRedistribCreateCommandList + * Look for the list of necessary commands to perform table redistribution. + */ +void +PGXCRedistribCreateCommandList(RedistribState *distribState, RelationLocInfo *newLocInfo) +{ + Relation rel; + RelationLocInfo *oldLocInfo; + + rel = relation_open(distribState->relid, NoLock); + oldLocInfo = RelationGetLocInfo(rel); + + /* Build redistribution command list */ + pgxc_redist_build_entry(distribState, oldLocInfo, newLocInfo); + + relation_close(rel, NoLock); +} + + +/* + * pgxc_redist_build_entry + * Entry point for command list building + */ +static void +pgxc_redist_build_entry(RedistribState *distribState, + RelationLocInfo *oldLocInfo, + RelationLocInfo *newLocInfo) +{ + /* If distribution has not changed at all, nothing to do */ + if (IsLocatorInfoEqual(oldLocInfo, newLocInfo)) + return; + + /* Evaluate cases for replicated tables */ + pgxc_redist_build_replicate(distribState, oldLocInfo, newLocInfo); + + /* Evaluate cases for replicated to distributed tables */ + pgxc_redist_build_replicate_to_distrib(distribState, oldLocInfo, newLocInfo); + + /* PGXCTODO: perform more complex builds of command list */ + + /* Fallback to default */ + pgxc_redist_build_default(distribState); +} + + +/* + * pgxc_redist_build_replicate_to_distrib + * Build redistribution command list from replicated to distributed + * table. 
+ */
+static void
+pgxc_redist_build_replicate_to_distrib(RedistribState *distribState,
+				RelationLocInfo *oldLocInfo,
+				RelationLocInfo *newLocInfo)
+{
+	List *removedNodes;
+	List *newNodes;
+
+	/* If a command list has already been built, nothing to do */
+	if (list_length(distribState->commands) != 0)
+		return;
+
+	/* Redistribution is done from replication to distributed (with value) */
+	if (!IsLocatorReplicated(oldLocInfo->locatorType) ||
+		!IsLocatorDistributedByValue(newLocInfo->locatorType))
+		return;
+
+	/* Get the list of nodes that are removed from relation */
+	removedNodes = list_difference_int(oldLocInfo->nodeList, newLocInfo->nodeList);
+
+	/* Get the list of nodes that are added to the relation */
+	newNodes = list_difference_int(newLocInfo->nodeList, oldLocInfo->nodeList);
+
+	/*
+	 * If some nodes are added, turn back to default, we need to fetch data
+	 * and then redistribute it properly.
+	 */
+	if (newNodes != NIL)
+		return;
+
+	/* Nodes removed have to be truncated, so add a TRUNCATE command to removed nodes */
+	if (removedNodes != NIL)
+	{
+		ExecNodes *execNodes = makeNode(ExecNodes);
+		execNodes->nodeList = removedNodes;
+		/* Add TRUNCATE command */
+		distribState->commands = lappend(distribState->commands,
+					 makeRedistribCommand(DISTRIB_TRUNCATE, CATALOG_UPDATE_BEFORE, execNodes));
+	}
+
+	/*
+	 * If the table is redistributed to a single node, a TRUNCATE on removed nodes
+	 * is sufficient so leave here.
+	 */
+	if (list_length(newLocInfo->nodeList) == 1)
+	{
+		/* Add REINDEX command if necessary */
+		pgxc_redist_add_reindex(distribState);
+		return;
+	}
+
+	/*
+	 * If we are here we are sure that redistribution only requires to delete data on remote
+	 * nodes on the new subset of nodes. So launch to remote nodes a DELETE command that only
+	 * eliminates the data not verifying the new hashing condition. 
+ */
+	if (newLocInfo->locatorType == LOCATOR_TYPE_HASH)
+	{
+		ExecNodes *execNodes = makeNode(ExecNodes);
+		execNodes->nodeList = newLocInfo->nodeList;
+		distribState->commands = lappend(distribState->commands,
+					 makeRedistribCommand(DISTRIB_DELETE_HASH, CATALOG_UPDATE_AFTER, execNodes));
+	}
+	else if (newLocInfo->locatorType == LOCATOR_TYPE_MODULO)
+	{
+		ExecNodes *execNodes = makeNode(ExecNodes);
+		execNodes->nodeList = newLocInfo->nodeList;
+		distribState->commands = lappend(distribState->commands,
+					 makeRedistribCommand(DISTRIB_DELETE_MODULO, CATALOG_UPDATE_AFTER, execNodes));
+	}
+	else
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("Incorrect redistribution operation")));
+
+	/* Add REINDEX command if necessary */
+	pgxc_redist_add_reindex(distribState);
+}
+
+
+/*
+ * pgxc_redist_build_replicate
+ *		Build redistribution command list for replicated tables
+ */
+static void
+pgxc_redist_build_replicate(RedistribState *distribState,
+				RelationLocInfo *oldLocInfo,
+				RelationLocInfo *newLocInfo)
+{
+	List *removedNodes;
+	List *newNodes;
+
+	/* If a command list has already been built, nothing to do */
+	if (list_length(distribState->commands) != 0)
+		return;
+
+	/* Case of a replicated table whose set of nodes is changed */
+	if (!IsLocatorReplicated(newLocInfo->locatorType) ||
+		!IsLocatorReplicated(oldLocInfo->locatorType))
+		return;
+
+	/* Get the list of nodes that are removed from relation */
+	removedNodes = list_difference_int(oldLocInfo->nodeList, newLocInfo->nodeList);
+
+	/* Get the list of nodes that are added to the relation */
+	newNodes = list_difference_int(newLocInfo->nodeList, oldLocInfo->nodeList);
+
+	/*
+	 * If nodes have to be added, we need to fetch data for redistribution first.
+	 * So add a COPY TO command to fetch data. 
+ */
+	if (newNodes != NIL)
+	{
+		/* Add COPY TO command */
+		distribState->commands = lappend(distribState->commands,
+					 makeRedistribCommand(DISTRIB_COPY_TO, CATALOG_UPDATE_BEFORE, NULL));
+	}
+
+	/* Nodes removed have to be truncated, so add a TRUNCATE command to removed nodes */
+	if (removedNodes != NIL)
+	{
+		ExecNodes *execNodes = makeNode(ExecNodes);
+		execNodes->nodeList = removedNodes;
+		/* Add TRUNCATE command */
+		distribState->commands = lappend(distribState->commands,
+					 makeRedistribCommand(DISTRIB_TRUNCATE, CATALOG_UPDATE_BEFORE, execNodes));
+	}
+
+	/* If necessary, COPY the data obtained at first step to the new nodes. */
+	if (newNodes != NIL)
+	{
+		ExecNodes *execNodes = makeNode(ExecNodes);
+		execNodes->nodeList = newNodes;
+		/* Add COPY FROM command */
+		distribState->commands = lappend(distribState->commands,
+					 makeRedistribCommand(DISTRIB_COPY_FROM, CATALOG_UPDATE_AFTER, execNodes));
+	}
+
+	/* Add REINDEX command if necessary */
+	pgxc_redist_add_reindex(distribState);
+}
+
+
+/*
+ * pgxc_redist_build_default
+ *		Build a default list consisting of
+ *		COPY TO -> TRUNCATE -> COPY FROM ( -> REINDEX )
+ */
+static void
+pgxc_redist_build_default(RedistribState *distribState)
+{
+	/* If a command list has already been built, nothing to do */
+	if (list_length(distribState->commands) != 0)
+		return;
+
+	/* COPY TO command */
+	distribState->commands = lappend(distribState->commands,
+				 makeRedistribCommand(DISTRIB_COPY_TO, CATALOG_UPDATE_BEFORE, NULL));
+	/* TRUNCATE command */
+	distribState->commands = lappend(distribState->commands,
+				 makeRedistribCommand(DISTRIB_TRUNCATE, CATALOG_UPDATE_BEFORE, NULL));
+	/* COPY FROM command */
+	distribState->commands = lappend(distribState->commands,
+				 makeRedistribCommand(DISTRIB_COPY_FROM, CATALOG_UPDATE_AFTER, NULL));
+
+	/* REINDEX command */
+	pgxc_redist_add_reindex(distribState);
+}
+
+
+/*
+ * pgxc_redist_add_reindex
+ *		Add a reindex command if necessary
+ */
+static void 
+pgxc_redist_add_reindex(RedistribState *distribState) +{ + Relation rel; + + rel = relation_open(distribState->relid, NoLock); + + /* Build REINDEX command if necessary */ + if (RelationGetIndexList(rel) != NIL) + { + distribState->commands = lappend(distribState->commands, + makeRedistribCommand(DISTRIB_REINDEX, CATALOG_UPDATE_AFTER, NULL)); + } + + relation_close(rel, NoLock); +} + + +/* + * distrib_execute_command + * Execute a redistribution operation + */ +static void +distrib_execute_command(RedistribState *distribState, RedistribCommand *command) +{ + /* Execute redistribution command */ + switch (command->type) + { + case DISTRIB_COPY_TO: + distrib_copy_to(distribState); + break; + case DISTRIB_COPY_FROM: + distrib_copy_from(distribState, command->execNodes); + break; + case DISTRIB_TRUNCATE: + distrib_truncate(distribState, command->execNodes); + break; + case DISTRIB_REINDEX: + distrib_reindex(distribState, command->execNodes); + break; + case DISTRIB_DELETE_HASH: + case DISTRIB_DELETE_MODULO: + distrib_delete_hash(distribState, command->execNodes); + break; + case DISTRIB_NONE: + default: + Assert(0); /* Should not happen */ + } +} + + +/* + * distrib_copy_to + * Copy all the data of table to be distributed. + * This data is saved in a tuplestore saved in distribution state. + * a COPY FROM operation is always done on nodes determined by the locator data + * in catalogs, explaining why this cannot be done on a subset of nodes. It also + * insures that no read operations are done on nodes where data is not yet located. 
+ */ +static void +distrib_copy_to(RedistribState *distribState) +{ + Oid relOid = distribState->relid; + Relation rel; + RemoteCopyOptions *options; + RemoteCopyData *copyState; + Tuplestorestate *store; /* Storage of redistributed data */ + + /* Fetch necessary data to prepare for the table data acquisition */ + options = makeRemoteCopyOptions(); + + /* All the fields are separated by tabs in redistribution */ + options->rco_delim = palloc(2); + options->rco_delim[0] = COPYOPS_DELIMITER; + options->rco_delim[1] = '\0'; + + copyState = (RemoteCopyData *) palloc0(sizeof(RemoteCopyData)); + copyState->is_from = false; + + /* A sufficient lock level needs to be taken at a higher level */ + rel = relation_open(relOid, NoLock); + RemoteCopy_GetRelationLoc(copyState, rel, NIL); + RemoteCopy_BuildStatement(copyState, rel, options, NIL, NIL); + + /* Inform client of operation being done */ + ereport(DEBUG1, + (errmsg("Copying data for relation \"%s.%s\"", + get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel)))); + + /* Begin the COPY process */ + copyState->connections = DataNodeCopyBegin(copyState->query_buf.data, + copyState->exec_nodes->nodeList, + GetActiveSnapshot()); + + /* Create tuplestore storage */ + store = tuplestore_begin_heap(true, false, work_mem); + + /* Then get rows and copy them to the tuplestore used for redistribution */ + DataNodeCopyOut(copyState->exec_nodes, + copyState->connections, + RelationGetDescr(rel), /* Need also to set up the tuple descriptor */ + NULL, + store, /* Tuplestore used for redistribution */ + REMOTE_COPY_TUPLESTORE); + + /* Do necessary clean-up */ + FreeRemoteCopyOptions(options); + + /* Lock is maintained until transaction commits */ + relation_close(rel, NoLock); + + /* Save results */ + distribState->store = store; +} + + +/* + * PGXCDistribTableCopyFrom + * Execute commands related to COPY FROM + * Redistribute all the data of table with a COPY FROM from given tuplestore. 
+ */ +static void +distrib_copy_from(RedistribState *distribState, ExecNodes *exec_nodes) +{ + Oid relOid = distribState->relid; + Tuplestorestate *store = distribState->store; + Relation rel; + RemoteCopyOptions *options; + RemoteCopyData *copyState; + bool replicated, contains_tuple = true; + TupleDesc tupdesc; + + /* Nothing to do if on remote node */ + if (IS_PGXC_DATANODE || IsConnFromCoord()) + return; + + /* Fetch necessary data to prepare for the table data acquisition */ + options = makeRemoteCopyOptions(); + /* All the fields are separated by tabs in redistribution */ + options->rco_delim = palloc(2); + options->rco_delim[0] = COPYOPS_DELIMITER; + options->rco_delim[1] = '\0'; + + copyState = (RemoteCopyData *) palloc0(sizeof(RemoteCopyData)); + copyState->is_from = true; + + /* A sufficient lock level needs to be taken at a higher level */ + rel = relation_open(relOid, NoLock); + RemoteCopy_GetRelationLoc(copyState, rel, NIL); + RemoteCopy_BuildStatement(copyState, rel, options, NIL, NIL); + + /* + * When building COPY FROM command in redistribution list, + * use the list of nodes that has been calculated there. + * It might be possible that this COPY is done only on a portion of nodes. 
+ */ + if (exec_nodes && exec_nodes->nodeList != NIL) + { + copyState->exec_nodes->nodeList = exec_nodes->nodeList; + copyState->rel_loc->nodeList = exec_nodes->nodeList; + } + + tupdesc = RelationGetDescr(rel); + + /* Inform client of operation being done */ + ereport(DEBUG1, + (errmsg("Redistributing data for relation \"%s.%s\"", + get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel)))); + + /* Begin redistribution on remote nodes */ + copyState->connections = DataNodeCopyBegin(copyState->query_buf.data, + copyState->exec_nodes->nodeList, + GetActiveSnapshot()); + + /* Transform each tuple stored into a COPY message and send it to remote nodes */ + while (contains_tuple) + { + char *data; + int len; + Form_pg_attribute *attr = tupdesc->attrs; + Datum dist_col_value = (Datum) 0; + bool dist_col_is_null = true; + Oid dist_col_type = UNKNOWNOID; + TupleTableSlot *slot; + ExecNodes *local_execnodes; + + /* Build table slot for this relation */ + slot = MakeSingleTupleTableSlot(tupdesc); + + /* Get tuple slot from the tuplestore */ + contains_tuple = tuplestore_gettupleslot(store, true, false, slot); + if (!contains_tuple) + { + ExecDropSingleTupleTableSlot(slot); + break; + } + + /* Make sure the tuple is fully deconstructed */ + slot_getallattrs(slot); + + /* Find value of distribution column if necessary */ + if (copyState->idx_dist_by_col >= 0) + { + dist_col_value = slot->tts_values[copyState->idx_dist_by_col]; + dist_col_is_null = slot->tts_isnull[copyState->idx_dist_by_col]; + dist_col_type = attr[copyState->idx_dist_by_col]->atttypid; + } + + /* Build message to be sent to Datanodes */ + data = CopyOps_BuildOneRowTo(tupdesc, slot->tts_values, slot->tts_isnull, &len); + + /* Build relation node list */ + local_execnodes = GetRelationNodes(copyState->rel_loc, + dist_col_value, + dist_col_is_null, + dist_col_type, + RELATION_ACCESS_INSERT); + /* Take a copy of the node lists so as not to interfere with locator info */ + 
local_execnodes->primarynodelist = list_copy(local_execnodes->primarynodelist); + local_execnodes->nodeList = list_copy(local_execnodes->nodeList); + + /* Process data to Datanodes */ + DataNodeCopyIn(data, + len, + local_execnodes, + copyState->connections); + + /* Clean up */ + pfree(data); + FreeExecNodes(&local_execnodes); + ExecClearTuple(slot); + ExecDropSingleTupleTableSlot(slot); + } + + /* Finish the redistribution process */ + replicated = copyState->rel_loc->locatorType == LOCATOR_TYPE_REPLICATED; + DataNodeCopyFinish(copyState->connections, + replicated ? PGXCNodeGetNodeId(primary_data_node, PGXC_NODE_DATANODE) : -1, + replicated ? COMBINE_TYPE_SAME : COMBINE_TYPE_SUM); + + /* Lock is maintained until transaction commits */ + relation_close(rel, NoLock); +} + + +/* + * distrib_truncate + * Truncate all the data of specified table. + * This is used as a second step of online data redistribution. + */ +static void +distrib_truncate(RedistribState *distribState, ExecNodes *exec_nodes) +{ + Relation rel; + StringInfo buf; + Oid relOid = distribState->relid; + + /* Nothing to do if on remote node */ + if (IS_PGXC_DATANODE || IsConnFromCoord()) + return; + + /* A sufficient lock level needs to be taken at a higher level */ + rel = relation_open(relOid, NoLock); + + /* Inform client of operation being done */ + ereport(DEBUG1, + (errmsg("Truncating data for relation \"%s.%s\"", + get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel)))); + + /* Initialize buffer */ + buf = makeStringInfo(); + + /* Build query to clean up table before redistribution */ + appendStringInfo(buf, "TRUNCATE %s.%s", + get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel)); + + /* + * Lock is maintained until transaction commits, + * relation needs also to be closed before effectively launching the query. 
+ */ + relation_close(rel, NoLock); + + /* Execute the query */ + distrib_execute_query(buf->data, IsTempTable(relOid), exec_nodes); + + /* Clean buffers */ + pfree(buf->data); + pfree(buf); +} + + +/* + * distrib_reindex + * Reindex the table that has been redistributed + */ +static void +distrib_reindex(RedistribState *distribState, ExecNodes *exec_nodes) +{ + Relation rel; + StringInfo buf; + Oid relOid = distribState->relid; + + /* Nothing to do if on remote node */ + if (IS_PGXC_DATANODE || IsConnFromCoord()) + return; + + /* A sufficient lock level needs to be taken at a higher level */ + rel = relation_open(relOid, NoLock); + + /* Inform client of operation being done */ + ereport(DEBUG1, + (errmsg("Reindexing relation \"%s.%s\"", + get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel)))); + + /* Initialize buffer */ + buf = makeStringInfo(); + + /* Generate the query */ + appendStringInfo(buf, "REINDEX TABLE %s.%s", + get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel)); + + /* Execute the query */ + distrib_execute_query(buf->data, IsTempTable(relOid), exec_nodes); + + /* Clean buffers */ + pfree(buf->data); + pfree(buf); + + /* Lock is maintained until transaction commits */ + relation_close(rel, NoLock); +} + + +/* + * distrib_delete_hash + * Perform a partial tuple deletion of remote tuples not checking the correct hash + * condition. The new distribution condition is set up in exec_nodes when building + * the command list. 
+ */ +static void +distrib_delete_hash(RedistribState *distribState, ExecNodes *exec_nodes) +{ + Relation rel; + StringInfo buf; + Oid relOid = distribState->relid; + ListCell *item; + + /* Nothing to do if on remote node */ + if (IS_PGXC_DATANODE || IsConnFromCoord()) + return; + + /* A sufficient lock level needs to be taken at a higher level */ + rel = relation_open(relOid, NoLock); + + /* Inform client of operation being done */ + ereport(DEBUG1, + (errmsg("Deleting necessary tuples \"%s.%s\"", + get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel)))); + + /* Initialize buffer */ + buf = makeStringInfo(); + + /* Build query to clean up table before redistribution */ + appendStringInfo(buf, "DELETE FROM %s.%s", + get_namespace_name(RelationGetNamespace(rel)), + RelationGetRelationName(rel)); + + /* + * Launch the DELETE query to each node as the DELETE depends on + * local conditions for each node. + */ + foreach(item, exec_nodes->nodeList) + { + StringInfo buf2; + char *hashfuncname, *colname; + Oid hashtype; + RelationLocInfo *locinfo = RelationGetLocInfo(rel); + int nodenum = lfirst_int(item); + int nodepos = 0; + ExecNodes *local_exec_nodes = makeNode(ExecNodes); + TupleDesc tupDesc = RelationGetDescr(rel); + Form_pg_attribute *attr = tupDesc->attrs; + ListCell *item2; + + /* Here the query is launched to a unique node */ + local_exec_nodes->nodeList = lappend_int(NIL, nodenum); + + /* Get the hash type of relation */ + hashtype = attr[locinfo->partAttrNum - 1]->atttypid; + + /* Get function hash name */ + hashfuncname = get_compute_hash_function(hashtype, locinfo->locatorType); + + /* Get distribution column name */ + if (locinfo->locatorType == LOCATOR_TYPE_HASH) + colname = GetRelationHashColumn(locinfo); + else if (locinfo->locatorType == LOCATOR_TYPE_MODULO) + colname = GetRelationModuloColumn(locinfo); + else + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("Incorrect redistribution operation"))); + + /* + * 
Find the correct node position in node list of locator information. + * So scan the node list and fetch the position of node. + */ + foreach(item2, locinfo->nodeList) + { + int loc = lfirst_int(item2); + if (loc == nodenum) + break; + nodepos++; + } + + /* + * Then build the WHERE clause for deletion. + * The condition that allows to keep the tuples on remote nodes + * is of the type "RemoteNodeNumber != abs(hash_func(dis_col)) % NumDatanodes". + * The remote Datanode has no knowledge of its position in the cluster so this + * number needs to be compiled locally on Coordinator. + * Taking the absolute value is necessary as hash may return a negative value. + * For hash distributions a condition with correct hash function is used. + * For modulo distribution, well we might need a hash function call but not + * all the time, this is determined implicitly by get_compute_hash_function. + */ + buf2 = makeStringInfo(); + if (hashfuncname) + appendStringInfo(buf2, "%s WHERE abs(%s(%s)) %% %d != %d", + buf->data, hashfuncname, colname, + list_length(locinfo->nodeList), nodepos); + else + appendStringInfo(buf2, "%s WHERE abs(%s) %% %d != %d", buf->data, colname, + list_length(locinfo->nodeList), nodepos); + + /* Then launch this single query */ + distrib_execute_query(buf2->data, IsTempTable(relOid), local_exec_nodes); + + FreeExecNodes(&local_exec_nodes); + pfree(buf2->data); + pfree(buf2); + } + + relation_close(rel, NoLock); + + /* Clean buffers */ + pfree(buf->data); + pfree(buf); +} + + +/* + * makeRedistribState + * Build a distribution state operator + */ +RedistribState * +makeRedistribState(Oid relOid) +{ + RedistribState *res = (RedistribState *) palloc(sizeof(RedistribState)); + res->relid = relOid; + res->commands = NIL; + res->store = NULL; + return res; +} + + +/* + * FreeRedistribState + * Free given distribution state + */ +void +FreeRedistribState(RedistribState *state) +{ + ListCell *item; + + /* Leave if nothing to do */ + if (!state) + return; + + 
foreach(item, state->commands) + FreeRedistribCommand((RedistribCommand *) lfirst(item)); + if (list_length(state->commands) > 0) + list_free(state->commands); + if (state->store) + tuplestore_clear(state->store); +} + +/* + * makeRedistribCommand + * Build a distribution command + */ +RedistribCommand * +makeRedistribCommand(RedistribOperation type, RedistribCatalog updateState, ExecNodes *nodes) +{ + RedistribCommand *res = (RedistribCommand *) palloc0(sizeof(RedistribCommand)); + res->type = type; + res->updateState = updateState; + res->execNodes = nodes; + return res; +} + +/* + * FreeRedistribCommand + * Free given distribution command + */ +void +FreeRedistribCommand(RedistribCommand *command) +{ + ExecNodes *nodes; + /* Leave if nothing to do */ + if (!command) + return; + nodes = command->execNodes; + + if (nodes) + FreeExecNodes(&nodes); + pfree(command); +} + +/* + * distrib_execute_query + * Execute single raw query on given list of nodes + */ +static void +distrib_execute_query(char *sql, bool is_temp, ExecNodes *exec_nodes) +{ + RemoteQuery *step = makeNode(RemoteQuery); + step->combine_type = COMBINE_TYPE_SAME; + step->exec_nodes = exec_nodes; + step->sql_statement = pstrdup(sql); + step->force_autocommit = false; + + /* Redistribution operations only concern Datanodes */ + step->exec_type = EXEC_ON_DATANODES; + step->is_temp = is_temp; + ExecRemoteUtility(step); + pfree(step->sql_statement); + pfree(step); + + /* Be sure to advance the command counter after the last command */ + CommandCounterIncrement(); +} diff --git a/src/backend/pgxc/nodemgr/nodemgr.c b/src/backend/pgxc/nodemgr/nodemgr.c index 8a28486b5c..68b3e91500 100644 --- a/src/backend/pgxc/nodemgr/nodemgr.c +++ b/src/backend/pgxc/nodemgr/nodemgr.c @@ -4,8 +4,7 @@ * Routines to support manipulation of the pgxc_node catalog * Support concerns CREATE/ALTER/DROP on NODE object. 
* - * Copyright (c) 1996-2010, PostgreSQL Global Development Group - * Portions Copyright (c) 2010-2012 Postgres-XC Development Group + * Copyright (c) 2010-2012 Postgres-XC Development Group * *------------------------------------------------------------------------- */ diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 9284d9c99f..7fcbebc30d 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -34,6 +34,7 @@ #include "nodes/nodes.h" #include "nodes/nodeFuncs.h" #include "optimizer/var.h" +#include "pgxc/copyops.h" #include "pgxc/nodemgr.h" #include "pgxc/poolmgr.h" #include "storage/ipc.h" @@ -60,7 +61,7 @@ typedef enum RemoteXactNodeStatus RXACT_NODE_NONE, /* Initial state */ RXACT_NODE_PREPARE_SENT, /* PREPARE request sent */ RXACT_NODE_PREPARE_FAILED, /* PREPARE failed on the node */ - RXACT_NODE_PREPARED, /* PREARED successfully on the node */ + RXACT_NODE_PREPARED, /* PREPARED successfully on the node */ RXACT_NODE_COMMIT_SENT, /* COMMIT sent successfully */ RXACT_NODE_COMMIT_FAILED, /* failed to COMMIT on the node */ RXACT_NODE_COMMITTED, /* COMMITTed successfully on the node */ @@ -293,6 +294,7 @@ CreateResponseCombiner(int node_count, CombineType combine_type) combiner->rowBuffer = NIL; combiner->tapenodes = NULL; combiner->initAggregates = true; + combiner->remoteCopyType = REMOTE_COPY_NONE; combiner->copy_file = NULL; combiner->rqs_cmd_id = FirstCommandId; @@ -576,12 +578,98 @@ HandleCopyDataRow(RemoteQueryState *combiner, char *msg_body, size_t len) /* count the row */ combiner->processed++; - /* If there is a copy file, data has to be sent to the local file */ - if (combiner->copy_file) - /* write data to the copy file */ - fwrite(msg_body, 1, len, combiner->copy_file); - else - pq_putmessage('d', msg_body, len); + /* Output remote COPY operation to correct location */ + switch (combiner->remoteCopyType) + { + case REMOTE_COPY_FILE: + /* Write data directly to file */ + 
fwrite(msg_body, 1, len, combiner->copy_file); + break; + case REMOTE_COPY_STDOUT: + /* Send back data to client */ + pq_putmessage('d', msg_body, len); + break; + case REMOTE_COPY_TUPLESTORE: + { + Datum *values; + bool *nulls; + TupleDesc tupdesc = combiner->tuple_desc; + int i, dropped; + Form_pg_attribute *attr = tupdesc->attrs; + FmgrInfo *in_functions; + Oid *typioparams; + char **fields; + + values = (Datum *) palloc(tupdesc->natts * sizeof(Datum)); + nulls = (bool *) palloc(tupdesc->natts * sizeof(bool)); + in_functions = (FmgrInfo *) palloc(tupdesc->natts * sizeof(FmgrInfo)); + typioparams = (Oid *) palloc(tupdesc->natts * sizeof(Oid)); + + /* Calculate the Oids of input functions */ + for (i = 0; i < tupdesc->natts; i++) + { + Oid in_func_oid; + + /* Do not need any information for dropped attributes */ + if (attr[i]->attisdropped) + continue; + + getTypeInputInfo(attr[i]->atttypid, + &in_func_oid, &typioparams[i]); + fmgr_info(in_func_oid, &in_functions[i]); + } + + /* + * Convert message into an array of fields. + * Last \n is not included in converted message. 
+ */ + fields = CopyOps_RawDataToArrayField(tupdesc, msg_body, len - 1); + + /* Fill in the array values */ + dropped = 0; + for (i = 0; i < tupdesc->natts; i++) + { + char *string = fields[i - dropped]; + /* Do not need any information for dropped attributes */ + if (attr[i]->attisdropped) + { + dropped++; + nulls[i] = true; /* Consider dropped parameter as NULL */ + continue; + } + + /* Find value */ + values[i] = InputFunctionCall(&in_functions[i], + string, + typioparams[i], + attr[i]->atttypmod); + /* Setup value with NULL flag if necessary */ + if (string == NULL) + nulls[i] = true; + else + nulls[i] = false; + } + + /* Then insert the values into tuplestore */ + tuplestore_putvalues(combiner->tuplestorestate, + combiner->tuple_desc, + values, + nulls); + + /* Clean up everything */ + if (*fields) + pfree(*fields); + pfree(fields); + pfree(values); + pfree(nulls); + pfree(in_functions); + pfree(typioparams); + } + break; + case REMOTE_COPY_NONE: + default: + Assert(0); /* Should not happen */ + } } /* @@ -852,7 +940,15 @@ CloseCombiner(RemoteQueryState *combiner) if (combiner->connections) pfree(combiner->connections); if (combiner->tuple_desc) - FreeTupleDesc(combiner->tuple_desc); + { + /* + * In the case of a remote COPY with tuplestore, combiner is not + * responsible from freeing the tuple store. This is done at an upper + * level once data redistribution is completed. 
+ */ + if (combiner->remoteCopyType != REMOTE_COPY_TUPLESTORE) + FreeTupleDesc(combiner->tuple_desc); + } if (combiner->errorMessage) pfree(combiner->errorMessage); if (combiner->errorDetail) @@ -2343,7 +2439,12 @@ DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle** } uint64 -DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, FILE* copy_file) +DataNodeCopyOut(ExecNodes *exec_nodes, + PGXCNodeHandle** copy_connections, + TupleDesc tupleDesc, + FILE* copy_file, + Tuplestorestate *store, + RemoteCopyType remoteCopyType) { RemoteQueryState *combiner; int conn_count = list_length(exec_nodes->nodeList) == 0 ? NumDataNodes : list_length(exec_nodes->nodeList); @@ -2352,9 +2453,19 @@ DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, FILE* combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_SUM); combiner->processed = 0; - /* If there is an existing file where to copy data, pass it to combiner */ - if (copy_file) + combiner->remoteCopyType = remoteCopyType; + + /* + * If there is an existing file where to copy data, + * pass it to combiner when remote COPY output is sent back to file. 
+ */ + if (copy_file && remoteCopyType == REMOTE_COPY_FILE) combiner->copy_file = copy_file; + if (store && remoteCopyType == REMOTE_COPY_TUPLESTORE) + { + combiner->tuplestorestate = store; + combiner->tuple_desc = tupleDesc; + } foreach(nodeitem, exec_nodes->nodeList) { diff --git a/src/include/access/hash.h b/src/include/access/hash.h index bc7006dfb3..777a9369aa 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -358,6 +358,7 @@ extern void hash_desc(StringInfo buf, uint8 xl_info, char *rec); #ifdef PGXC extern Datum compute_hash(Oid type, Datum value, char locator); +extern char *get_compute_hash_function(Oid type, char locator); #endif #endif /* HASH_H */ diff --git a/src/include/catalog/pgxc_class.h b/src/include/catalog/pgxc_class.h index 5a0cd597d3..cb540bf584 100644 --- a/src/include/catalog/pgxc_class.h +++ b/src/include/catalog/pgxc_class.h @@ -22,22 +22,37 @@ CATALOG(pgxc_class,9001) BKI_WITHOUT_OIDS typedef FormData_pgxc_class *Form_pgxc_class; -#define Natts_pgxc_class 6 +#define Natts_pgxc_class 6 -#define Anum_pgxc_class_pcrelid 1 +#define Anum_pgxc_class_pcrelid 1 #define Anum_pgxc_class_pclocatortype 2 -#define Anum_pgxc_class_pcattnum 3 +#define Anum_pgxc_class_pcattnum 3 #define Anum_pgxc_class_pchashalgorithm 4 #define Anum_pgxc_class_pchashbuckets 5 -#define Anum_pgxc_class_nodes 6 +#define Anum_pgxc_class_nodes 6 + +typedef enum PgxcClassAlterType +{ + PGXC_CLASS_ALTER_DISTRIBUTION, + PGXC_CLASS_ALTER_NODES, + PGXC_CLASS_ALTER_ALL +} PgxcClassAlterType; extern void PgxcClassCreate(Oid pcrelid, - char pclocatortype, + char pclocatortype, int pcattnum, int pchashalgorithm, int pchashbuckets, int numnodes, Oid *nodes); +extern void PgxcClassAlter(Oid pcrelid, + char pclocatortype, + int pcattnum, + int pchashalgorithm, + int pchashbuckets, + int numnodes, + Oid *nodes, + PgxcClassAlterType type); extern void RemovePgxcClass(Oid pcrelid); #endif /* PGXC_CLASS_H */ diff --git a/src/include/nodes/parsenodes.h 
b/src/include/nodes/parsenodes.h index e8f2317c1b..8a837b39d5 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -1244,6 +1244,12 @@ typedef enum AlterTableType AT_DropInherit, /* NO INHERIT parent */ AT_AddOf, /* OF <type_name> */ AT_DropOf, /* NOT OF */ +#ifdef PGXC + AT_DistributeBy, /* DISTRIBUTE BY ... */ + AT_SubCluster, /* TO [ NODE nodelist | GROUP groupname ] */ + AT_AddNodeList, /* ADD NODE nodelist */ + AT_DeleteNodeList, /* DELETE NODE nodelist */ +#endif AT_GenericOptions /* OPTIONS (...) */ } AlterTableType; diff --git a/src/include/pgxc/copyops.h b/src/include/pgxc/copyops.h new file mode 100644 index 0000000000..862dbbd299 --- /dev/null +++ b/src/include/pgxc/copyops.h @@ -0,0 +1,27 @@ +/*-------------------------------------------------------------------------- + * + * copyops.h + * Routines for manipulation of remote COPY data + * + * + * Copyright (c) 2010-2012 Postgres-XC Development Group + * + * + * IDENTIFICATION + * src/include/pgxc/copyops.h + * + *------------------------------------------------------------------------- + */ + +#ifndef COPYOPS_H +#define COPYOPS_H + +#include "access/tupdesc.h" + +/* Type of data delimiter used for data redistribution using remote COPY */ +#define COPYOPS_DELIMITER '\t' + +extern char **CopyOps_RawDataToArrayField(TupleDesc tupdesc, char *message, int len); +extern char *CopyOps_BuildOneRowTo(TupleDesc tupdesc, Datum *values, bool *nulls, int *len); + +#endif diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 32a88ecca4..5e26850d1c 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -48,6 +48,17 @@ typedef enum REQUEST_TYPE_COPY_OUT /* Copy Out response */ } RequestType; +/* + * Type of requests associated to a remote COPY OUT + */ +typedef enum +{ + REMOTE_COPY_NONE, /* Not defined yet */ + REMOTE_COPY_STDOUT, /* Send back to client */ + REMOTE_COPY_FILE, /* Write in file */ + REMOTE_COPY_TUPLESTORE /* Store data 
in tuplestore */ +} RemoteCopyType; + /* Combines results of INSERT statements using multiple values */ typedef struct CombineTag { @@ -107,7 +118,8 @@ typedef struct RemoteQueryState /* Simple DISTINCT support */ FmgrInfo *eqfunctions; /* functions to compare tuples */ MemoryContext tmp_ctx; /* separate context is needed to compare tuples */ - FILE *copy_file; /* used if copy_dest == COPY_FILE */ + RemoteCopyType remoteCopyType; /* Type of remote COPY operation */ + FILE *copy_file; /* used if remoteCopyType == REMOTE_COPY_FILE */ uint64 processed; /* count of data rows when running CopyOut */ /* cursor support */ char *cursor; /* cursor name */ @@ -136,7 +148,8 @@ extern void PGXCNodeCommitPrepared(char *gid); /* Copy command just involves Datanodes */ extern PGXCNodeHandle** DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot); extern int DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections); -extern uint64 DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, FILE* copy_file); +extern uint64 DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, TupleDesc tupleDesc, + FILE* copy_file, Tuplestorestate *store, RemoteCopyType remoteCopyType); extern void DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_dn_index, CombineType combine_type); extern bool DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error); extern int DataNodeCopyInBinaryForAll(char *msg_buf, int len, PGXCNodeHandle** copy_connections); diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index bd719911ea..78ce3cff00 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -99,6 +99,7 @@ extern RelationLocInfo *GetRelationLocInfo(Oid relid); extern RelationLocInfo *CopyRelationLocInfo(RelationLocInfo *src_info); extern char GetRelationLocType(Oid relid); extern bool IsTableDistOnPrimary(RelationLocInfo *rel_loc_info); +extern bool 
IsLocatorInfoEqual(RelationLocInfo *rel_loc_info1, RelationLocInfo *rel_loc_info2); extern ExecNodes *GetRelationNodes(RelationLocInfo *rel_loc_info, Datum valueForDistCol, bool isValueNull, Oid typeOfValueForDistCol, RelationAccessType accessType); diff --git a/src/include/pgxc/redistrib.h b/src/include/pgxc/redistrib.h new file mode 100644 index 0000000000..ee94523dbb --- /dev/null +++ b/src/include/pgxc/redistrib.h @@ -0,0 +1,80 @@ +/*------------------------------------------------------------------------- + * + * redistrib.h + * Routines related to online data redistribution + * + * Copyright (c) 2010-2012 Postgres-XC Development Group + * + * + * IDENTIFICATION + * src/include/pgxc/redistrib.h + * + *------------------------------------------------------------------------- + */ + +#ifndef REDISTRIB_H +#define REDISTRIB_H + +#include "nodes/parsenodes.h" +#include "utils/tuplestore.h" + +/* + * Type of data redistribution operations. + * Online data redistribution is made of one or more of those operations. + */ +typedef enum RedistribOperation { + DISTRIB_NONE, /* Default operation */ + DISTRIB_DELETE_HASH, /* Perform a DELETE with hash value check */ + DISTRIB_DELETE_MODULO, /* Perform a DELETE with modulo value check */ + DISTRIB_COPY_TO, /* Perform a COPY TO */ + DISTRIB_COPY_FROM, /* Perform a COPY FROM */ + DISTRIB_TRUNCATE, /* Truncate relation */ + DISTRIB_REINDEX /* Reindex relation */ +} RedistribOperation; + +/* + * Determine if operation can be done before or after + * catalog update on local node. + */ +typedef enum RedistribCatalog { + CATALOG_UPDATE_NONE, /* Default state */ + CATALOG_UPDATE_AFTER, /* After catalog update */ + CATALOG_UPDATE_BEFORE, /* Before catalog update */ + CATALOG_UPDATE_BOTH /* Before and after catalog update */ +} RedistribCatalog; + +/* + * Redistribution command + * This contains the tools necessary to perform a redistribution operation. 
+ */ +typedef struct RedistribCommand { + RedistribOperation type; /* Operation type */ + ExecNodes *execNodes; /* List of nodes where to perform operation */ + RedistribCatalog updateState; /* Flag to determine if operation can be done + * before or after catalog update */ +} RedistribCommand; + +/* + * Redistribution operation state + * Maintainer of redistribution state having the list of commands + * to be performed during redistribution. + * For the list of commands, we use an array and not a simple list as operations + * might need to be done in a certain order. + */ +typedef struct RedistribState { + Oid relid; /* Oid of relation redistributed */ + List *commands; /* List of commands */ + Tuplestorestate *store; /* Tuple store used for temporary data storage */ +} RedistribState; + +extern void PGXCRedistribTable(RedistribState *distribState, RedistribCatalog type); +extern void PGXCRedistribCreateCommandList(RedistribState *distribState, + RelationLocInfo *newLocInfo); +extern RedistribCommand *makeRedistribCommand(RedistribOperation type, + RedistribCatalog updateState, + ExecNodes *nodes); +extern RedistribState *makeRedistribState(Oid relOid); +extern void FreeRedistribState(RedistribState *state); +extern void FreeRedistribCommand(RedistribCommand *command); + +#endif /* REDISTRIB_H */ diff --git a/src/include/pgxc/remotecopy.h b/src/include/pgxc/remotecopy.h index 77134e71f9..93368c0ada 100644 --- a/src/include/pgxc/remotecopy.h +++ b/src/include/pgxc/remotecopy.h @@ -70,6 +70,7 @@ extern void RemoteCopy_BuildStatement(RemoteCopyData *state, extern void RemoteCopy_GetRelationLoc(RemoteCopyData *state, Relation rel, List *attnums); +extern RemoteCopyOptions *makeRemoteCopyOptions(void); extern void FreeRemoteCopyData(RemoteCopyData *state); extern void FreeRemoteCopyOptions(RemoteCopyOptions *options); #endif diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index fde1467185..4eaabe6592 100644 --- a/src/include/utils/rel.h +++ 
b/src/include/utils/rel.h @@ -365,6 +365,14 @@ typedef struct StdRdOptions #define RelationUsesTempNamespace(relation) \ ((relation)->rd_rel->relpersistence == RELPERSISTENCE_TEMP) +#ifdef PGXC +/* + * RelationGetLocInfo + * Return the location info of relation + */ +#define RelationGetLocInfo(relation) ((relation)->rd_locator_info) +#endif + /* * RELATION_IS_LOCAL * If a rel is either temp or newly created in the current transaction, diff --git a/src/test/regress/expected/xc_alter_table.out b/src/test/regress/expected/xc_alter_table.out index a798e2f8a8..ca50a710bc 100644 --- a/src/test/regress/expected/xc_alter_table.out +++ b/src/test/regress/expected/xc_alter_table.out @@ -211,3 +211,411 @@ SELECT a, a2, b, c FROM xc_alter_table_2 ORDER BY b; (5 rows) DROP TABLE xc_alter_table_2; +-- Tests for ALTER TABLE redistribution +-- In the following test, a table is redistributed in all the ways possible +-- and effects of redistribution is checked on all the dependent objects +-- Table with integers +CREATE TABLE xc_alter_table_3 (a int, b varchar(10)) DISTRIBUTE BY HASH(a); +INSERT INTO xc_alter_table_3 VALUES (0, NULL); +INSERT INTO xc_alter_table_3 VALUES (1, 'a'); +INSERT INTO xc_alter_table_3 VALUES (2, 'aa'); +INSERT INTO xc_alter_table_3 VALUES (3, 'aaa'); +INSERT INTO xc_alter_table_3 VALUES (4, 'aaaa'); +INSERT INTO xc_alter_table_3 VALUES (5, 'aaaaa'); +INSERT INTO xc_alter_table_3 VALUES (6, 'aaaaaa'); +INSERT INTO xc_alter_table_3 VALUES (7, 'aaaaaaa'); +INSERT INTO xc_alter_table_3 VALUES (8, 'aaaaaaaa'); +INSERT INTO xc_alter_table_3 VALUES (9, 'aaaaaaaaa'); +INSERT INTO xc_alter_table_3 VALUES (10, 'aaaaaaaaaa'); +-- Create some objects to check the effect of redistribution +CREATE VIEW xc_alter_table_3_v AS SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; +CREATE RULE xc_alter_table_3_insert AS ON UPDATE TO xc_alter_table_3 WHERE OLD.a = 11 DO INSERT INTO xc_alter_table_3 VALUES (OLD.a + 1, 'nnn'); +PREPARE xc_alter_table_insert AS INSERT INTO 
xc_alter_table_3 VALUES ($1, $2); +PREPARE xc_alter_table_delete AS DELETE FROM xc_alter_table_3 WHERE a = $1; +PREPARE xc_alter_table_update AS UPDATE xc_alter_table_3 SET b = $2 WHERE a = $1; +-- Now begin the tests +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +SELECT * FROM xc_alter_table_3_v; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +EXECUTE xc_alter_table_insert(11, 'b'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; + b +--- + b +(1 row) + +EXECUTE xc_alter_table_update(11, 'bb'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; + b +---- + bb +(1 row) + +EXECUTE xc_alter_table_delete(11); +SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12; + b +----- + nnn +(1 row) + +EXECUTE xc_alter_table_delete(12); +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(b); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +SELECT * FROM xc_alter_table_3_v; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +EXECUTE xc_alter_table_insert(11, 'b'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; + b +--- + b +(1 row) + +EXECUTE xc_alter_table_update(11, 'bb'); +ERROR: Partition column can't be updated in current version +SELECT b FROM xc_alter_table_3 WHERE a = 11; + b +--- + b +(1 row) + +EXECUTE xc_alter_table_delete(11); +SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12; + b +--- +(0 rows) + +EXECUTE xc_alter_table_delete(12); +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY ROUND ROBIN; +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 
row) + +SELECT * FROM xc_alter_table_3_v; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +EXECUTE xc_alter_table_insert(11, 'b'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; + b +--- + b +(1 row) + +EXECUTE xc_alter_table_update(11, 'bb'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; + b +---- + bb +(1 row) + +EXECUTE xc_alter_table_delete(11); +SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12; + b +----- + nnn +(1 row) + +EXECUTE xc_alter_table_delete(12); +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY MODULO(a); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +SELECT * FROM xc_alter_table_3_v; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +EXECUTE xc_alter_table_insert(11, 'b'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; + b +--- + b +(1 row) + +EXECUTE xc_alter_table_update(11, 'bb'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; + b +---- + bb +(1 row) + +EXECUTE xc_alter_table_delete(11); +SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12; + b +----- + nnn +(1 row) + +EXECUTE xc_alter_table_delete(12); +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY MODULO(b); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +SELECT * FROM xc_alter_table_3_v; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +EXECUTE xc_alter_table_insert(11, 'b'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; + b +--- + b +(1 row) + +EXECUTE xc_alter_table_update(11, 'bb'); +ERROR: Partition column can't be updated in current version +SELECT b FROM xc_alter_table_3 WHERE a = 11; + b +--- + b +(1 row) + +EXECUTE xc_alter_table_delete(11); +SELECT b FROM xc_alter_table_3 WHERE a 
= 11 or a = 12; + b +--- +(0 rows) + +EXECUTE xc_alter_table_delete(12); +-- Index and redistribution +CREATE INDEX xc_alter_table_3_index ON xc_alter_table_3(a); +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +SELECT * FROM xc_alter_table_3_v; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +EXECUTE xc_alter_table_insert(11, 'b'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; + b +--- + b +(1 row) + +EXECUTE xc_alter_table_update(11, 'bb'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; + b +---- + bb +(1 row) + +EXECUTE xc_alter_table_delete(11); +SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12; + b +----- + nnn +(1 row) + +EXECUTE xc_alter_table_delete(12); +-- Add column on table +ALTER TABLE xc_alter_table_3 ADD COLUMN c int DEFAULT 4; +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY REPLICATION; +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +SELECT * FROM xc_alter_table_3_v; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +-- Drop column on table +ALTER TABLE xc_alter_table_3 DROP COLUMN b; +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +SELECT * FROM xc_alter_table_3_v; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +-- Remanipulate table once again and distribute on old column +ALTER TABLE xc_alter_table_3 DROP COLUMN c; +ALTER TABLE xc_alter_table_3 ADD COLUMN b varchar(3) default 'aaa'; +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a); +SELECT count(*), 
sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +SELECT * FROM xc_alter_table_3_v; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +-- Change the node list +SELECT alter_table_change_nodes('xc_alter_table_3', '{1}', 'to', NULL); + alter_table_change_nodes +-------------------------- + t +(1 row) + +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +SELECT * FROM xc_alter_table_3_v; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +-- Add some nodes on it +SELECT alter_table_change_nodes('xc_alter_table_3', '{2,4,5}', 'add', NULL); + alter_table_change_nodes +-------------------------- + t +(1 row) + +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +SELECT * FROM xc_alter_table_3_v; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +-- Remove some nodes on it +SELECT alter_table_change_nodes('xc_alter_table_3', '{3}', 'add', NULL); + alter_table_change_nodes +-------------------------- + t +(1 row) + +SELECT alter_table_change_nodes('xc_alter_table_3', '{2,3,5}', 'delete', NULL); + alter_table_change_nodes +-------------------------- + t +(1 row) + +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +SELECT * FROM xc_alter_table_3_v; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +-- Multiple operations with replication +SELECT alter_table_change_nodes('xc_alter_table_3', 
'{1,3,4,5}', 'to', 'replication'); + alter_table_change_nodes +-------------------------- + t +(1 row) + +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +SELECT * FROM xc_alter_table_3_v; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +-- Manipulate number of nodes to include and remove nodes on a replicated table +-- On removed nodes data is deleted and on new nodes data is added +SELECT alter_table_change_nodes('xc_alter_table_3', '{2,3,5}', 'to', NULL); + alter_table_change_nodes +-------------------------- + t +(1 row) + +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +SELECT * FROM xc_alter_table_3_v; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +-- Re-do a double operation with hash this time +SELECT alter_table_change_nodes('xc_alter_table_3', '{2}', 'delete', 'hash(a)'); + alter_table_change_nodes +-------------------------- + t +(1 row) + +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +SELECT * FROM xc_alter_table_3_v; + count | sum | avg +-------+-----+-------------------- + 11 | 55 | 5.0000000000000000 +(1 row) + +-- Error checks +ALTER TABLE xc_alter_table_3 ADD COLUMN b int, DISTRIBUTE BY HASH(a); +ERROR: Incompatible operation with data redistribution +-- Clean up +DROP TABLE xc_alter_table_3 CASCADE; +NOTICE: drop cascades to view xc_alter_table_3_v diff --git a/src/test/regress/expected/xc_create_function.out b/src/test/regress/expected/xc_create_function.out index 41bbcdd256..64d7198513 100644 --- a/src/test/regress/expected/xc_create_function.out +++ 
b/src/test/regress/expected/xc_create_function.out @@ -43,6 +43,90 @@ begin execute cr_command; end; $$; +-- Add/Delete/change node list of a table +CREATE OR REPLACE FUNCTION alter_table_change_nodes(tab_schema varchar, nodenums int[], command varchar, distribution varchar) +RETURNS BOOLEAN LANGUAGE plpgsql as $$ +declare + cr_command varchar; + nodes varchar[]; + nodename varchar; + nodenames_query varchar; + nodenames varchar; + sep varchar; + nodenum_new int[]; + nodenum_res int[]; + tmp_node int; + num_nodes int; + node int; + check_num boolean; + enforce_to boolean; +BEGIN + -- Check the command type, only delete/add/to are allowed + IF command != 'delete' AND command != 'add' AND command != 'to' THEN + RETURN FALSE; + END IF; + nodenames_query := 'SELECT node_name FROM pgxc_node WHERE node_type = ''D'''; + FOR nodename IN EXECUTE nodenames_query LOOP + nodes := array_append(nodes, nodename); + END LOOP; + nodenames := '('; + sep := ''; + num_nodes := array_length(nodes, 1); + enforce_to := FALSE; + + -- Adjust node array according to total number of nodes + FOREACH node IN ARRAY nodenums LOOP + tmp_node := node; + IF (node < 1 OR node > num_nodes) THEN + -- Enforce the usage of TO here, only safe method + enforce_to := TRUE; + tmp_node := node % num_nodes; + nodenum_new := array_append(nodenum_new, tmp_node); + END IF; + nodenum_new := array_append(nodenum_new, tmp_node); + END LOOP; + -- Eliminate duplicates + nodenum_res := array_append(nodenum_res, nodenum_new[1]); + FOREACH node IN ARRAY nodenum_new LOOP + check_num := TRUE; + FOREACH tmp_node IN ARRAY nodenum_res LOOP + IF (tmp_node = node) THEN + check_num := FALSE; + END IF; + END LOOP; + -- Fill in result array only if not replicated + IF check_num THEN + nodenum_res := array_append(nodenum_res, node); + END IF; + END LOOP; + + -- If there is a unique Datanode in cluster, enforce the use of 'TO NODE' + -- This will avoid any consistency problems + IF (num_nodes = 1 OR enforce_to) THEN + command := 
'TO'; + END IF; + + -- Finally build query + cr_command := 'ALTER TABLE ' || tab_schema || ' ' || command || ' NODE '; + FOREACH node IN ARRAY nodenum_res LOOP + IF (node > 0 AND node <= num_nodes) THEN + nodenames := nodenames || sep || nodes[node]; + sep := ', '; + END IF; + END LOOP; + nodenames := nodenames || ')'; + cr_command := cr_command || nodenames; + + -- Add distribution if necessary + IF (distribution IS NOT NULL) then + cr_command := cr_command || ', DISTRIBUTE BY ' || distribution; + END IF; + + -- Launch it + EXECUTE cr_command; + RETURN TRUE; +END; +$$; -- A function to return data node name given a node number CREATE OR REPLACE FUNCTION get_xc_node_name(node_num int) RETURNS varchar LANGUAGE plpgsql AS $$ DECLARE diff --git a/src/test/regress/sql/xc_alter_table.sql b/src/test/regress/sql/xc_alter_table.sql index bfa76fc848..5f78deba77 100644 --- a/src/test/regress/sql/xc_alter_table.sql +++ b/src/test/regress/sql/xc_alter_table.sql @@ -57,3 +57,136 @@ EXPLAIN (VERBOSE true, COSTS false, NODES false) UPDATE xc_alter_table_2 SET a = UPDATE xc_alter_table_2 SET a = 200, a2 = 'CTO' WHERE b = 'John'; SELECT a, a2, b, c FROM xc_alter_table_2 ORDER BY b; DROP TABLE xc_alter_table_2; + +-- Tests for ALTER TABLE redistribution +-- In the following test, a table is redistributed in all the ways possible +-- and effects of redistribution is checked on all the dependent objects +-- Table with integers +CREATE TABLE xc_alter_table_3 (a int, b varchar(10)) DISTRIBUTE BY HASH(a); +INSERT INTO xc_alter_table_3 VALUES (0, NULL); +INSERT INTO xc_alter_table_3 VALUES (1, 'a'); +INSERT INTO xc_alter_table_3 VALUES (2, 'aa'); +INSERT INTO xc_alter_table_3 VALUES (3, 'aaa'); +INSERT INTO xc_alter_table_3 VALUES (4, 'aaaa'); +INSERT INTO xc_alter_table_3 VALUES (5, 'aaaaa'); +INSERT INTO xc_alter_table_3 VALUES (6, 'aaaaaa'); +INSERT INTO xc_alter_table_3 VALUES (7, 'aaaaaaa'); +INSERT INTO xc_alter_table_3 VALUES (8, 'aaaaaaaa'); +INSERT INTO xc_alter_table_3 VALUES 
(9, 'aaaaaaaaa'); +INSERT INTO xc_alter_table_3 VALUES (10, 'aaaaaaaaaa'); +-- Create some objects to check the effect of redistribution +CREATE VIEW xc_alter_table_3_v AS SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; +CREATE RULE xc_alter_table_3_insert AS ON UPDATE TO xc_alter_table_3 WHERE OLD.a = 11 DO INSERT INTO xc_alter_table_3 VALUES (OLD.a + 1, 'nnn'); +PREPARE xc_alter_table_insert AS INSERT INTO xc_alter_table_3 VALUES ($1, $2); +PREPARE xc_alter_table_delete AS DELETE FROM xc_alter_table_3 WHERE a = $1; +PREPARE xc_alter_table_update AS UPDATE xc_alter_table_3 SET b = $2 WHERE a = $1; + +-- Now begin the tests +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence +SELECT * FROM xc_alter_table_3_v; +EXECUTE xc_alter_table_insert(11, 'b'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; +EXECUTE xc_alter_table_update(11, 'bb'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; +EXECUTE xc_alter_table_delete(11); +SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12; +EXECUTE xc_alter_table_delete(12); +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(b); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence +SELECT * FROM xc_alter_table_3_v; +EXECUTE xc_alter_table_insert(11, 'b'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; +EXECUTE xc_alter_table_update(11, 'bb'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; +EXECUTE xc_alter_table_delete(11); +SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12; +EXECUTE xc_alter_table_delete(12); +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY ROUND ROBIN; +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence +SELECT * FROM xc_alter_table_3_v; +EXECUTE xc_alter_table_insert(11, 'b'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; +EXECUTE xc_alter_table_update(11, 'bb'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; +EXECUTE xc_alter_table_delete(11); +SELECT b FROM 
xc_alter_table_3 WHERE a = 11 or a = 12; +EXECUTE xc_alter_table_delete(12); +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY MODULO(a); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence +SELECT * FROM xc_alter_table_3_v; +EXECUTE xc_alter_table_insert(11, 'b'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; +EXECUTE xc_alter_table_update(11, 'bb'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; +EXECUTE xc_alter_table_delete(11); +SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12; +EXECUTE xc_alter_table_delete(12); +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY MODULO(b); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence +SELECT * FROM xc_alter_table_3_v; +EXECUTE xc_alter_table_insert(11, 'b'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; +EXECUTE xc_alter_table_update(11, 'bb'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; +EXECUTE xc_alter_table_delete(11); +SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12; +EXECUTE xc_alter_table_delete(12); +-- Index and redistribution +CREATE INDEX xc_alter_table_3_index ON xc_alter_table_3(a); +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence +SELECT * FROM xc_alter_table_3_v; +EXECUTE xc_alter_table_insert(11, 'b'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; +EXECUTE xc_alter_table_update(11, 'bb'); +SELECT b FROM xc_alter_table_3 WHERE a = 11; +EXECUTE xc_alter_table_delete(11); +SELECT b FROM xc_alter_table_3 WHERE a = 11 or a = 12; +EXECUTE xc_alter_table_delete(12); +-- Add column on table +ALTER TABLE xc_alter_table_3 ADD COLUMN c int DEFAULT 4; +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY REPLICATION; +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; +SELECT * FROM xc_alter_table_3_v; +-- Drop column on table +ALTER TABLE xc_alter_table_3 DROP COLUMN b; +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a); +SELECT count(*), sum(a), avg(a) FROM 
xc_alter_table_3; +SELECT * FROM xc_alter_table_3_v; +-- Remanipulate table once again and distribute on old column +ALTER TABLE xc_alter_table_3 DROP COLUMN c; +ALTER TABLE xc_alter_table_3 ADD COLUMN b varchar(3) default 'aaa'; +ALTER TABLE xc_alter_table_3 DISTRIBUTE BY HASH(a); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence +SELECT * FROM xc_alter_table_3_v; +-- Change the node list +SELECT alter_table_change_nodes('xc_alter_table_3', '{1}', 'to', NULL); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence +SELECT * FROM xc_alter_table_3_v; +-- Add some nodes on it +SELECT alter_table_change_nodes('xc_alter_table_3', '{2,4,5}', 'add', NULL); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence +SELECT * FROM xc_alter_table_3_v; +-- Remove some nodes on it +SELECT alter_table_change_nodes('xc_alter_table_3', '{3}', 'add', NULL); +SELECT alter_table_change_nodes('xc_alter_table_3', '{2,3,5}', 'delete', NULL); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence +SELECT * FROM xc_alter_table_3_v; +-- Multiple operations with replication +SELECT alter_table_change_nodes('xc_alter_table_3', '{1,3,4,5}', 'to', 'replication'); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence +SELECT * FROM xc_alter_table_3_v; +-- Manipulate number of nodes to include and remove nodes on a replicated table +-- On removed nodes data is deleted and on new nodes data is added +SELECT alter_table_change_nodes('xc_alter_table_3', '{2,3,5}', 'to', NULL); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence +SELECT * FROM xc_alter_table_3_v; +-- Re-do a double operation with hash this time +SELECT alter_table_change_nodes('xc_alter_table_3', '{2}', 'delete', 'hash(a)'); +SELECT count(*), sum(a), avg(a) FROM xc_alter_table_3; -- Check on tuple presence +SELECT * FROM xc_alter_table_3_v; +-- Error checks 
+ALTER TABLE xc_alter_table_3 ADD COLUMN b int, DISTRIBUTE BY HASH(a); +-- Clean up +DROP TABLE xc_alter_table_3 CASCADE; diff --git a/src/test/regress/sql/xc_create_function.sql b/src/test/regress/sql/xc_create_function.sql index 1c8e2350eb..bd7ad3c8b8 100644 --- a/src/test/regress/sql/xc_create_function.sql +++ b/src/test/regress/sql/xc_create_function.sql @@ -45,6 +45,91 @@ begin end; $$; +-- Add/Delete/change node list of a table +CREATE OR REPLACE FUNCTION alter_table_change_nodes(tab_schema varchar, nodenums int[], command varchar, distribution varchar) +RETURNS BOOLEAN LANGUAGE plpgsql as $$ +declare + cr_command varchar; + nodes varchar[]; + nodename varchar; + nodenames_query varchar; + nodenames varchar; + sep varchar; + nodenum_new int[]; + nodenum_res int[]; + tmp_node int; + num_nodes int; + node int; + check_num boolean; + enforce_to boolean; +BEGIN + -- Check the command type, only delete/add/to are allowed + IF command != 'delete' AND command != 'add' AND command != 'to' THEN + RETURN FALSE; + END IF; + nodenames_query := 'SELECT node_name FROM pgxc_node WHERE node_type = ''D'''; + FOR nodename IN EXECUTE nodenames_query LOOP + nodes := array_append(nodes, nodename); + END LOOP; + nodenames := '('; + sep := ''; + num_nodes := array_length(nodes, 1); + enforce_to := FALSE; + + -- Adjust node array according to total number of nodes + FOREACH node IN ARRAY nodenums LOOP + tmp_node := node; + IF (node < 1 OR node > num_nodes) THEN + -- Enforce the usage of TO here, only safe method + enforce_to := TRUE; + tmp_node := node % num_nodes; + nodenum_new := array_append(nodenum_new, tmp_node); + END IF; + nodenum_new := array_append(nodenum_new, tmp_node); + END LOOP; + -- Eliminate duplicates + nodenum_res := array_append(nodenum_res, nodenum_new[1]); + FOREACH node IN ARRAY nodenum_new LOOP + check_num := TRUE; + FOREACH tmp_node IN ARRAY nodenum_res LOOP + IF (tmp_node = node) THEN + check_num := FALSE; + END IF; + END LOOP; + -- Fill in result array only 
if not replicated + IF check_num THEN + nodenum_res := array_append(nodenum_res, node); + END IF; + END LOOP; + + -- If there is a unique Datanode in cluster, enforce the use of 'TO NODE' + -- This will avoid any consistency problems + IF (num_nodes = 1 OR enforce_to) THEN + command := 'TO'; + END IF; + + -- Finally build query + cr_command := 'ALTER TABLE ' || tab_schema || ' ' || command || ' NODE '; + FOREACH node IN ARRAY nodenum_res LOOP + IF (node > 0 AND node <= num_nodes) THEN + nodenames := nodenames || sep || nodes[node]; + sep := ', '; + END IF; + END LOOP; + nodenames := nodenames || ')'; + cr_command := cr_command || nodenames; + + -- Add distribution if necessary + IF (distribution IS NOT NULL) then + cr_command := cr_command || ', DISTRIBUTE BY ' || distribution; + END IF; + + -- Launch it + EXECUTE cr_command; + RETURN TRUE; +END; +$$; + -- A function to return data node name given a node number CREATE OR REPLACE FUNCTION get_xc_node_name(node_num int) RETURNS varchar LANGUAGE plpgsql AS $$ DECLARE |