You can subscribe to this list here.
2010 |
Jan
|
Feb
|
Mar
|
Apr
(4) |
May
(28) |
Jun
(12) |
Jul
(11) |
Aug
(12) |
Sep
(5) |
Oct
(19) |
Nov
(14) |
Dec
(12) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2011 |
Jan
(18) |
Feb
(30) |
Mar
(115) |
Apr
(89) |
May
(50) |
Jun
(44) |
Jul
(22) |
Aug
(13) |
Sep
(11) |
Oct
(30) |
Nov
(28) |
Dec
(39) |
2012 |
Jan
(38) |
Feb
(18) |
Mar
(43) |
Apr
(91) |
May
(108) |
Jun
(46) |
Jul
(37) |
Aug
(44) |
Sep
(33) |
Oct
(29) |
Nov
(36) |
Dec
(15) |
2013 |
Jan
(35) |
Feb
(611) |
Mar
(5) |
Apr
(55) |
May
(30) |
Jun
(28) |
Jul
(458) |
Aug
(34) |
Sep
(9) |
Oct
(39) |
Nov
(22) |
Dec
(32) |
2014 |
Jan
(16) |
Feb
(16) |
Mar
(42) |
Apr
(179) |
May
(7) |
Jun
(6) |
Jul
(9) |
Aug
|
Sep
(4) |
Oct
|
Nov
(3) |
Dec
|
2015 |
Jan
|
Feb
|
Mar
|
Apr
(2) |
May
(4) |
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
S | M | T | W | T | F | S |
---|---|---|---|---|---|---|
|
|
|
1
|
2
|
3
|
4
|
5
|
6
(1) |
7
|
8
|
9
|
10
|
11
|
12
|
13
|
14
(1) |
15
|
16
|
17
|
18
|
19
|
20
|
21
|
22
(1) |
23
|
24
|
25
|
26
(1) |
27
(1) |
28
|
29
|
30
|
|
|
From: mason_s <ma...@us...> - 2010-09-27 05:18:13
|
Project "Postgres-XC". The branch, master has been updated via e4978385ac1e81be3b95fe51656a0a166cfc22fb (commit) from c3e87d496dbf75651197f03b36d1cf0ba4ea7f0c (commit) - Log ----------------------------------------------------------------- commit e4978385ac1e81be3b95fe51656a0a166cfc22fb Author: Mason Sharp <ma...@us...> Date: Mon Sep 27 14:10:48 2010 +0900 Handle stored functions in queries. If the stored function is IMMUTABLE and appears in a query, it can be safely executed on the data nodes and is pushed down. Otherwise, the stored function must be executed on the coordinator. Note that stored functions cannot yet contain queries that use passed in parameters until we add support for prepared statements with parameters (planned to be done within the next few months). diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index a7bc0ab..a88179b 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -33,6 +33,7 @@ #include "parser/parse_coerce.h" #include "pgxc/locator.h" #include "pgxc/planner.h" +#include "tcop/pquery.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/fmgroids.h" @@ -139,7 +140,6 @@ bool StrictStatementChecking = true; /* Forbid multi-node SELECT statements with an ORDER BY clause */ bool StrictSelectChecking = false; - static ExecNodes *get_plan_nodes(Query *query, bool isRead); static bool get_plan_nodes_walker(Node *query_node, XCWalkerContext *context); static bool examine_conditions_walker(Node *expr_node, XCWalkerContext *context); @@ -507,8 +507,9 @@ get_plan_nodes_insert(Query *query) * Get list of parent-child joins (partitioned together) * Get list of joins with replicated tables * - * If we encounter a cross-node join, we stop processing and return false, - * otherwise true. + * If we encounter an expression such as a cross-node join that cannot + * be easily handled in a single step, we stop processing and return true, + * otherwise false. 
* */ static bool @@ -780,6 +781,13 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) } } + /* See if the function is immutable, otherwise give up */ + if (IsA(expr_node, FuncExpr)) + { + if (!is_immutable_func(((FuncExpr*) expr_node)->funcid)) + return true; + } + /* Handle subquery */ if (IsA(expr_node, SubLink)) { @@ -2088,12 +2096,11 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) result->canSetTag = query->canSetTag; result->utilityStmt = query->utilityStmt; result->intoClause = query->intoClause; - result->rtable = query->rtable; query_step = makeNode(RemoteQuery); - query_step->is_single_step = false; + /* * Declare Cursor case: * We should leave as a step query only SELECT statement @@ -2210,6 +2217,13 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) return result; } + /* + * If there already is an active portal, we may be doing planning within a function. + * Just use the standard plan + */ + if (ActivePortal) + return standard_planner(query, cursorOptions, boundParams); + query_step->is_single_step = true; /* * PGXCTODO diff --git a/src/backend/pgxc/pool/postgresql_fdw.c b/src/backend/pgxc/pool/postgresql_fdw.c index 9e418be..dabf5da 100644 --- a/src/backend/pgxc/pool/postgresql_fdw.c +++ b/src/backend/pgxc/pool/postgresql_fdw.c @@ -44,7 +44,7 @@ /* deparse SQL from the request */ -static bool is_immutable_func(Oid funcid); +bool is_immutable_func(Oid funcid); static bool is_foreign_qual(ExprState *state); static bool foreign_qual_walker(Node *node, void *context); char *deparseSql(RemoteQueryState *scanstate); @@ -53,7 +53,7 @@ char *deparseSql(RemoteQueryState *scanstate); /* * Check whether the function is IMMUTABLE. 
*/ -static bool +bool is_immutable_func(Oid funcid) { HeapTuple tp; diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index 548e4cd..d2bac5a 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -148,4 +148,5 @@ extern PlannedStmt *pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams); extern bool IsHashDistributable(Oid col_type); +extern bool is_immutable_func(Oid funcid); #endif /* PGXCPLANNER_H */ ----------------------------------------------------------------------- Summary of changes: src/backend/pgxc/plan/planner.c | 24 +++++++++++++++++++----- src/backend/pgxc/pool/postgresql_fdw.c | 4 ++-- src/include/pgxc/planner.h | 1 + 3 files changed, 22 insertions(+), 7 deletions(-) hooks/post-receive -- Postgres-XC |
From: mason_s <ma...@us...> - 2010-09-26 07:51:36
|
Project "Postgres-XC". The branch, master has been updated via c3e87d496dbf75651197f03b36d1cf0ba4ea7f0c (commit) from ac66a8c598dfc601e64df04dba73dc6d99f78272 (commit) - Log ----------------------------------------------------------------- commit c3e87d496dbf75651197f03b36d1cf0ba4ea7f0c Author: Mason Sharp <ma...@us...> Date: Sun Sep 26 16:47:36 2010 +0900 Initial support for cursors (DECLARE, FETCH). This initial version implements support by creating them on the Coordinator only; they are not created on the data nodes. Not yet supported is UPDATE / DELETE WHERE CURRENT OF, but basic read-only cursor capability works, including SCROLL cursors. Written by Andrei Martsinchyk diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index eab1bd0..63031a7 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -1194,8 +1194,17 @@ slot_deform_datarow(TupleTableSlot *slot) errmsg("Tuple does not match the descriptor"))); if (slot->tts_attinmeta == NULL) + { + /* + * Ensure info about input functions is available as long as slot lives + */ + MemoryContext oldcontext = MemoryContextSwitchTo(slot->tts_mcxt); + slot->tts_attinmeta = TupleDescGetAttInMetadata(slot->tts_tupleDescriptor); + MemoryContextSwitchTo(oldcontext); + } + buffer = makeStringInfo(); for (i = 0; i < attnum; i++) { diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 657413a..772a6f7 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -847,7 +847,7 @@ DoCopy(const CopyStmt *stmt, const char *queryString) int num_phys_attrs; uint64 processed; #ifdef PGXC - Exec_Nodes *exec_nodes = NULL; + ExecNodes *exec_nodes = NULL; #endif /* Allocate workspace and zero all fields */ @@ -1138,7 +1138,7 @@ DoCopy(const CopyStmt *stmt, const char *queryString) { char *hash_att; - exec_nodes = (Exec_Nodes *) palloc0(sizeof(Exec_Nodes)); + exec_nodes = makeNode(ExecNodes); /* * If target table does 
not exists on nodes (e.g. system table) diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 151fe33..c58e2a0 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -25,6 +25,10 @@ #include "nodes/plannodes.h" #include "nodes/relation.h" +#ifdef PGXC +#include "pgxc/locator.h" +#include "pgxc/planner.h" +#endif #include "utils/datum.h" @@ -809,6 +813,124 @@ _copyPlanInvalItem(PlanInvalItem *from) return newnode; } +#ifdef PGXC +/* + * _copyRemoteQuery + */ +static RemoteQuery * +_copyRemoteQuery(RemoteQuery *from) +{ + RemoteQuery *newnode = makeNode(RemoteQuery); + + /* + * copy node superclass fields + */ + CopyScanFields((Scan *) from, (Scan *) newnode); + + /* + * copy remainder of node + */ + COPY_SCALAR_FIELD(is_single_step); + COPY_STRING_FIELD(sql_statement); + COPY_NODE_FIELD(exec_nodes); + COPY_SCALAR_FIELD(combine_type); + COPY_NODE_FIELD(simple_aggregates); + COPY_NODE_FIELD(sort); + COPY_NODE_FIELD(distinct); + COPY_SCALAR_FIELD(read_only); + COPY_SCALAR_FIELD(force_autocommit); + + return newnode; +} + +/* + * _copyExecNodes + */ +static ExecNodes * +_copyExecNodes(ExecNodes *from) +{ + ExecNodes *newnode = makeNode(ExecNodes); + + COPY_NODE_FIELD(primarynodelist); + COPY_NODE_FIELD(nodelist); + COPY_SCALAR_FIELD(baselocatortype); + COPY_SCALAR_FIELD(tableusagetype); + + return newnode; +} + +/* + * _copySimpleAgg + */ +static SimpleAgg * +_copySimpleAgg(SimpleAgg *from) +{ + SimpleAgg *newnode = makeNode(SimpleAgg); + + COPY_SCALAR_FIELD(column_pos); + COPY_NODE_FIELD(aggref); + COPY_SCALAR_FIELD(transfn_oid); + COPY_SCALAR_FIELD(finalfn_oid); + COPY_SCALAR_FIELD(arginputfn); + COPY_SCALAR_FIELD(argioparam); + COPY_SCALAR_FIELD(resoutputfn); + COPY_SCALAR_FIELD(transfn); + COPY_SCALAR_FIELD(finalfn); + if (!from->initValueIsNull) + newnode->initValue = datumCopy(from->initValue, from->transtypeByVal, + from->transtypeLen); + COPY_SCALAR_FIELD(initValueIsNull); + COPY_SCALAR_FIELD(inputtypeLen); + 
COPY_SCALAR_FIELD(resulttypeLen); + COPY_SCALAR_FIELD(transtypeLen); + COPY_SCALAR_FIELD(inputtypeByVal); + COPY_SCALAR_FIELD(resulttypeByVal); + COPY_SCALAR_FIELD(transtypeByVal); + /* No need to copy runtime info, just init */ + newnode->collectValueNull = true; + initStringInfo(&newnode->valuebuf); + + return newnode; +} + +/* + * _copySimpleSort + */ +static SimpleSort * +_copySimpleSort(SimpleSort *from) +{ + SimpleSort *newnode = makeNode(SimpleSort); + + COPY_SCALAR_FIELD(numCols); + if (from->numCols > 0) + { + COPY_POINTER_FIELD(sortColIdx, from->numCols * sizeof(AttrNumber)); + COPY_POINTER_FIELD(sortOperators, from->numCols * sizeof(Oid)); + COPY_POINTER_FIELD(nullsFirst, from->numCols * sizeof(bool)); + } + + return newnode; +} + +/* + * _copySimpleDistinct + */ +static SimpleDistinct * +_copySimpleDistinct(SimpleDistinct *from) +{ + SimpleDistinct *newnode = makeNode(SimpleDistinct); + + COPY_SCALAR_FIELD(numCols); + if (from->numCols > 0) + { + COPY_POINTER_FIELD(uniqColIdx, from->numCols * sizeof(AttrNumber)); + COPY_POINTER_FIELD(eqOperators, from->numCols * sizeof(Oid)); + } + + return newnode; +} +#endif + /* **************************************************************** * primnodes.h copy functions * **************************************************************** @@ -3554,7 +3676,26 @@ copyObject(void *from) case T_PlanInvalItem: retval = _copyPlanInvalItem(from); break; - +#ifdef PGXC + /* + * PGXC SPECIFIC NODES + */ + case T_RemoteQuery: + retval = _copyRemoteQuery(from); + break; + case T_ExecNodes: + retval = _copyExecNodes(from); + break; + case T_SimpleAgg: + retval = _copySimpleAgg(from); + break; + case T_SimpleSort: + retval = _copySimpleSort(from); + break; + case T_SimpleDistinct: + retval = _copySimpleDistinct(from); + break; +#endif /* * PRIMITIVE NODES */ diff --git a/src/backend/pgxc/locator/locator.c b/src/backend/pgxc/locator/locator.c index 63c6359..debbc77 100644 --- a/src/backend/pgxc/locator/locator.c +++ 
b/src/backend/pgxc/locator/locator.c @@ -279,18 +279,18 @@ GetRoundRobinNode(Oid relid) * * The returned List is a copy, so it should be freed when finished. */ -Exec_Nodes * +ExecNodes * GetRelationNodes(RelationLocInfo *rel_loc_info, long *partValue, int isRead) { ListCell *prefItem; ListCell *stepItem; - Exec_Nodes *exec_nodes; + ExecNodes *exec_nodes; if (rel_loc_info == NULL) return NULL; - exec_nodes = (Exec_Nodes *) palloc0(sizeof(Exec_Nodes)); + exec_nodes = makeNode(ExecNodes); exec_nodes->baselocatortype = rel_loc_info->locatorType; switch (rel_loc_info->locatorType) diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 7fedbfb..a7bc0ab 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -20,6 +20,7 @@ #include "catalog/pg_namespace.h" #include "catalog/pg_proc.h" #include "catalog/pg_type.h" +#include "executor/executor.h" #include "lib/stringinfo.h" #include "nodes/nodeFuncs.h" #include "nodes/nodes.h" @@ -120,7 +121,7 @@ typedef struct XCWalkerContext { Query *query; bool isRead; - Exec_Nodes *exec_nodes; /* resulting execution nodes */ + ExecNodes *exec_nodes; /* resulting execution nodes */ Special_Conditions *conditions; bool multilevel_join; List *rtables; /* a pointer to a list of rtables */ @@ -139,7 +140,7 @@ bool StrictStatementChecking = true; bool StrictSelectChecking = false; -static Exec_Nodes *get_plan_nodes(Query *query, bool isRead); +static ExecNodes *get_plan_nodes(Query *query, bool isRead); static bool get_plan_nodes_walker(Node *query_node, XCWalkerContext *context); static bool examine_conditions_walker(Node *expr_node, XCWalkerContext *context); static int handle_limit_offset(RemoteQuery *query_step, Query *query, PlannedStmt *plan_stmt); @@ -402,13 +403,13 @@ get_base_var(Var *var, XCWalkerContext *context) /* * get_plan_nodes_insert - determine nodes on which to execute insert. 
*/ -static Exec_Nodes * +static ExecNodes * get_plan_nodes_insert(Query *query) { RangeTblEntry *rte; RelationLocInfo *rel_loc_info; Const *constant; - Exec_Nodes *exec_nodes; + ExecNodes *exec_nodes; ListCell *lc; long part_value; long *part_value_ptr = NULL; @@ -786,7 +787,7 @@ examine_conditions_walker(Node *expr_node, XCWalkerContext *context) bool is_multilevel; int save_parent_child_count = 0; SubLink *sublink = (SubLink *) expr_node; - Exec_Nodes *save_exec_nodes = context->exec_nodes; /* Save old exec_nodes */ + ExecNodes *save_exec_nodes = context->exec_nodes; /* Save old exec_nodes */ /* save parent-child count */ if (context->exec_nodes) @@ -940,9 +941,9 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) ListCell *lc, *item; RelationLocInfo *rel_loc_info; - Exec_Nodes *test_exec_nodes = NULL; - Exec_Nodes *current_nodes = NULL; - Exec_Nodes *from_query_nodes = NULL; + ExecNodes *test_exec_nodes = NULL; + ExecNodes *current_nodes = NULL; + ExecNodes *from_query_nodes = NULL; TableUsageType table_usage_type = TABLE_USAGE_TYPE_NO_TABLE; TableUsageType current_usage_type = TABLE_USAGE_TYPE_NO_TABLE; int from_subquery_count = 0; @@ -972,7 +973,7 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) if (contains_only_pg_catalog (query->rtable)) { /* just pg_catalog tables */ - context->exec_nodes = (Exec_Nodes *) palloc0(sizeof(Exec_Nodes)); + context->exec_nodes = makeNode(ExecNodes); context->exec_nodes->tableusagetype = TABLE_USAGE_TYPE_PGCATALOG; context->exec_on_coord = true; return false; @@ -991,7 +992,7 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) if (rte->rtekind == RTE_SUBQUERY) { - Exec_Nodes *save_exec_nodes = context->exec_nodes; + ExecNodes *save_exec_nodes = context->exec_nodes; Special_Conditions *save_conditions = context->conditions; /* Save old conditions */ List *current_rtable = rte->subquery->rtable; @@ -1089,7 +1090,7 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext 
*context) /* If we are just dealing with pg_catalog, just return */ if (table_usage_type == TABLE_USAGE_TYPE_PGCATALOG) { - context->exec_nodes = (Exec_Nodes *) palloc0(sizeof(Exec_Nodes)); + context->exec_nodes = makeNode(ExecNodes); context->exec_nodes->tableusagetype = TABLE_USAGE_TYPE_PGCATALOG; context->exec_on_coord = true; return false; @@ -1255,10 +1256,10 @@ get_plan_nodes_walker(Node *query_node, XCWalkerContext *context) * Top level entry point before walking query to determine plan nodes * */ -static Exec_Nodes * +static ExecNodes * get_plan_nodes(Query *query, bool isRead) { - Exec_Nodes *result_nodes = NULL; + ExecNodes *result_nodes = NULL; XCWalkerContext context; @@ -1293,10 +1294,10 @@ get_plan_nodes(Query *query, bool isRead) * * return NULL if it is not safe to be done in a single step. */ -static Exec_Nodes * +static ExecNodes * get_plan_nodes_command(Query *query) { - Exec_Nodes *exec_nodes = NULL; + ExecNodes *exec_nodes = NULL; switch (query->commandType) { @@ -1384,7 +1385,7 @@ get_simple_aggregates(Query * query) *finalfnexpr; Datum textInitVal; - simple_agg = (SimpleAgg *) palloc0(sizeof(SimpleAgg)); + simple_agg = makeNode(SimpleAgg); simple_agg->column_pos = column_pos; initStringInfo(&simple_agg->valuebuf); simple_agg->aggref = aggref; @@ -1759,7 +1760,7 @@ make_simple_sort_from_sortclauses(Query *query, RemoteQuery *step) nullsFirst = (bool *) palloc(numsortkeys * sizeof(bool)); numsortkeys = 0; - sort = (SimpleSort *) palloc(sizeof(SimpleSort)); + sort = makeNode(SimpleSort); if (sortcls) { @@ -1908,7 +1909,7 @@ make_simple_sort_from_sortclauses(Query *query, RemoteQuery *step) * extra_distincts list */ - distinct = (SimpleDistinct *) palloc(sizeof(SimpleDistinct)); + distinct = makeNode(SimpleDistinct); /* * We will need at most list_length(distinctcls) sort columns @@ -2093,12 +2094,50 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) query_step = makeNode(RemoteQuery); query_step->is_single_step = false; 
- query_step->sql_statement = pstrdup(query->sql_statement); + /* + * Declare Cursor case: + * We should leave as a step query only SELECT statement + * Further if we need refer source statement for planning we should take + * the truncated string + */ + if (query->utilityStmt && + IsA(query->utilityStmt, DeclareCursorStmt)) + { + + char *src = query->sql_statement; + char str[strlen(src) + 1]; /* mutable copy */ + char *dst = str; + + cursorOptions |= ((DeclareCursorStmt *) query->utilityStmt)->options; + + /* + * Initialize mutable copy, converting letters to uppercase and + * various witespace characters to spaces + */ + while (*src) + { + if (isspace(*src)) + { + src++; + *dst++ = ' '; + } + else + *dst++ = toupper(*src++); + } + *dst = '\0'; + /* search for SELECT keyword in the normalized string */ + dst = strstr(str, " SELECT "); + /* Take substring of the original string using found offset */ + query_step->sql_statement = pstrdup(query->sql_statement + (dst - str + 1)); + } + else + query_step->sql_statement = pstrdup(query->sql_statement); + query_step->exec_nodes = NULL; query_step->combine_type = COMBINE_TYPE_NONE; query_step->simple_aggregates = NULL; /* Optimize multi-node handling */ - query_step->read_only = query->nodeTag == T_SelectStmt; + query_step->read_only = query->commandType == CMD_SELECT; query_step->force_autocommit = false; result->planTree = (Plan *) query_step; @@ -2108,20 +2147,20 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) * level, Data Nodes, or both. By default we choose both. We should be * able to quickly expand this for more commands. 
*/ - switch (query->nodeTag) + switch (query->commandType) { - case T_SelectStmt: + case CMD_SELECT: /* Perform some checks to make sure we can support the statement */ if (query->intoClause) ereport(ERROR, (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), (errmsg("INTO clause not yet supported")))); /* fallthru */ - case T_InsertStmt: - case T_UpdateStmt: - case T_DeleteStmt: + case CMD_INSERT: + case CMD_UPDATE: + case CMD_DELETE: /* Set result relations */ - if (query->nodeTag != T_SelectStmt) + if (query->commandType != CMD_SELECT) result->resultRelations = list_make1_int(query->resultRelation); query_step->exec_nodes = get_plan_nodes_command(query); @@ -2129,7 +2168,7 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) if (query_step->exec_nodes == NULL) { /* Do not yet allow multi-node correlated UPDATE or DELETE */ - if ((query->nodeTag == T_UpdateStmt || query->nodeTag == T_DeleteStmt)) + if (query->commandType == CMD_UPDATE || query->commandType == CMD_DELETE) { ereport(ERROR, (errcode(ERRCODE_STATEMENT_TOO_COMPLEX), @@ -2144,15 +2183,16 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) return result; } - if ((query->nodeTag == T_UpdateStmt || query->nodeTag == T_DeleteStmt) + /* Do not yet allow multi-node correlated UPDATE or DELETE */ + if ((query->commandType == CMD_UPDATE || query->commandType == CMD_DELETE) && !query_step->exec_nodes && list_length(query->rtable) > 1) { - result = standard_planner(query, cursorOptions, boundParams); - return result; + result = standard_planner(query, cursorOptions, boundParams); + return result; } - /* + /* * Use standard plan if we have more than one data node with either * group by, hasWindowFuncs, or hasRecursive */ @@ -2161,13 +2201,13 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) * group by expression is the partitioning column, in which * case it is ok to treat as a single step. 
*/ - if (query->nodeTag == T_SelectStmt + if (query->commandType == CMD_SELECT && query_step->exec_nodes && list_length(query_step->exec_nodes->nodelist) > 1 && (query->groupClause || query->hasWindowFuncs || query->hasRecursive)) { - result = standard_planner(query, cursorOptions, boundParams); - return result; + result = standard_planner(query, cursorOptions, boundParams); + return result; } query_step->is_single_step = true; @@ -2191,9 +2231,9 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) query, query_step->exec_nodes->baselocatortype); /* Set up simple aggregates */ - /* PGXCTODO - we should detect what types of aggregates are used. + /* PGXCTODO - we should detect what types of aggregates are used. * in some cases we can avoid the final step and merely proxy results - * (when there is only one data node involved) instead of using + * (when there is only one data node involved) instead of using * coordinator consolidation. At the moment this is needed for AVG() */ query_step->simple_aggregates = get_simple_aggregates(query); @@ -2224,6 +2264,16 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) result->planTree = standardPlan; } + /* + * If creating a plan for a scrollable cursor, make sure it can run + * backwards on demand. Add a Material node at the top at need. 
+ */ + if (cursorOptions & CURSOR_OPT_SCROLL) + { + if (!ExecSupportsBackwardScan(result->planTree)) + result->planTree = materialize_finished_plan(result->planTree); + } + return result; } diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index e7ac601..16d2f6b 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -1990,7 +1990,7 @@ DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot, bool is_ * Send a data row to the specified nodes */ int -DataNodeCopyIn(char *data_row, int len, Exec_Nodes *exec_nodes, DataNodeHandle** copy_connections) +DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, DataNodeHandle** copy_connections) { DataNodeHandle *primary_handle = NULL; ListCell *nodeitem; @@ -2143,7 +2143,7 @@ DataNodeCopyIn(char *data_row, int len, Exec_Nodes *exec_nodes, DataNodeHandle** } uint64 -DataNodeCopyOut(Exec_Nodes *exec_nodes, DataNodeHandle** copy_connections, FILE* copy_file) +DataNodeCopyOut(ExecNodes *exec_nodes, DataNodeHandle** copy_connections, FILE* copy_file) { RemoteQueryState *combiner; int conn_count = list_length(exec_nodes->nodelist) == 0 ? 
NumDataNodes : list_length(exec_nodes->nodelist); @@ -2436,7 +2436,7 @@ copy_slot(RemoteQueryState *node, TupleTableSlot *src, TupleTableSlot *dst) } static void -get_exec_connections(Exec_Nodes *exec_nodes, +get_exec_connections(ExecNodes *exec_nodes, int *regular_conn_count, int *total_conn_count, DataNodeHandle ***connections, diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 84c70c6..528e4e1 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -658,7 +658,6 @@ pg_analyze_and_rewrite(Node *parsetree, const char *query_string, { Query *query = (Query *) lfirst(lc); query->sql_statement = pstrdup(query_string); - query->nodeTag = nodeTag(parsetree); } } #endif @@ -1318,7 +1317,6 @@ exec_parse_message(const char *query_string, /* string to execute */ { Query *query = (Query *) lfirst(lc); query->sql_statement = pstrdup(query_string); - query->nodeTag = nodeTag(raw_parse_tree); } } #endif diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 91456c4..db11abe 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -62,7 +62,7 @@ #include "pgxc/pgxc.h" #include "pgxc/planner.h" -static void ExecUtilityStmtOnNodes(const char *queryString, Exec_Nodes *nodes, +static void ExecUtilityStmtOnNodes(const char *queryString, ExecNodes *nodes, bool force_autocommit); #endif @@ -1367,7 +1367,7 @@ ProcessUtility(Node *parsetree, #ifdef PGXC static void -ExecUtilityStmtOnNodes(const char *queryString, Exec_Nodes *nodes, +ExecUtilityStmtOnNodes(const char *queryString, ExecNodes *nodes, bool force_autocommit) { RemoteQuery *step = makeNode(RemoteQuery); diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index a466239..8bb49c6 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -73,6 +73,13 @@ typedef enum NodeTag T_SetOp, T_Limit, #ifdef PGXC + /* + * TAGS FOR PGXC NODES (planner.h, locator.h) + */ + T_ExecNodes, + T_SimpleAgg, + T_SimpleSort, + 
T_SimpleDistinct, T_RemoteQuery, #endif /* this one isn't a subclass of Plan: */ diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index a367a6b..5fb2a2b 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -149,7 +149,6 @@ typedef struct Query #ifdef PGXC /* need this info for PGXC Planner, may be temporary */ char *sql_statement; /* original query */ - NodeTag nodeTag; /* node tag of top node of parse tree */ #endif } Query; diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index e2aebef..5ba8fff 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -85,8 +85,8 @@ extern void DataNodeRollbackPrepared(char *gid); extern void DataNodeCommitPrepared(char *gid); extern DataNodeHandle** DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot, bool is_from); -extern int DataNodeCopyIn(char *data_row, int len, Exec_Nodes *exec_nodes, DataNodeHandle** copy_connections); -extern uint64 DataNodeCopyOut(Exec_Nodes *exec_nodes, DataNodeHandle** copy_connections, FILE* copy_file); +extern int DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, DataNodeHandle** copy_connections); +extern uint64 DataNodeCopyOut(ExecNodes *exec_nodes, DataNodeHandle** copy_connections, FILE* copy_file); extern void DataNodeCopyFinish(DataNodeHandle** copy_connections, int primary_data_node, CombineType combine_type); extern int ExecCountSlotsRemoteQuery(RemoteQuery *node); diff --git a/src/include/pgxc/locator.h b/src/include/pgxc/locator.h index 7ae0474..233bf26 100644 --- a/src/include/pgxc/locator.h +++ b/src/include/pgxc/locator.h @@ -61,11 +61,12 @@ typedef enum */ typedef struct { + NodeTag type; List *primarynodelist; List *nodelist; char baselocatortype; TableUsageType tableusagetype; /* track pg_catalog usage */ -} Exec_Nodes; +} ExecNodes; extern char *PreferredDataNodes; @@ -77,7 +78,7 @@ extern char ConvertToLocatorType(int disttype); extern char 
*GetRelationHashColumn(RelationLocInfo *rel_loc_info); extern RelationLocInfo *GetRelationLocInfo(Oid relid); extern RelationLocInfo *CopyRelationLocInfo(RelationLocInfo *src_info); -extern Exec_Nodes *GetRelationNodes(RelationLocInfo *rel_loc_info, long *partValue, +extern ExecNodes *GetRelationNodes(RelationLocInfo *rel_loc_info, long *partValue, int isRead); extern bool IsHashColumn(RelationLocInfo *rel_loc_info, char *part_col_name); extern bool IsHashColumnForRelId(Oid relid, char *part_col_name); diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index 346dd65..548e4cd 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -38,6 +38,7 @@ typedef enum */ typedef struct { + NodeTag type; int numCols; /* number of sort-key columns */ AttrNumber *sortColIdx; /* their indexes in the target list */ Oid *sortOperators; /* OIDs of operators to sort them by */ @@ -47,6 +48,7 @@ typedef struct /* For returning distinct results from the RemoteQuery*/ typedef struct { + NodeTag type; int numCols; /* number of sort-key columns */ AttrNumber *uniqColIdx; /* their indexes in the target list */ Oid *eqOperators; /* OIDs of operators to equate them by */ @@ -61,7 +63,7 @@ typedef struct Scan scan; bool is_single_step; /* special case, skip extra work */ char *sql_statement; - Exec_Nodes *exec_nodes; + ExecNodes *exec_nodes; CombineType combine_type; List *simple_aggregates; /* simple aggregate to combine on this step */ SimpleSort *sort; @@ -87,6 +89,7 @@ typedef enum /* For handling simple aggregates */ typedef struct { + NodeTag type; int column_pos; /* Only use 1 for now */ Aggref *aggref; Oid transfn_oid; ----------------------------------------------------------------------- Summary of changes: src/backend/access/common/heaptuple.c | 9 ++ src/backend/commands/copy.c | 4 +- src/backend/nodes/copyfuncs.c | 143 ++++++++++++++++++++++++++++++++- src/backend/pgxc/locator/locator.c | 6 +- src/backend/pgxc/plan/planner.c | 122 
++++++++++++++++++++-------- src/backend/pgxc/pool/execRemote.c | 6 +- src/backend/tcop/postgres.c | 2 - src/backend/tcop/utility.c | 4 +- src/include/nodes/nodes.h | 7 ++ src/include/nodes/parsenodes.h | 1 - src/include/pgxc/execRemote.h | 4 +- src/include/pgxc/locator.h | 5 +- src/include/pgxc/planner.h | 5 +- 13 files changed, 263 insertions(+), 55 deletions(-) hooks/post-receive -- Postgres-XC |
From: mason_s <ma...@us...> - 2010-09-22 00:45:00
|
Project "Postgres-XC". The branch, master has been updated via ac66a8c598dfc601e64df04dba73dc6d99f78272 (commit) from ba79eded1dfbfabc51b3de4931b638853d13a30d (commit) - Log ----------------------------------------------------------------- commit ac66a8c598dfc601e64df04dba73dc6d99f78272 Author: Mason Sharp <ma...@us...> Date: Tue Sep 21 20:41:41 2010 -0400 Address performance issues that were introduced in the last couple of months. We avoid sending down BEGIN to the data nodes for SELECT if it is not needed. We avoid going through the standard PostgreSQL planner on the coordinator if unnecessary, for simple single-step statements. Remove extra limit node that appeared in the plan, though it did no limiting. diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 847b556..519ea4f 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -1009,6 +1009,9 @@ InitPlan(QueryDesc *queryDesc, int eflags) } else { +#ifdef PGXC + if (!IS_PGXC_COORDINATOR) +#endif if (operation == CMD_INSERT) ExecCheckPlanOutput(estate->es_result_relation_info->ri_RelationDesc, planstate->plan->targetlist); diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index e18e813..7fedbfb 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -1979,6 +1979,9 @@ handle_limit_offset(RemoteQuery *query_step, Query *query, PlannedStmt *plan_stm { /* check if no special handling needed */ + if (!query->limitCount && !query->limitOffset) + return 0; + if (query_step && query_step->exec_nodes && list_length(query_step->exec_nodes->nodelist) <= 1) return 0; @@ -2071,16 +2074,23 @@ handle_limit_offset(RemoteQuery *query_step, Query *query, PlannedStmt *plan_stm PlannedStmt * pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) { - /* - * We waste some time invoking standard planner, but getting good enough - * PlannedStmt, we just need to replace standard plan. 
- * In future we may want to skip the standard_planner invocation and - * initialize the PlannedStmt here. At the moment not all queries works: - * ex. there was a problem with INSERT into a subset of table columns - */ - PlannedStmt *result = standard_planner(query, cursorOptions, boundParams); - Plan *standardPlan = result->planTree; - RemoteQuery *query_step = makeNode(RemoteQuery); + PlannedStmt *result; + Plan *standardPlan; + RemoteQuery *query_step; + + + /* build the PlannedStmt result */ + result = makeNode(PlannedStmt); + + /* Try and set what we can */ + result->commandType = query->commandType; + result->canSetTag = query->canSetTag; + result->utilityStmt = query->utilityStmt; + result->intoClause = query->intoClause; + + result->rtable = query->rtable; + + query_step = makeNode(RemoteQuery); query_step->is_single_step = false; query_step->sql_statement = pstrdup(query->sql_statement); @@ -2110,6 +2120,10 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) case T_InsertStmt: case T_UpdateStmt: case T_DeleteStmt: + /* Set result relations */ + if (query->nodeTag != T_SelectStmt) + result->resultRelations = list_make1_int(query->resultRelation); + query_step->exec_nodes = get_plan_nodes_command(query); if (query_step->exec_nodes == NULL) @@ -2124,18 +2138,35 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) /* * Processing guery against catalog tables, or multi-step command. 
- * Restore standard plan + * Run through standard planner */ - result->planTree = standardPlan; + result = standard_planner(query, cursorOptions, boundParams); return result; } - /* Do not yet allow multi-node correlated UPDATE or DELETE */ if ((query->nodeTag == T_UpdateStmt || query->nodeTag == T_DeleteStmt) && !query_step->exec_nodes && list_length(query->rtable) > 1) { - result->planTree = standardPlan; + result = standard_planner(query, cursorOptions, boundParams); + return result; + } + + /* + * Use standard plan if we have more than one data node with either + * group by, hasWindowFuncs, or hasRecursive + */ + /* + * PGXCTODO - this could be improved to check if the first + * group by expression is the partitioning column, in which + * case it is ok to treat as a single step. + */ + if (query->nodeTag == T_SelectStmt + && query_step->exec_nodes + && list_length(query_step->exec_nodes->nodelist) > 1 + && (query->groupClause || query->hasWindowFuncs || query->hasRecursive)) + { + result = standard_planner(query, cursorOptions, boundParams); return result; } @@ -2153,7 +2184,7 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) * then call standard planner and take targetList from the plan * generated by Postgres. 
*/ - query_step->scan.plan.targetlist = standardPlan->targetlist; + query_step->scan.plan.targetlist = query->targetList; if (query_step->exec_nodes) query_step->combine_type = get_plan_combine_type( @@ -2174,32 +2205,15 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) (query->sortClause || query->distinctClause)) make_simple_sort_from_sortclauses(query, query_step); - /* Handle LIMIT and OFFSET for single-step queries on multiple nodes*/ + /* Handle LIMIT and OFFSET for single-step queries on multiple nodes */ if (handle_limit_offset(query_step, query, result)) { /* complicated expressions, just fallback to standard plan */ - result->planTree = standardPlan; + result = standard_planner(query, cursorOptions, boundParams); return result; } - - /* - * Use standard plan if we have more than one data node with either - * group by, hasWindowFuncs, or hasRecursive - */ - /* - * PGXCTODO - this could be improved to check if the first - * group by expression is the partitioning column, in which - * case it is ok to treat as a single step. - */ - if (query->nodeTag == T_SelectStmt - && query_step->exec_nodes - && list_length(query_step->exec_nodes->nodelist) > 1 - && (query->groupClause || query->hasWindowFuncs || query->hasRecursive)) - { - result->planTree = standardPlan; - return result; - } break; + default: /* Allow for override */ if (StrictStatementChecking) diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index e7ef66e..e7ac601 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -998,8 +998,8 @@ handle_response(DataNodeHandle * conn, RemoteQueryState *combiner) if (conn->state == DN_CONNECTION_STATE_QUERY) return RESPONSE_EOF; - /* - * If we are in the process of shutting down, we + /* + * If we are in the process of shutting down, we * may be rolling back, and the buffer may contain other messages. 
* We want to avoid a procarray exception * as well as an error stack overflow. @@ -1745,7 +1745,7 @@ finish: /* * Rollback current transaction - * This will happen + * This will happen */ int DataNodeRollback(void) @@ -2577,7 +2577,7 @@ ExecRemoteQuery(RemoteQueryState *node) if (force_autocommit) need_tran = false; else - need_tran = !autocommit || total_conn_count > 1; + need_tran = !autocommit || !is_read_only && total_conn_count > 1; elog(DEBUG1, "autocommit = %s, has primary = %s, regular_conn_count = %d, need_tran = %s", autocommit ? "true" : "false", primaryconnection ? "true" : "false", regular_conn_count, need_tran ? "true" : "false"); @@ -3143,7 +3143,7 @@ DataNodeConsumeMessages(void) pfree(connections); } - + /* ---------------------------------------------------------------- * ExecRemoteQueryReScan * ----------------------------------------------------------------------- Summary of changes: src/backend/executor/execMain.c | 3 + src/backend/pgxc/plan/planner.c | 84 +++++++++++++++++++++--------------- src/backend/pgxc/pool/execRemote.c | 10 ++-- 3 files changed, 57 insertions(+), 40 deletions(-) hooks/post-receive -- Postgres-XC |
From: Michael P. <mic...@us...> - 2010-09-14 23:36:21
|
Project "Postgres-XC". The branch, master has been updated via ba79eded1dfbfabc51b3de4931b638853d13a30d (commit) from 19a8fa536779653524a1feb862c18277efa317f4 (commit) - Log ----------------------------------------------------------------- commit ba79eded1dfbfabc51b3de4931b638853d13a30d Author: Michael P <mic...@us...> Date: Wed Sep 15 08:26:30 2010 +0900 Implementation of 2PC from applications Support for PREPARE TRANSACTION 'tid', ROLLBACK PREPARED 'tid' and COMMIT PREPARED 'tid'. When a Transaction is prepared on a Coordinator, the list of involved Datanodes is saved in GTM and transaction is put in PREPARE state. The transaction ID 'tid' is also saved on GTM. COMMIT PREPARED or ROLLBACK PREPARED can be issued from a different Coordinator by using the same tid. The Coordinator receiving the Commit SQL gets a list of Datanodes from GTM, and commits the transaction on the right nodes. This patch adds a new interface on GTM to save also the list of Coordinators involved in a PREPARE transaction. Coordinator<->Coordinator connection protocol is not implemented yet, so for the moment Coordinator do not create a 2PC file at PREPARE. This feature will be added with the implementation of DDL synchronization among Coordinators. diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index c7f3547..08ed2c9 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -122,7 +122,8 @@ CommitTranGTM(GlobalTransactionId gxid) CheckConnection(); ret = commit_transaction(conn, gxid); - /* If something went wrong (timeout), try and reset GTM connection. + /* + * If something went wrong (timeout), try and reset GTM connection. * We will close the transaction locally anyway, and closing GTM will force * it to be closed on GTM. */ @@ -134,6 +135,34 @@ CommitTranGTM(GlobalTransactionId gxid) return ret; } +/* + * For a prepared transaction, commit the gxid used for PREPARE TRANSACTION + * and for COMMIT PREPARED. 
+ */ +int +CommitPreparedTranGTM(GlobalTransactionId gxid, GlobalTransactionId prepared_gxid) +{ + int ret = 0; + + if (!GlobalTransactionIdIsValid(gxid) || !GlobalTransactionIdIsValid(prepared_gxid)) + return ret; + CheckConnection(); + ret = commit_prepared_transaction(conn, gxid, prepared_gxid); + + /* + * If something went wrong (timeout), try and reset GTM connection. + * We will close the transaction locally anyway, and closing GTM will force + * it to be closed on GTM. + */ + + if (ret < 0) + { + CloseGTM(); + InitGTM(); + } + return ret; +} + int RollbackTranGTM(GlobalTransactionId gxid) { @@ -144,7 +173,37 @@ RollbackTranGTM(GlobalTransactionId gxid) CheckConnection(); ret = abort_transaction(conn, gxid); - /* If something went wrong (timeout), try and reset GTM connection. + /* + * If something went wrong (timeout), try and reset GTM connection. + * We will abort the transaction locally anyway, and closing GTM will force + * it to end on GTM. + */ + if (ret < 0) + { + CloseGTM(); + InitGTM(); + } + return ret; +} + +int +BeingPreparedTranGTM(GlobalTransactionId gxid, + char *gid, + int datanodecnt, + PGXC_NodeId datanodes[], + int coordcnt, + PGXC_NodeId coordinators[]) +{ + int ret = 0; + + if (!GlobalTransactionIdIsValid(gxid)) + return 0; + CheckConnection(); + + ret = being_prepared_transaction(conn, gxid, gid, datanodecnt, datanodes, coordcnt, coordinators); + + /* + * If something went wrong (timeout), try and reset GTM connection. * We will abort the transaction locally anyway, and closing GTM will force * it to end on GTM. */ @@ -153,6 +212,61 @@ RollbackTranGTM(GlobalTransactionId gxid) CloseGTM(); InitGTM(); } + + return ret; +} + +int +PrepareTranGTM(GlobalTransactionId gxid) +{ + int ret; + + if (!GlobalTransactionIdIsValid(gxid)) + return 0; + CheckConnection(); + ret = prepare_transaction(conn, gxid); + + /* + * If something went wrong (timeout), try and reset GTM connection. 
+ * We will close the transaction locally anyway, and closing GTM will force + * it to be closed on GTM. + */ + if (ret < 0) + { + CloseGTM(); + InitGTM(); + } + return ret; +} + + +int +GetGIDDataGTM(char *gid, + GlobalTransactionId *gxid, + GlobalTransactionId *prepared_gxid, + int *datanodecnt, + PGXC_NodeId **datanodes, + int *coordcnt, + PGXC_NodeId **coordinators) +{ + int ret = 0; + + CheckConnection(); + ret = get_gid_data(conn, GTM_ISOLATION_RC, gid, gxid, + prepared_gxid, datanodecnt, datanodes, + coordcnt, coordinators); + + /* + * If something went wrong (timeout), try and reset GTM connection. + * We will abort the transaction locally anyway, and closing GTM will force + * it to end on GTM. + */ + if (ret < 0) + { + CloseGTM(); + InitGTM(); + } + return ret; } diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index e8cf1bf..d881078 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -929,6 +929,11 @@ EndPrepare(GlobalTransaction gxact) * critical section, though, it doesn't matter since any failure causes * PANIC anyway. */ +#ifdef PGXC + /* Do not write 2PC state file on Coordinator side */ + if (IS_PGXC_DATANODE) + { +#endif TwoPhaseFilePath(path, xid); fd = BasicOpenFile(path, @@ -1001,6 +1006,9 @@ EndPrepare(GlobalTransaction gxact) * We save the PREPARE record's location in the gxact for later use by * CheckPointTwoPhase. 
*/ +#ifdef PGXC + } +#endif START_CRIT_SECTION(); MyProc->inCommit = true; @@ -1011,6 +1019,12 @@ EndPrepare(GlobalTransaction gxact) /* If we crash now, we have prepared: WAL replay will fix things */ +#ifdef PGXC + /* Just write 2PC state file on Datanodes */ + if (IS_PGXC_DATANODE) + { +#endif + /* write correct CRC and close file */ if ((write(fd, &statefile_crc, sizeof(pg_crc32))) != sizeof(pg_crc32)) { @@ -1024,6 +1038,9 @@ EndPrepare(GlobalTransaction gxact) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close two-phase state file: %m"))); +#ifdef PGXC + } +#endif /* * Mark the prepared transaction as valid. As soon as xact.c marks MyProc @@ -1875,3 +1892,16 @@ RecordTransactionAbortPrepared(TransactionId xid, END_CRIT_SECTION(); } + +#ifdef PGXC +/* + * Remove a gxact on a Coordinator, + * this is used to be able to prepare a commit transaction on another coordinator than the one + * who prepared the transaction + */ +void +RemoveGXactCoord(GlobalTransaction gxact) +{ + RemoveGXact(gxact); +} +#endif diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 8a946cc..458068c 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2133,6 +2133,17 @@ PrepareTransaction(void) PostPrepare_Locks(xid); +#ifdef PGXC + /* + * We want to be able to commit a prepared transaction from another coordinator, + * so clean up the gxact in shared memory also. 
+ */ + if (IS_PGXC_COORDINATOR) + { + RemoveGXactCoord(gxact); + } +#endif + ResourceOwnerRelease(TopTransactionResourceOwner, RESOURCE_RELEASE_LOCKS, true, true); diff --git a/src/backend/pgxc/pool/datanode.c b/src/backend/pgxc/pool/datanode.c index 31b5bc0..2e8ec40 100644 --- a/src/backend/pgxc/pool/datanode.c +++ b/src/backend/pgxc/pool/datanode.c @@ -1105,6 +1105,25 @@ get_transaction_nodes(DataNodeHandle **connections) } /* + * Collect node numbers for the given Datanode connections + * and return it for prepared transactions + */ +PGXC_NodeId* +collect_datanode_numbers(int conn_count, DataNodeHandle **connections) +{ + PGXC_NodeId *datanodes = NULL; + int i; + datanodes = (PGXC_NodeId *) palloc(conn_count * sizeof(PGXC_NodeId)); + + for (i = 0; i < conn_count; i++) + { + datanodes[i] = connections[i]->nodenum; + } + + return datanodes; +} + +/* * Return those node connections that appear to be active and * have data to consume on them. */ diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 05dbe2e..e7ef66e 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -52,6 +52,14 @@ static int data_node_begin(int conn_count, DataNodeHandle ** connections, GlobalTransactionId gxid); static int data_node_commit(int conn_count, DataNodeHandle ** connections); static int data_node_rollback(int conn_count, DataNodeHandle ** connections); +static int data_node_prepare(int conn_count, DataNodeHandle ** connections, + char *gid); +static int data_node_rollback_prepared(GlobalTransactionId gxid, GlobalTransactionId prepared_gxid, + int conn_count, DataNodeHandle ** connections, + char *gid); +static int data_node_commit_prepared(GlobalTransactionId gxid, GlobalTransactionId prepared_gxid, + int conn_count, DataNodeHandle ** connections, + char *gid); static void clear_write_node_list(); @@ -531,6 +539,7 @@ HandleCommandComplete(RemoteQueryState *combiner, char *msg_body, size_t len) else 
combiner->combine_type = COMBINE_TYPE_NONE; } + combiner->command_complete_count++; } @@ -793,6 +802,7 @@ validate_combiner(RemoteQueryState *combiner) /* Check if state is defined */ if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED) return false; + /* Check all nodes completed */ if ((combiner->request_type == REQUEST_TYPE_COMMAND || combiner->request_type == REQUEST_TYPE_QUERY) @@ -1205,6 +1215,389 @@ DataNodeBegin(void) /* + * Prepare transaction on Datanodes involved in current transaction. + * GXID associated to current transaction has to be committed on GTM. + */ +int +DataNodePrepare(char *gid) +{ + int res = 0; + int tran_count; + DataNodeHandle *connections[NumDataNodes]; + + /* gather connections to prepare */ + tran_count = get_transaction_nodes(connections); + + /* + * If we do not have open transactions we have nothing to prepare just + * report success + */ + if (tran_count == 0) + { + elog(WARNING, "Nothing to PREPARE on Datanodes, gid is not used"); + goto finish; + } + + /* TODO: data_node_prepare */ + res = data_node_prepare(tran_count, connections, gid); + +finish: + /* + * The transaction is just prepared, but Datanodes have reset, + * so we'll need a new gxid for commit prepared or rollback prepared + * Application is responsible for delivering the correct gid. + * Release the connections for the moment. 
+ */ + if (!autocommit) + stat_transaction(tran_count); + if (!PersistentConnections) + release_handles(false); + autocommit = true; + clear_write_node_list(); + return res; +} + + +/* + * Prepare transaction on dedicated nodes with gid received from application + */ +static int +data_node_prepare(int conn_count, DataNodeHandle ** connections, char *gid) +{ + int i; + int result = 0; + struct timeval *timeout = NULL; + char *buffer = (char *) palloc0(22 + strlen(gid) + 1); + RemoteQueryState *combiner = NULL; + GlobalTransactionId gxid = InvalidGlobalTransactionId; + PGXC_NodeId *datanodes = NULL; + + gxid = GetCurrentGlobalTransactionId(); + + /* + * Now that the transaction has been prepared on the nodes, + * Initialize to make the business on GTM + */ + datanodes = collect_datanode_numbers(conn_count, connections); + + /* + * Send a Prepare in Progress message to GTM. + * At the same time node list is saved on GTM. + */ + result = BeingPreparedTranGTM(gxid, gid, conn_count, datanodes, 0, NULL); + + if (result < 0) + return EOF; + + sprintf(buffer, "PREPARE TRANSACTION '%s'", gid); + + /* Send PREPARE */ + for (i = 0; i < conn_count; i++) + if (data_node_send_query(connections[i], buffer)) + return EOF; + + combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE); + + /* Receive responses */ + if (data_node_receive_responses(conn_count, connections, timeout, combiner)) + return EOF; + + result = ValidateAndCloseCombiner(combiner) ? result : EOF; + if (result) + goto finish; + + /* + * Prepare the transaction on GTM after everything is done. + * GXID associated with PREPARE state is considered as used on Nodes, + * but is still present in Snapshot. + * This GXID will be discarded from Snapshot when commit prepared is + * issued from another node. + */ + result = PrepareTranGTM(gxid); + +finish: + /* + * An error has happened on a Datanode or GTM, + * It is necessary to rollback the transaction on already prepared nodes. 
+ * But not on nodes where the error occurred. + */ + if (result) + { + GlobalTransactionId rollback_xid = InvalidGlobalTransactionId; + buffer = (char *) repalloc(buffer, 20 + strlen(gid) + 1); + + sprintf(buffer, "ROLLBACK PREPARED '%s'", gid); + + rollback_xid = BeginTranGTM(NULL); + for (i = 0; i < conn_count; i++) + { + if (data_node_send_gxid(connections[i], rollback_xid)) + { + add_error_message(connections[i], "Can not send request"); + return EOF; + } + if (data_node_send_query(connections[i], buffer)) + { + add_error_message(connections[i], "Can not send request"); + return EOF; + } + } + + if (!combiner) + combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE); + + if (data_node_receive_responses(conn_count, connections, timeout, combiner)) + result = EOF; + result = ValidateAndCloseCombiner(combiner) ? result : EOF; + + /* + * Don't forget to rollback also on GTM + * Both GXIDs used for PREPARE and COMMIT PREPARED are discarded from GTM snapshot here. + */ + CommitPreparedTranGTM(gxid, rollback_xid); + + return EOF; + } + + return result; +} + + +/* + * Commit prepared transaction on Datanodes where it has been prepared. + * Connection to backends has been cut when transaction has been prepared, + * So it is necessary to send the COMMIT PREPARE message to all the nodes. + * We are not sure if the transaction prepared has involved all the datanodes + * or not but send the message to all of them. + * This avoid to have any additional interaction with GTM when making a 2PC transaction. 
+ */ +void +DataNodeCommitPrepared(char *gid) +{ + int res = 0; + int res_gtm = 0; + DataNodeHandle **connections; + List *nodelist = NIL; + int i, tran_count; + PGXC_NodeId *datanodes = NULL; + PGXC_NodeId *coordinators = NULL; + int coordcnt = 0; + int datanodecnt = 0; + GlobalTransactionId gxid, prepared_gxid; + + res_gtm = GetGIDDataGTM(gid, &gxid, &prepared_gxid, + &datanodecnt, &datanodes, &coordcnt, &coordinators); + + tran_count = datanodecnt + coordcnt; + if (tran_count == 0 || res_gtm < 0) + goto finish; + + autocommit = false; + + /* Build the list of nodes based on data received from GTM */ + for (i = 0; i < datanodecnt; i++) + { + nodelist = lappend_int(nodelist,datanodes[i]); + } + + /* Get connections */ + connections = get_handles(nodelist); + + /* Commit here the prepared transaction to all Datanodes */ + res = data_node_commit_prepared(gxid, prepared_gxid, datanodecnt, connections, gid); + +finish: + /* In autocommit mode statistics is collected in DataNodeExec */ + if (!autocommit) + stat_transaction(tran_count); + if (!PersistentConnections) + release_handles(false); + autocommit = true; + clear_write_node_list(); + + /* Free node list taken from GTM */ + if (datanodes) + free(datanodes); + if (coordinators) + free(coordinators); + + if (res_gtm < 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not get GID data from GTM"))); + if (res != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not commit prepared transaction on data nodes"))); +} + +/* + * Commit a prepared transaction on all nodes + * Prepared transaction with this gid has reset the datanodes, + * so we need a new gxid. + * An error is returned to the application only if all the Datanodes + * and Coordinator do not know about the gxid proposed. + * This permits to avoid interactions with GTM. 
+ */ +static int +data_node_commit_prepared(GlobalTransactionId gxid, GlobalTransactionId prepared_gxid, int conn_count, DataNodeHandle ** connections, char *gid) +{ + int result = 0; + int i; + RemoteQueryState *combiner = NULL; + struct timeval *timeout = NULL; + char *buffer = (char *) palloc0(18 + strlen(gid) + 1); + + /* GXID has been piggybacked when gid data has been received from GTM */ + sprintf(buffer, "COMMIT PREPARED '%s'", gid); + + /* Send gxid and COMMIT PREPARED message to all the Datanodes */ + for (i = 0; i < conn_count; i++) + { + if (data_node_send_gxid(connections[i], gxid)) + { + add_error_message(connections[i], "Can not send request"); + result = EOF; + goto finish; + } + if (data_node_send_query(connections[i], buffer)) + { + add_error_message(connections[i], "Can not send request"); + result = EOF; + goto finish; + } + } + + combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE); + + /* Receive responses */ + if (data_node_receive_responses(conn_count, connections, timeout, combiner)) + result = EOF; + + /* Validate and close combiner */ + result = ValidateAndCloseCombiner(combiner) ? 
result : EOF; + +finish: + /* Both GXIDs used for PREPARE and COMMIT PREPARED are discarded from GTM snapshot here */ + CommitPreparedTranGTM(gxid, prepared_gxid); + + return result; +} + +/* + * Rollback prepared transaction on Datanodes involved in the current transaction + */ +void +DataNodeRollbackPrepared(char *gid) +{ + int res = 0; + int res_gtm = 0; + DataNodeHandle **connections; + List *nodelist = NIL; + int i, tran_count; + + PGXC_NodeId *datanodes = NULL; + PGXC_NodeId *coordinators = NULL; + int coordcnt = 0; + int datanodecnt = 0; + GlobalTransactionId gxid, prepared_gxid; + + res_gtm = GetGIDDataGTM(gid, &gxid, &prepared_gxid, + &datanodecnt, &datanodes, &coordcnt, &coordinators); + + tran_count = datanodecnt + coordcnt; + if (tran_count == 0 || res_gtm < 0 ) + goto finish; + + autocommit = false; + + /* Build the node list based on the result got from GTM */ + for (i = 0; i < datanodecnt; i++) + { + nodelist = lappend_int(nodelist,datanodes[i]); + } + + /* Get connections */ + connections = get_handles(nodelist); + + /* Here do the real rollback to Datanodes */ + res = data_node_rollback_prepared(gxid, prepared_gxid, datanodecnt, connections, gid); + +finish: + /* In autocommit mode statistics is collected in DataNodeExec */ + if (!autocommit) + stat_transaction(tran_count); + if (!PersistentConnections) + release_handles(true); + autocommit = true; + clear_write_node_list(true); + if (res_gtm < 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not get GID data from GTM"))); + if (res != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not rollback prepared transaction on Datanodes"))); +} + + +/* + * Rollback prepared transaction + * We first get the prepared informations from GTM and then do the treatment + * At the end both prepared GXID and GXID are committed. 
+ */ +static int +data_node_rollback_prepared(GlobalTransactionId gxid, GlobalTransactionId prepared_gxid, + int conn_count, DataNodeHandle ** connections, char *gid) +{ + int result = 0; + int i; + RemoteQueryState *combiner = NULL; + struct timeval *timeout = NULL; + char *buffer = (char *) palloc0(20 + strlen(gid) + 1); + + /* Datanodes have reset after prepared state, so get a new gxid */ + gxid = BeginTranGTM(NULL); + + sprintf(buffer, "ROLLBACK PREPARED '%s'", gid); + + /* Send gxid and COMMIT PREPARED message to all the Datanodes */ + for (i = 0; i < conn_count; i++) + { + if (data_node_send_gxid(connections[i], gxid)) + { + add_error_message(connections[i], "Can not send request"); + result = EOF; + goto finish; + } + + if (data_node_send_query(connections[i], buffer)) + { + add_error_message(connections[i], "Can not send request"); + result = EOF; + goto finish; + } + } + + combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE); + + /* Receive responses */ + if (data_node_receive_responses(conn_count, connections, timeout, combiner)) + result = EOF; + + /* Validate and close combiner */ + result = ValidateAndCloseCombiner(combiner) ? result : EOF; + +finish: + /* Both GXIDs used for PREPARE and COMMIT PREPARED are discarded from GTM snapshot here */ + CommitPreparedTranGTM(gxid, prepared_gxid); + + return result; +} + + +/* * Commit current transaction on data nodes where it has been started */ void diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 0c6208c..91456c4 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -337,6 +337,15 @@ ProcessUtility(Node *parsetree, break; case TRANS_STMT_PREPARE: +#ifdef PGXC + /* + * If 2PC if invoked from application, transaction is first prepared on Datanodes. 
+ * 2PC file is not written for Coordinators to keep the possiblity + * of a COMMIT PREPARED on a separate Coordinator + */ + if (IS_PGXC_COORDINATOR) + DataNodePrepare(stmt->gid); +#endif if (!PrepareTransactionBlock(stmt->gid)) { /* report unsuccessful commit in completionTag */ @@ -346,13 +355,46 @@ ProcessUtility(Node *parsetree, break; case TRANS_STMT_COMMIT_PREPARED: +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + DataNodeCommitPrepared(stmt->gid); +#endif PreventTransactionChain(isTopLevel, "COMMIT PREPARED"); + +#ifdef PGXC + if (IS_PGXC_DATANODE) + { + /* + * 2PC file of Coordinator is not flushed to disk when transaction is prepared + * so just skip this part. + */ +#endif FinishPreparedTransaction(stmt->gid, true); +#ifdef PGXC + } +#endif break; case TRANS_STMT_ROLLBACK_PREPARED: +#ifdef PGXC + if (IS_PGXC_COORDINATOR) + DataNodeRollbackPrepared(stmt->gid); +#endif + PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED"); + +#ifdef PGXC + if (IS_PGXC_DATANODE) + { + /* + * 2PC file of Coordinator is not flushed to disk when transaction is prepared + * so just skip this part. 
+ */ +#endif FinishPreparedTransaction(stmt->gid, false); +#ifdef PGXC + } +#endif break; case TRANS_STMT_ROLLBACK: diff --git a/src/gtm/client/fe-protocol.c b/src/gtm/client/fe-protocol.c index 0847b0d..ff73b8d 100644 --- a/src/gtm/client/fe-protocol.c +++ b/src/gtm/client/fe-protocol.c @@ -362,19 +362,18 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result) break; case TXN_BEGIN_GETGXID_AUTOVACUUM_RESULT: case TXN_PREPARE_RESULT: + case TXN_BEING_PREPARED_RESULT: if (gtmpqGetnchar((char *)&result->gr_resdata.grd_gxid, sizeof (GlobalTransactionId), conn)) result->gr_status = -1; break; case TXN_COMMIT_RESULT: + case TXN_COMMIT_PREPARED_RESULT: case TXN_ROLLBACK_RESULT: if (gtmpqGetnchar((char *)&result->gr_resdata.grd_gxid, sizeof (GlobalTransactionId), conn)) - { result->gr_status = -1; - break; - } break; case TXN_GET_GXID_RESULT: @@ -531,6 +530,60 @@ gtmpqParseSuccess(GTM_Conn *conn, GTM_Result *result) case TXN_GET_ALL_PREPARED_RESULT: break; + case TXN_GET_GID_DATA_RESULT: + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn_get_gid_data.gxid, + sizeof (GlobalTransactionId), conn)) + { + result->gr_status = -1; + break; + } + if (gtmpqGetnchar((char *)&result->gr_resdata.grd_txn_get_gid_data.prepared_gxid, + sizeof (GlobalTransactionId), conn)) + { + result->gr_status = -1; + break; + } + if (gtmpqGetInt(&result->gr_resdata.grd_txn_get_gid_data.datanodecnt, + sizeof (int32), conn)) + { + result->gr_status = -1; + break; + } + if ((result->gr_resdata.grd_txn_get_gid_data.datanodes = (PGXC_NodeId *) + malloc(sizeof(PGXC_NodeId) * result->gr_resdata.grd_txn_get_gid_data.datanodecnt)) == NULL) + { + result->gr_status = -1; + break; + } + if (gtmpqGetnchar((char *)result->gr_resdata.grd_txn_get_gid_data.datanodes, + sizeof(PGXC_NodeId) * result->gr_resdata.grd_txn_get_gid_data.datanodecnt, conn)) + { + result->gr_status = -1; + break; + } + if (gtmpqGetInt(&result->gr_resdata.grd_txn_get_gid_data.coordcnt, + sizeof (int32), conn)) + { + result->gr_status = 
-1; + break; + } + if (result->gr_resdata.grd_txn_get_gid_data.coordcnt != 0) + { + if ((result->gr_resdata.grd_txn_get_gid_data.coordinators = (PGXC_NodeId *) + malloc(sizeof(PGXC_NodeId) * result->gr_resdata.grd_txn_get_gid_data.coordcnt)) == NULL) + { + result->gr_status = -1; + break; + } + if (gtmpqGetnchar((char *)result->gr_resdata.grd_txn_get_gid_data.coordinators, + sizeof(PGXC_NodeId) * result->gr_resdata.grd_txn_get_gid_data.coordcnt, conn)) + { + result->gr_status = -1; + break; + } + } + break; + default: printfGTMPQExpBuffer(&conn->errorMessage, "unexpected result type from server; result typr was \"%d\"\n", diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c index 35f81ae..54b75fd 100644 --- a/src/gtm/client/gtm_client.c +++ b/src/gtm/client/gtm_client.c @@ -135,6 +135,7 @@ receive_failed: send_failed: return InvalidGlobalTransactionId; } + int commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid) { @@ -175,7 +176,48 @@ commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid) receive_failed: send_failed: return -1; +} + +int +commit_prepared_transaction(GTM_Conn *conn, GlobalTransactionId gxid, GlobalTransactionId prepared_gxid) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_TXN_COMMIT_PREPARED, sizeof (GTM_MessageType), conn) || + gtmpqPutc(true, conn) || + gtmpqPutnchar((char *)&gxid, sizeof (GlobalTransactionId), conn) || + gtmpqPutc(true, conn) || + gtmpqPutnchar((char *)&prepared_gxid, sizeof (GlobalTransactionId), conn)) + goto send_failed; + + /* Finish the message */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backends gets it */ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; 
+ + if (res->gr_status == 0) + { + Assert(res->gr_type == TXN_COMMIT_PREPARED_RESULT); + Assert(res->gr_resdata.grd_gxid == gxid); + } +send_failed: +receive_failed: + return -1; } int @@ -222,19 +264,71 @@ send_failed: } int -prepare_transaction(GTM_Conn *conn, GlobalTransactionId gxid, - int nodecnt, PGXC_NodeId nodes[]) +being_prepared_transaction(GTM_Conn *conn, GlobalTransactionId gxid, char *gid, + int datanodecnt, PGXC_NodeId datanodes[], int coordcnt, + PGXC_NodeId coordinators[]) { GTM_Result *res = NULL; time_t finish_time; /* Start the message. */ if (gtmpqPutMsgStart('C', true, conn) || - gtmpqPutInt(MSG_TXN_PREPARE, sizeof (GTM_MessageType), conn) || + gtmpqPutInt(MSG_TXN_BEING_PREPARED, sizeof (GTM_MessageType), conn) || gtmpqPutc(true, conn) || gtmpqPutnchar((char *)&gxid, sizeof (GlobalTransactionId), conn) || - gtmpqPutInt(nodecnt, sizeof (int), conn) || - gtmpqPutnchar((char *)nodes, sizeof (PGXC_NodeId) * nodecnt, conn)) + /* Send also GID for an explicit prepared transaction */ + gtmpqPutInt(strlen(gid), sizeof (GTM_GIDLen), conn) || + gtmpqPutnchar((char *) gid, strlen(gid), conn) || + gtmpqPutInt(datanodecnt, sizeof (int), conn) || + gtmpqPutnchar((char *)datanodes, sizeof (PGXC_NodeId) * datanodecnt, conn) || + gtmpqPutInt(coordcnt, sizeof (int), conn)) + goto send_failed; + + /* Coordinator connections are not always involved in a transaction */ + if (coordcnt != 0 && gtmpqPutnchar((char *)coordinators, sizeof (PGXC_NodeId) * coordcnt, conn)) + goto send_failed; + + /* Finish the message. */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. 
*/ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + if (res->gr_status == 0) + { + Assert(res->gr_type == TXN_BEING_PREPARED_RESULT); + Assert(res->gr_resdata.grd_gxid == gxid); + } + + return res->gr_status; + +receive_failed: +send_failed: + return -1; +} + + +int +prepare_transaction(GTM_Conn *conn, GlobalTransactionId gxid) +{ + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message. */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_TXN_PREPARE, sizeof (GTM_MessageType), conn) || + gtmpqPutc(true, conn) || + gtmpqPutnchar((char *)&gxid, sizeof (GlobalTransactionId), conn)) goto send_failed; /* Finish the message. */ @@ -266,6 +360,64 @@ send_failed: return -1; } +int +get_gid_data(GTM_Conn *conn, + GTM_IsolationLevel isolevel, + char *gid, + GlobalTransactionId *gxid, + GlobalTransactionId *prepared_gxid, + int *datanodecnt, + PGXC_NodeId **datanodes, + int *coordcnt, + PGXC_NodeId **coordinators) +{ + bool txn_read_only = false; + GTM_Result *res = NULL; + time_t finish_time; + + /* Start the message */ + if (gtmpqPutMsgStart('C', true, conn) || + gtmpqPutInt(MSG_TXN_GET_GID_DATA, sizeof (GTM_MessageType), conn) || + gtmpqPutInt(isolevel, sizeof (GTM_IsolationLevel), conn) || + gtmpqPutc(txn_read_only, conn) || + /* Send also GID for an explicit prepared transaction */ + gtmpqPutInt(strlen(gid), sizeof (GTM_GIDLen), conn) || + gtmpqPutnchar((char *) gid, strlen(gid), conn)) + goto send_failed; + + /* Finish the message */ + if (gtmpqPutMsgEnd(conn)) + goto send_failed; + + /* Flush to ensure backend gets it. 
*/ + if (gtmpqFlush(conn)) + goto send_failed; + + finish_time = time(NULL) + CLIENT_GTM_TIMEOUT; + if (gtmpqWaitTimed(true, false, conn, finish_time) || + gtmpqReadData(conn) < 0) + goto receive_failed; + + if ((res = GTMPQgetResult(conn)) == NULL) + goto receive_failed; + + if (res->gr_status == 0) + { + *gxid = res->gr_resdata.grd_txn_get_gid_data.gxid; + *prepared_gxid = res->gr_resdata.grd_txn_get_gid_data.prepared_gxid; + *datanodes = res->gr_resdata.grd_txn_get_gid_data.datanodes; + *coordinators = res->gr_resdata.grd_txn_get_gid_data.coordinators; + *datanodecnt = res->gr_resdata.grd_txn_get_gid_data.datanodecnt; + *coordcnt = res->gr_resdata.grd_txn_get_gid_data.coordcnt; + } + + return res->gr_status; + +receive_failed: +send_failed: + return -1; +} + /* * Snapshot Management API */ diff --git a/src/gtm/main/gtm_txn.c b/src/gtm/main/gtm_txn.c index 2205167..949c123 100644 --- a/src/gtm/main/gtm_txn.c +++ b/src/gtm/main/gtm_txn.c @@ -149,6 +149,35 @@ GTM_GXIDToHandle(GlobalTransactionId gxid) } /* + * Given the GID (for a prepared transaction), find the corresponding + * transaction handle. 
+ */ +GTM_TransactionHandle +GTM_GIDToHandle(char *gid) +{ + ListCell *elem = NULL; + GTM_TransactionInfo *gtm_txninfo = NULL; + + GTM_RWLockAcquire(>MTransactions.gt_TransArrayLock, GTM_LOCKMODE_READ); + + foreach(elem, GTMTransactions.gt_open_transactions) + { + gtm_txninfo = (GTM_TransactionInfo *)lfirst(elem); + if (gtm_txninfo->gti_gid && strcmp(gid,gtm_txninfo->gti_gid) == 0) + break; + gtm_txninfo = NULL; + } + + GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); + + if (gtm_txninfo != NULL) + return gtm_txninfo->gti_handle; + else + return InvalidTransactionHandle; +} + + +/* * Given the transaction handle, find the corresponding transaction info * structure * @@ -159,7 +188,7 @@ GTM_GXIDToHandle(GlobalTransactionId gxid) GTM_TransactionInfo * GTM_HandleToTransactionInfo(GTM_TransactionHandle handle) { - GTM_TransactionInfo *gtm_txninfo = NULL; + GTM_TransactionInfo *gtm_txninfo = NULL; if ((handle < 0) || (handle > GTM_MAX_GLOBAL_TRANSACTIONS)) { @@ -180,6 +209,7 @@ GTM_HandleToTransactionInfo(GTM_TransactionHandle handle) return gtm_txninfo; } + /* * Remove the given transaction info structures from the global array. 
If the * calling thread does not have enough cached structures, we in fact keep the @@ -220,9 +250,27 @@ GTM_RemoveTransInfoMulti(GTM_TransactionInfo *gtm_txninfo[], int txn_count) * Now mark the transaction as aborted and mark the structure as not-in-use */ gtm_txninfo[ii]->gti_state = GTM_TXN_ABORTED; - gtm_txninfo[ii]->gti_nodecount = 0; + gtm_txninfo[ii]->gti_datanodecount = 0; + gtm_txninfo[ii]->gti_coordcount = 0; gtm_txninfo[ii]->gti_in_use = false; gtm_txninfo[ii]->gti_snapshot_set = false; + + /* Clean-up also structures that were used for prepared transactions */ + if (gtm_txninfo[ii]->gti_gid) + { + pfree(gtm_txninfo[ii]->gti_gid); + gtm_txninfo[ii]->gti_gid = NULL; + } + if (gtm_txninfo[ii]->gti_coordinators) + { + pfree(gtm_txninfo[ii]->gti_coordinators); + gtm_txninfo[ii]->gti_coordinators = NULL; + } + if (gtm_txninfo[ii]->gti_datanodes) + { + pfree(gtm_txninfo[ii]->gti_datanodes); + gtm_txninfo[ii]->gti_datanodes = NULL; + } } GTM_RWLockRelease(>MTransactions.gt_TransArrayLock); @@ -252,15 +300,21 @@ GTM_RemoveAllTransInfos(int backend_id) while (cell != NULL) { GTM_TransactionInfo *gtm_txninfo = lfirst(cell); - /* check if current entry is associated with the thread */ + /* + * Check if current entry is associated with the thread + * A transaction in prepared state has to be kept alive in the structure. + * It will be committed by another thread than this one. 
+ */ if ((gtm_txninfo->gti_in_use) && + (gtm_txninfo->gti_state != GTM_TXN_PREPARED) && + (gtm_txninfo->gti_state != GTM_TXN_PREPARE_IN_PROGRESS) && (gtm_txninfo->gti_thread_id == thread_id) && ((gtm_txninfo->gti_backend_id == backend_id) || (backend_id == -1))) { /* remove the entry */ GTMTransactions.gt_open_transactions = list_delete_cell(GTMTransactions.gt_open_transactions, cell, prev); - /* update the latestComletedXid */ + /* update the latestCompletedXid */ if (GlobalTransactionIdIsNormal(gtm_txninfo->gti_gxid) && GlobalTransactionIdFollowsOrEquals(gtm_txninfo->gti_gxid, GTMTransactions.gt_latestCompletedXid)) @@ -272,10 +326,27 @@ GTM_RemoveAllTransInfos(int backend_id) * Now mark the transaction as aborted and mark the structure as not-in-use */ gtm_txninfo->gti_state = GTM_TXN_ABORTED; - gtm_txninfo->gti_nodecount = 0; + gtm_txninfo->gti_datanodecount = 0; + gtm_txninfo->gti_coordcount = 0; gtm_txninfo->gti_in_use = false; gtm_txninfo->gti_snapshot_set = false; - + + if (gtm_txninfo->gti_gid) + { + pfree(gtm_txninfo->gti_gid); + gtm_txninfo->gti_gid = NULL; + } + if (gtm_txninfo->gti_coordinators) + { + pfree(gtm_txninfo->gti_coordinators); + gtm_txninfo->gti_coordinators = NULL; + } + if (gtm_txninfo->gti_datanodes) + { + pfree(gtm_txninfo->gti_datanodes); + gtm_txninfo->gti_datanodes = NULL; + } + /* move to next cell in the list */ if (prev) cell = lnext(prev); @@ -583,7 +654,7 @@ GTM_BeginTransactionMulti(GTM_CoordinatorId coord_id, * without removing the corresponding references from the global array */ oldContext = MemoryContextSwitchTo(TopMostMemoryContext); - + for (kk = 0; kk < txn_count; kk++) { int ii, jj, startslot; @@ -627,10 +698,16 @@ GTM_BeginTransactionMulti(GTM_CoordinatorId coord_id, gtm_txninfo[kk]->gti_backend_id = connid[kk]; gtm_txninfo[kk]->gti_in_use = true; + gtm_txninfo[kk]->gti_coordcount = 0; + gtm_txninfo[kk]->gti_datanodes = 0; + gtm_txninfo[kk]->gti_gid = NULL; + gtm_txninfo[kk]->gti_coordinators = NULL; + 
gtm_txninfo[kk]->gti_datanodes = NULL; + gtm_txninfo[kk]->gti_handle = ii; gtm_txninfo[kk]->gti_vacuum = false; gtm_txninfo[kk]->gti_thread_id = pthread_self(); - GTMTransactions.gt_lastslot = ii; + GTMTransactions.gt_lastslot = ii; txns[kk] = ii; @@ -761,6 +838,29 @@ GTM_CommitTransactionMulti(GTM_TransactionHandle txn[], int txn_count, int statu } /* + * Prepare a transaction + */ +int +GTM_PrepareTransaction(GTM_TransactionHandle txn) +{ + GTM_TransactionInfo *gtm_txninfo = NULL; + + gtm_txninfo = GTM_HandleToTransactionInfo(txn); + + if (gtm_txninfo == NULL) + return STATUS_ERROR; + + /* + * Mark the transaction as prepared + */ + GTM_RWLockAcquire(>m_txninfo->gti_lock, GTM_LOCKMODE_WRITE); + gtm_txninfo->gti_state = GTM_TXN_PREPARED; + GTM_RWLockRelease(>m_txninfo->gti_lock); + + return STATUS_OK; +} + +/* * Commit a transaction */ int @@ -775,9 +875,12 @@ GTM_CommitTransaction(GTM_TransactionHandle txn) * Prepare a transaction */ int -GTM_PrepareTransaction(GTM_TransactionHandle txn, - uint32 nodecnt, - PGXC_NodeId nodes[]) +GTM_BeingPreparedTransaction(GTM_TransactionHandle txn, + char *gid, + uint32 datanodecnt, + PGXC_NodeId datanodes[], + uint32 coordcnt, + PGXC_NodeId coordinators[]) { GTM_TransactionInfo *gtm_txninfo = GTM_HandleToTransactionInfo(txn); @@ -785,15 +888,27 @@ GTM_PrepareTransaction(GTM_TransactionHandle txn, return STATUS_ERROR; /* - * Mark the transaction as being aborted + * Mark the transaction as being prepared */ GTM_RWLockAcquire(>m_txninfo->gti_lock, GTM_LOCKMODE_WRITE); - + gtm_txninfo->gti_state = GTM_TXN_PREPARE_IN_PROGRESS; - gtm_txninfo->gti_nodecount = nodecnt; - if (gtm_txninfo->gti_nodes == NULL) - gtm_txninfo->gti_nodes = (PGXC_NodeId *)MemoryContextAlloc(TopMostMemoryContext, sizeof (PGXC_NodeId) * GTM_MAX_2PC_NODES); - memcpy(gtm_txninfo->gti_nodes, nodes, sizeof (PGXC_NodeId) * nodecnt); + gtm_txninfo->gti_datanodecount = datanodecnt; + gtm_txninfo->gti_coordcount = coordcnt; + + if (gtm_txninfo->gti_datanodes == NULL) 
+ gtm_txninfo->gti_datanodes = (PGXC_NodeId *)MemoryContextAlloc(TopMostMemoryContext, sizeof (PGXC_NodeId) * GTM_MAX_2PC_NODES); + memcpy(gtm_txninfo->gti_datanodes, datanodes, sizeof (PGXC_NodeId) * datanodecnt); + + /* It is possible that no coordinator is involved in a transaction */ + if (coordcnt != 0 && gtm_txninfo->gti_coordinators == NULL) + gtm_txninfo->gti_coordinators = (PGXC_NodeId *)MemoryContextAlloc(TopMostMemoryContext, sizeof (PGXC_NodeId) * GTM_MAX_2PC_NODES); + if (coordcnt != 0) + memcpy(gtm_txninfo->gti_coordinators, coordinators, sizeof (PGXC_NodeId) * coordcnt); + + if (gtm_txninfo->gti_gid == NULL) + gtm_txninfo->gti_gid = (char *)MemoryContextAlloc(TopMostMemoryContext, GTM_MAX_GID_LEN); + memcpy(gtm_txninfo->gti_gid, gid, strlen(gid)); GTM_RWLockRelease(>m_txninfo->gti_lock); @@ -804,12 +919,53 @@ GTM_PrepareTransaction(GTM_TransactionHandle txn, * Same as GTM_PrepareTransaction but takes GXID as input */ int -GTM_PrepareTransactionGXID(GlobalTransactionId gxid, - uint32 nodecnt, - PGXC_NodeId nodes[]) +GTM_BeingPreparedTransactionGXID(GlobalTransactionId gxid, + char *gid, + uint32 datanodecnt, + PGXC_NodeId datanodes[], + uint32 coordcnt, + PGXC_NodeId coordinators[]) { GTM_TransactionHandle txn = GTM_GXIDToHandle(gxid); - return GTM_PrepareTransaction(txn, nodecnt, nodes); + return GTM_BeingPreparedTransaction(txn, gid, datanodecnt, datanodes, coordcnt, coordinators); +} + +int +GTM_GetGIDData(GTM_TransactionHandle prepared_txn, + GlobalTransactionId *prepared_gxid, + int *datanodecnt, + PGXC_NodeId **datanodes, + int *coordcnt, + PGXC_NodeId **coordinators) +{ + GTM_TransactionInfo *gtm_txninfo = NULL; + MemoryContext oldContext; + + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + gtm_txninfo = GTM_HandleToTransactionInfo(prepared_txn); + if (gtm_txninfo == NULL) + return STATUS_ERROR; + + /* then get the necessary Data */ + *prepared_gxid = gtm_txninfo->gti_gxid; + *datanodecnt = gtm_txninfo->gti_datanodecount; + 
*coordcnt = gtm_txninfo->gti_coordcount; + + *datanodes = (PGXC_NodeId *) palloc(sizeof (PGXC_NodeId) * gtm_txninfo->gti_datanodecount); + memcpy(*datanodes, gtm_txninfo->gti_datanodes, + sizeof (PGXC_NodeId) * gtm_txninfo->gti_datanodecount); + + if (coordcnt != 0) + { + *coordinators = (PGXC_NodeId *) palloc(sizeof (PGXC_NodeId) * gtm_txninfo->gti_coordcount); + memcpy(*coordinators, gtm_txninfo->gti_coordinators, + sizeof (PGXC_NodeId) * gtm_txninfo->gti_coordcount); + } + + MemoryContextSwitchTo(oldContext); + + return STATUS_OK; } /* @@ -1146,6 +1302,174 @@ ProcessCommitTransactionCommand(Port *myport, StringInfo message) } /* + * Process MSG_TXN_COMMIT_PREPARED_MSG + * Commit a prepared transaction + * Here the GXID used for PREPARE and COMMIT PREPARED are both committed + */ +void +ProcessCommitPreparedTransactionCommand(Port *myport, StringInfo message) +{ + StringInfoData buf; + int txn_count = 2; /* PREPARE and COMMIT PREPARED gxid's */ + GTM_TransactionHandle txn[txn_count]; + GlobalTransactionId gxid[txn_count]; + MemoryContext oldContext; + int status[txn_count]; + int isgxid[txn_count]; + int ii, count; + + for (ii = 0; ii < txn_count; ii++) + { + isgxid[ii] = pq_getmsgbyte(message); + if (isgxid[ii]) + { + const char *data = pq_getmsgbytes(message, sizeof (gxid[ii])); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid GXID"))); + memcpy(&gxid[ii], data, sizeof (gxid[ii])); + txn[ii] = GTM_GXIDToHandle(gxid[ii]); + elog(DEBUG1, "ProcessCommitTransactionCommandMulti: gxid(%u), handle(%u)", gxid[ii], txn[ii]); + } + else + { + const char *data = pq_getmsgbytes(message, sizeof (txn[ii])); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid Transaction Handle"))); + memcpy(&txn[ii], data, sizeof (txn[ii])); + elog(DEBUG1, "ProcessCommitTransactionCommandMulti: handle(%u)", txn[ii]); + } + } + + pq_getmsgend(message); + + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + /* 
+ * Commit the prepared transaction. + */ + count = GTM_CommitTransactionMulti(txn, txn_count, status); + + MemoryContextSwitchTo(oldContext); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_COMMIT_PREPARED_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, (char *)&gxid[0], sizeof(GlobalTransactionId)); + pq_sendint(&buf, status[0], 4); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); + return; +} + + +/* + * Process MSG_TXN_GET_GID_DATA + * This message is used after at the beginning of a COMMIT PREPARED + * or a ROLLBACK PREPARED. + * For a given GID the following info is returned: + * - a fresh GXID, + * - GXID of the transaction that made the prepare + * - datanode and coordinator node list involved in the prepare + */ +void +ProcessGetGIDDataTransactionCommand(Port *myport, StringInfo message) +{ + StringInfoData buf; + char *gid; + int gidlen; + GTM_IsolationLevel txn_isolation_level; + bool txn_read_only; + MemoryContext oldContext; + GTM_TransactionHandle txn, prepared_txn; + /* Data to be sent back to client */ + GlobalTransactionId gxid, prepared_gxid; + PGXC_NodeId *coordinators = NULL; + PGXC_NodeId *datanodes = NULL; + int datanodecnt,coordcnt; + + /* take the isolation level and read_only instructions */ + txn_isolation_level = pq_getmsgint(message, sizeof (GTM_IsolationLevel)); + txn_read_only = pq_getmsgbyte(message); + + /* receive GID */ + gidlen = pq_getmsgint(message, sizeof (GTM_GIDLen)); + gid = (char *)pq_getmsgbytes(message, gidlen); + + pq_getmsgend(message); + + prepared_txn = GTM_GIDToHandle(gid); + if (prepared_txn == InvalidTransactionHandle) + ereport(ERROR, + (EINVAL, + errmsg("Failed to get GID Data for prepared transaction"))); + + oldContext = MemoryContextSwitchTo(TopMemoryContext); + + /* First get the GXID for the new transaction */ + txn = 
GTM_BeginTransaction(0, txn_isolation_level, txn_read_only); + if (txn == InvalidTransactionHandle) + ereport(ERROR, + (EINVAL, + errmsg("Failed to start a new transaction"))); + + gxid = GTM_GetGlobalTransactionId(txn); + if (gxid == InvalidGlobalTransactionId) + ereport(ERROR, + (EINVAL, + errmsg("Failed to get a new transaction id"))); + + /* + * Make the internal process, get the prepared information from GID. + */ + if (GTM_GetGIDData(prepared_txn, &prepared_gxid, &datanodecnt, &datanodes, &coordcnt, &coordinators) != STATUS_OK) + { + ereport(ERROR, + (EINVAL, + errmsg("Failed to get the information of prepared transaction"))); + } + + MemoryContextSwitchTo(oldContext); + + /* + * Send a SUCCESS message back to the client + */ + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_GET_GID_DATA_RESULT, 4); + if (myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + /* Send the two GXIDs */ + pq_sendbytes(&buf, (char *)&gxid, sizeof(GlobalTransactionId)); + pq_sendbytes(&buf, (char *)&prepared_gxid, sizeof(GlobalTransactionId)); + /* Then send the data linked to nodes involved in prepare */ + pq_sendint(&buf, datanodecnt, 4); + pq_sendbytes(&buf, (char *)datanodes, sizeof(PGXC_NodeId) * datanodecnt); + pq_sendint(&buf, coordcnt, 4); + if (coordcnt != 0) + pq_sendbytes(&buf, (char *)coordinators, sizeof(PGXC_NodeId) * coordcnt); + + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); + return; +} + +/* * Process MSG_TXN_ROLLBACK message */ void @@ -1352,18 +1676,21 @@ ProcessRollbackTransactionCommandMulti(Port *myport, StringInfo message) } /* - * Process MSG_TXN_PREPARE message + * Process MSG_TXN_BEING_PREPARED message */ void -ProcessPrepareTransactionCommand(Port *myport, StringInfo message) +ProcessBeingPreparedTransactionCommand(Port *myport, StringInfo message) { StringInfoData buf; GTM_TransactionHandle txn; 
GlobalTransactionId gxid; int isgxid = 0; - int nodecnt; - PGXC_NodeId *nodes; + int datanodecnt,coordcnt; + GTM_GIDLen gidlen; + PGXC_NodeId *coordinators = NULL; + PGXC_NodeId *datanodes = NULL; MemoryContext oldContext; + char *gid; isgxid = pq_getmsgbyte(message); @@ -1387,26 +1714,104 @@ ProcessPrepareTransactionCommand(Port *myport, StringInfo message) memcpy(&txn, data, sizeof (txn)); } - nodecnt = pq_getmsgint(message, sizeof (nodecnt)); - nodes = (PGXC_NodeId *) palloc(sizeof (PGXC_NodeId) * nodecnt); - memcpy(nodes, pq_getmsgbytes(message, sizeof (PGXC_NodeId) * nodecnt), - sizeof (PGXC_NodeId) * nodecnt); + /* get GID */ + gidlen = pq_getmsgint(message, sizeof (GTM_GIDLen)); + gid = (char *)pq_getmsgbytes(message, gidlen); + /* Get Datanode Data */ + datanodecnt = pq_getmsgint(message, 4); + datanodes = (PGXC_NodeId *) palloc(sizeof (PGXC_NodeId) * datanodecnt); + memcpy(datanodes, pq_getmsgbytes(message, sizeof (PGXC_NodeId) * datanodecnt), + sizeof (PGXC_NodeId) * datanodecnt); + + /* Get Coordinator Data, can be possibly NULL */ + coordcnt = pq_getmsgint(message, 4); + if (coordcnt != 0) + { + coordinators = (PGXC_NodeId *) palloc(sizeof (PGXC_NodeId) * coordcnt); + memcpy(coordinators, pq_getmsgbytes(message, sizeof (PGXC_NodeId) * coordcnt), + sizeof (PGXC_NodeId) * coordcnt); + } pq_getmsgend(message); - oldContext = MemoryContextSwitchTo(TopMemoryContext); + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); /* * Prepare the transaction */ - if (GTM_PrepareTransaction(txn, nodecnt, nodes) != STATUS_OK) + if (GTM_BeingPreparedTransaction(txn, gid, datanodecnt, datanodes, coordcnt, coordinators) != STATUS_OK) ereport(ERROR, (EINVAL, - errmsg("Failed to commit the transaction"))); + errmsg("Failed to prepare the transaction"))); MemoryContextSwitchTo(oldContext); - pfree(nodes); + if (datanodes) + pfree(datanodes); + if (coordinators) + pfree(coordinators); + + pq_beginmessage(&buf, 'S'); + pq_sendint(&buf, TXN_BEING_PREPARED_RESULT, 4); + if 
(myport->is_proxy) + { + GTM_ProxyMsgHeader proxyhdr; + proxyhdr.ph_conid = myport->conn_id; + pq_sendbytes(&buf, (char *)&proxyhdr, sizeof (GTM_ProxyMsgHeader)); + } + pq_sendbytes(&buf, (char *)&gxid, sizeof(GlobalTransactionId)); + pq_endmessage(myport, &buf); + + if (!myport->is_proxy) + pq_flush(myport); + return; +} + +/* + * Process MSG_TXN_PREPARE message + */ +void +ProcessPrepareTransactionCommand(Port *myport, StringInfo message) +{ + StringInfoData buf; + GTM_TransactionHandle txn; + GlobalTransactionId gxid; + int isgxid = 0; + MemoryContext oldContext; + int status = STATUS_OK; + + isgxid = pq_getmsgbyte(message); + + if (isgxid) + { + const char *data = pq_getmsgbytes(message, sizeof (gxid)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid GXID"))); + memcpy(&gxid, data, sizeof (gxid)); + txn = GTM_GXIDToHandle(gxid); + } + else + { + const char *data = pq_getmsgbytes(message, sizeof (txn)); + if (data == NULL) + ereport(ERROR, + (EPROTO, + errmsg("Message does not contain valid Transaction Handle"))); + memcpy(&txn, data, sizeof (txn)); + } + + pq_getmsgend(message); + + oldContext = MemoryContextSwitchTo(TopMostMemoryContext); + + /* + * Commit the transaction + */ + status = GTM_PrepareTransaction(txn); + + MemoryContextSwitchTo(oldContext); pq_beginmessage(&buf, 'S'); pq_sendint(&buf, TXN_PREPARE_RESULT, 4); @@ -1424,6 +1829,7 @@ ProcessPrepareTransactionCommand(Port *myport, StringInfo message) return; } + /* * Process MSG_TXN_GET_GXID message */ diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 667967a..1a6e546 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -769,12 +769,15 @@ ProcessCommand(Port *myport, StringInfo input_message) case MSG_TXN_BEGIN_GETGXID: case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM: case MSG_TXN_PREPARE: + case MSG_TXN_BEING_PREPARED: case MSG_TXN_COMMIT: + case MSG_TXN_COMMIT_PREPARED: case MSG_TXN_ROLLBACK: case MSG_TXN_GET_GXID: case MSG_TXN_BEGIN_GETGXID_MULTI: 
case MSG_TXN_COMMIT_MULTI: case MSG_TXN_ROLLBACK_MULTI: + case MSG_TXN_GET_GID_DATA: ProcessTransactionCommand(myport, mtype, input_message); break; @@ -795,7 +798,7 @@ ProcessCommand(Port *myport, StringInfo input_message) case MSG_SEQUENCE_ALTER: ProcessSequenceCommand(myport, mtype, input_message); break; - + case MSG_TXN_GET_STATUS: case MSG_TXN_GET_ALL_PREPARED: ProcessQueryCommand(myport, mtype, input_message); @@ -938,39 +941,47 @@ ProcessTransactionCommand(Port *myport, GTM_MessageType mtype, StringInfo messag switch (mtype) { - case MSG_TXN_BEGIN: + case MSG_TXN_BEGIN: ProcessBeginTransactionCommand(myport, message); break; - case MSG_TXN_BEGIN_GETGXID: + case MSG_TXN_BEGIN_GETGXID: ProcessBeginTransactionGetGXIDCommand(myport, message); break; - case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM: + case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM: ProcessBeginTransactionGetGXIDAutovacuumCommand(myport, message); break; - case MSG_TXN_BEGIN_GETGXID_MULTI: + case MSG_TXN_BEGIN_GETGXID_MULTI: ProcessBeginTransactionGetGXIDCommandMulti(myport, message); break; - case MSG_TXN_PREPARE: + case MSG_TXN_BEING_PREPARED: + ProcessBeingPreparedTransactionCommand(myport, message); + break; + + case MSG_TXN_PREPARE: ProcessPrepareTransactionCommand(myport, message); break; - case MSG_TXN_COMMIT: + case MSG_TXN_COMMIT: ProcessCommitTransactionCommand(myport, message); break; - case MSG_TXN_ROLLBACK: + case MSG_TXN_COMMIT_PREPARED: + ProcessCommitPreparedTransactionCommand(myport, message); + break; + + case MSG_TXN_ROLLBACK: ProcessRollbackTransactionCommand(myport, message); break; - case MSG_TXN_COMMIT_MULTI: + case MSG_TXN_COMMIT_MULTI: ProcessCommitTransactionCommandMulti(myport, message); break; - case MSG_TXN_ROLLBACK_MULTI: + case MSG_TXN_ROLLBACK_MULTI: ProcessRollbackTransactionCommandMulti(myport, message); break; @@ -978,6 +989,9 @@ ProcessTransactionCommand(Port *myport, GTM_MessageType mtype, StringInfo messag ProcessGetGXIDTransactionCommand(myport, message); break; + case 
MSG_TXN_GET_GID_DATA: + ProcessGetGIDDataTransactionCommand(myport, message); + default: Assert(0); /* Shouldn't come here.. keep compiler quite */ } diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c index 66b1594..d9ca329 100644 --- a/src/gtm/proxy/proxy_main.c +++ b/src/gtm/proxy/proxy_main.c @@ -949,9 +949,12 @@ ProcessCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, case MSG_TXN_BEGIN_GETGXID: case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM: case MSG_TXN_PREPARE: + case MSG_TXN_BEING_PREPARED: case MSG_TXN_COMMIT: + case MSG_TXN_COMMIT_PREPARED: case MSG_TXN_ROLLBACK: case MSG_TXN_GET_GXID: + case MSG_TXN_GET_GID_DATA: ProcessTransactionCommand(conninfo, gtm_conn, mtype, input_message); break; @@ -1115,7 +1118,11 @@ ProcessResponse(GTMProxy_ThreadInfo *thrinfo, GTMProxy_CommandInfo *cmdinfo, case MSG_TXN_BEGIN: case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM: case MSG_TXN_PREPARE: + case MSG_TXN_BEING_PREPARED: + /* There are not so many 2PC from application messages, so just proxy it. 
*/ + case MSG_TXN_COMMIT_PREPARED: case MSG_TXN_GET_GXID: + case MSG_TXN_GET_GID_DATA: case MSG_SNAPSHOT_GXID_GET: case MSG_SEQUENCE_INIT: case MSG_SEQUENCE_GET_CURRENT: @@ -1165,8 +1172,6 @@ ProcessResponse(GTMProxy_ThreadInfo *thrinfo, GTMProxy_CommandInfo *cmdinfo, errmsg("invalid frontend message type %d", cmdinfo->ci_mtype))); } - - } /* ---------------- @@ -1302,7 +1307,10 @@ ProcessTransactionCommand(GTMProxy_ConnectionInfo *conninfo, GTM_Conn *gtm_conn, break; case MSG_TXN_BEGIN_GETGXID_AUTOVACUUM: - case MSG_TXN_PREPARE: + case MSG_TXN_PREPARE: + case MSG_TXN_BEING_PREPARED: + case MSG_TXN_GET_GID_DATA: + case MSG_TXN_COMMIT_PREPARED: GTMProxy_ProxyCommand(conninfo, gtm_conn, mtype, message); break; diff --git a/src/include/access/gtm.h b/src/include/access/gtm.h index 4878d92..6740c86 100644 --- a/src/include/access/gtm.h +++ b/src/include/access/gtm.h @@ -24,6 +24,23 @@ extern GlobalTransactionId BeginTranGTM(GTM_Timestamp *timestamp); extern GlobalTransactionId BeginTranAutovacuumGTM(void); extern int CommitTranGTM(GlobalTransactionId gxid); extern int RollbackTranGTM(GlobalTransactionId gxid); +extern int BeingPreparedTranGTM(GlobalTransactionId gxid, + char *gid, + int datanodecnt, + PGXC_NodeId datanodes[], + int coordcount, + PGXC_NodeId coordinators[]); +extern int PrepareTranGTM(GlobalTransactionId gxid); +extern int GetGIDDataGTM(char *gid, + GlobalTransactionId *gxid, + GlobalTransactionId *prepared_gxid, + int *datanodecnt, + PGXC_NodeId **datanodes, + int *coordcnt, + PGXC_NodeId **coordinators); +extern int CommitPreparedTranGTM(GlobalTransactionId gxid, + GlobalTransactionId prepared_gxid); + extern GTM_Snapshot GetSnapshotGTM(GlobalTransactionId gxid, bool canbe_grouped); /* Sequence interface APIs with GTM */ diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h index a3a1492..485f7fa 100644 --- a/src/include/access/twophase.h +++ b/src/include/access/twophase.h @@ -19,6 +19,10 @@ #include "storage/proc.h" #include 
"utils/timestamp.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#endif + /* * GlobalTransactionData is defined in twophase.c; other places have no * business knowing the internal definition. @@ -38,6 +42,10 @@ extern GlobalTransaction MarkAsPreparing(TransactionId xid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid); +#ifdef PGXC +extern void RemoveGXactCoord(GlobalTransaction gxact); +#endif + extern void StartPrepare(GlobalTransaction gxact); extern void EndPrepare(GlobalTransaction gxact); diff --git a/src/include/gtm/gtm_c.h b/src/include/gtm/gtm_c.h index 0a4c941..da15df3 100644 --- a/src/include/gtm/gtm_c.h +++ b/src/include/gtm/gtm_c.h @@ -38,6 +38,7 @@ typedef uint32 GlobalTransactionId; /* 32-bit global transaction ids */ typedef uint32 PGXC_NodeId; typedef uint32 GTM_CoordinatorId; typedef int16 GTMProxy_ConnID; +typedef uint32 GTM_GIDLen; #define InvalidGTMProxyConnID -1 diff --git a/src/include/gtm/gtm_client.h b/src/include/gtm/gtm_client.h index 9db6884..4fe4bcf 100644 --- a/src/include/gtm/gtm_client.h +++ b/src/include/gtm/gtm_client.h @@ -29,7 +29,9 @@ typedef union GTM_ResultData } grd_gxid_tp; /* TXN_BEGIN_GETGXID */ GlobalTransactionId grd_gxid; /* TXN_PREPARE + * TXN_BEING_PREPARED * TXN_COMMIT + * TXN_COMMIT_PREPARED * TXN_ROLLBACK */ @@ -70,6 +72,16 @@ typedef union GTM_ResultData int status[GTM_MAX_GLOBAL_TRANSACTIONS]; } grd_txn_snap_multi; + struct + { + GlobalTransactionId gxid; + GlobalTransactionId prepared_gxid; + int datanodecnt; + int coordcnt; + PGXC_NodeId *datanodes; + PGXC_NodeId *coordinators; + } grd_txn_get_gid_data; /* TXN_GET_GID_DATA_RESULT */ + /* * TODO * TXN_GET_STATUS @@ -111,9 +123,16 @@ void disconnect_gtm(GTM_Conn *conn); GlobalTransactionId begin_transaction(GTM_Conn *conn, GTM_IsolationLevel isolevel, GTM_Timestamp *timestamp); GlobalTransactionId begin_transaction_autovacuum(GTM_Conn *conn, GTM_IsolationLevel isolevel); int commit_transaction(GTM_Conn *conn, GlobalTransactionId gxid); +int 
commit_prepared_transaction(GTM_Conn *conn, GlobalTransactionId gxid, GlobalTransactionId prepared_gxid); int abort_transaction(GTM_Conn *conn, GlobalTransactionId gxid); -int prepare_transaction(GTM_Conn *conn, GlobalTransactionId gxid, - int nodecnt, PGXC_NodeId nodes[]); +int being_prepared_transaction(GTM_Conn *conn, GlobalTransactionId gxid, char *gid, + int datanodecnt, PGXC_NodeId datanodes[], + int coordcnt, PGXC_NodeId coordinators[]); +int prepare_transaction(GTM_Conn *conn, GlobalTransactionId gxid); +int get_gid_data(GTM_Conn *conn, GTM_IsolationLevel isolevel, char *gid, + GlobalTransactionId *gxid, GlobalTransactionId *prepared_gxid, + int *datanodecnt, PGXC_NodeId **datanodes, int *coordcnt, + PGXC_NodeId **coordinators); /* * Snapshot Management API diff --git a/src/include/gtm/gtm_msg.h b/src/include/gtm/gtm_msg.h index e76e762..e1730eb 100644 --- a/src/include/gtm/gtm_msg.h +++ b/src/include/gtm/gtm_msg.h @@ -22,11 +22,14 @@ typedef enum GTM_MessageType MSG_TXN_BEGIN, /* Start a new transaction */ MSG_TXN_BEGIN_GETGXID, /* Start a new transaction and get GXID */ MSG_TXN_BEGIN_GETGXID_MULTI, /* Start multiple new transactions and get GXIDs */ - MSG_TXN_PREPARE, /* Prepare a transation for commit */ + MSG_TXN_BEING_PREPARED, /* Begins to prepare a transation for commit */ MSG_TXN_COMMIT, /* Commit a running or prepared transaction */ MSG_TXN_COMMIT_MULTI, /* Commit multiple running or prepared transactions */ + MSG_TXN_COMMIT_PREPARED, /* Commit a prepared transaction */ + MSG_TXN_PREPARE, /* Finish preparing a transaction */ MSG_TXN_ROLLBACK, /* Rollback a transaction */ MSG_TXN_ROLLBACK_MULTI, /* Rollback multiple transactions */ + MSG_TXN_GET_GID_DATA, /* Get info associated with a GID, and get a GXID */ MSG_TXN_GET_GXID, /* Get a GXID for a transaction */ MSG_SNAPSHOT_GET, /* Get a global snapshot */ MSG_SNAPSHOT_GET_MULTI, /* Get multiple global snapshots */ @@ -59,10 +62,13 @@ typedef enum GTM_ResultType TXN_BEGIN_GETGXID_RESULT, 
TXN_BEGIN_GETGXID_MULTI_RESULT, TXN_PREPARE_RESULT, + TXN_BEING_PREPARED_RESULT, + TXN_COMMIT_PREPARED_RESULT, TXN_COMMIT_RESULT, TXN_COMMIT_MULTI_RESULT, TXN_ROLLBACK_RESULT, TXN_ROLLBACK_MULTI_RESULT, + TXN_GET_GID_DATA_RESULT, TXN_GET_GXID_RESULT, SNAPSHOT_GET_RESULT, SNAPSHOT_GET_MULTI_RESULT, diff --git a/src/include/gtm/gtm_txn.h b/src/include/gtm/gtm_txn.h index 2d78946..5e3a02c 100644 --- a/src/include/gtm/gtm_txn.h +++ b/src/include/gtm/gtm_txn.h @@ -116,8 +116,11 @@ typedef struct GTM_TransactionInfo GTM_IsolationLevel gti_isolevel; bool... [truncated message content] |
From: mason_s <ma...@us...> - 2010-09-06 23:58:58
|
Project "Postgres-XC". The branch, master has been updated via 19a8fa536779653524a1feb862c18277efa317f4 (commit) from 06c882f78694a31749746aad0cb76347a3f7bcef (commit) - Log ----------------------------------------------------------------- commit 19a8fa536779653524a1feb862c18277efa317f4 Author: Mason Sharp <ma...@us...> Date: Mon Sep 6 19:54:53 2010 -0400 Improved error handling. The primary focus is to better handle the case of a stopped or crashed data node on the coordinator. Also, before a rollback make sure connections are clean. If there was an error, tell the pooler to destroy the connections instead of returning them to the pools, even the data node connections that did not have an error but are involved in the statement. This is because there may be some remaining messages buffered or in transit, and could affect subsequent requests. diff --git a/src/backend/pgxc/pool/datanode.c b/src/backend/pgxc/pool/datanode.c index ba56ca1..31b5bc0 100644 --- a/src/backend/pgxc/pool/datanode.c +++ b/src/backend/pgxc/pool/datanode.c @@ -37,7 +37,6 @@ #include "utils/snapmgr.h" #include "../interfaces/libpq/libpq-fe.h" -#define NO_SOCKET -1 static int node_count = 0; static DataNodeHandle *handles = NULL; @@ -280,7 +279,8 @@ retry: { add_error_message(conn, "unexpected EOF on datanode connection"); elog(WARNING, "unexpected EOF on datanode connection"); - return EOF; + /* Should we read from the other connections before returning? */ + return EOF; } else { @@ -429,6 +429,18 @@ retry: } +/* + * Clear out socket data and buffer. + * Throw away any data. 
+ */ +void +clear_socket_data (DataNodeHandle *conn) +{ + do { + conn->inStart = conn->inCursor = conn->inEnd = 0; + } while (data_node_read_data(conn) > 0); +} + /* * Get one character from the connection buffer and advance cursor */ @@ -529,14 +541,20 @@ get_message(DataNodeHandle *conn, int *len, char **msg) } -/* Release all data node connections back to pool and release occupied memory */ +/* + * Release all data node connections back to pool and release occupied memory + * + * If force_drop is true, we force dropping all of the connections, such as after + * a rollback, which was likely issued due to an error. + */ void -release_handles(void) +release_handles(bool force_drop) { int i; int discard[NumDataNodes]; int ndisc = 0; + if (node_count == 0) return; @@ -546,7 +564,9 @@ release_handles(void) if (handle->sock != NO_SOCKET) { - if (handle->state != DN_CONNECTION_STATE_IDLE) + if (force_drop) + discard[ndisc++] = handle->nodenum; + else if (handle->state != DN_CONNECTION_STATE_IDLE) { elog(WARNING, "Connection to data node %d has unexpected state %d and will be dropped", handle->nodenum, handle->state); discard[ndisc++] = handle->nodenum; @@ -1070,6 +1090,12 @@ get_transaction_nodes(DataNodeHandle **connections) { for (i = 0; i < NumDataNodes; i++) { + /* + * We may want to consider also not returning connections with a + * state of DN_CONNECTION_STATE_ERROR_NOT_READY or + * DN_CONNECTION_STATE_ERROR_FATAL. + * ERROR_NOT_READY can happen if the data node abruptly disconnects. + */ if (handles[i].sock != NO_SOCKET && handles[i].transaction_status != 'I') connections[tran_count++] = &handles[i]; } @@ -1077,3 +1103,29 @@ get_transaction_nodes(DataNodeHandle **connections) return tran_count; } + +/* + * Return those node connections that appear to be active and + * have data to consume on them. 
+ */ +int +get_active_nodes (DataNodeHandle **connections) +{ + int active_count = 0; + int i; + + if (node_count) + { + for (i = 0; i < NumDataNodes; i++) + { + if (handles[i].sock != NO_SOCKET && + handles[i].state != DN_CONNECTION_STATE_IDLE && + handles[i].state != DN_CONNECTION_STATE_ERROR_NOT_READY && + handles[i].state != DN_CONNECTION_STATE_ERROR_FATAL) + connections[active_count++] = &handles[i]; + } + } + + return active_count; +} + diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index f065289..05dbe2e 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -15,6 +15,7 @@ *------------------------------------------------------------------------- */ +#include <time.h> #include "postgres.h" #include "access/gtm.h" #include "access/xact.h" @@ -30,6 +31,10 @@ #include "utils/tuplesort.h" #include "utils/snapmgr.h" +#define END_QUERY_TIMEOUT 20 +#define CLEAR_TIMEOUT 5 + + extern char *deparseSql(RemoteQueryState *scanstate); /* @@ -50,6 +55,9 @@ static int data_node_rollback(int conn_count, DataNodeHandle ** connections); static void clear_write_node_list(); +static int handle_response_clear(DataNodeHandle * conn); + + #define MAX_STATEMENTS_PER_TRAN 10 /* Variables to collect statistics */ @@ -761,7 +769,8 @@ HandleError(RemoteQueryState *combiner, char *msg_body, size_t len) { combiner->errorMessage = pstrdup(message); /* Error Code is exactly 5 significant bytes */ - memcpy(combiner->errorCode, code, 5); + if (code) + memcpy(combiner->errorCode, code, 5); } /* @@ -916,7 +925,7 @@ data_node_receive_responses(const int conn_count, DataNodeHandle ** connections, * Read results. * Note we try and read from data node connections even if there is an error on one, * so as to avoid reading incorrect results on the next statement. - * It might be better to just destroy these connections and tell the pool manager. + * Other safegaurds exist to avoid this, however. 
*/ while (count > 0) { @@ -971,6 +980,7 @@ handle_response(DataNodeHandle * conn, RemoteQueryState *combiner) { char *msg; int msg_len; + char msg_type; for (;;) { @@ -991,7 +1001,8 @@ handle_response(DataNodeHandle * conn, RemoteQueryState *combiner) } /* TODO handle other possible responses */ - switch (get_message(conn, &msg_len, &msg)) + msg_type = get_message(conn, &msg_len, &msg); + switch (msg_type) { case '\0': /* Not enough data in the buffer */ conn->state = DN_CONNECTION_STATE_QUERY; @@ -1056,15 +1067,85 @@ handle_response(DataNodeHandle * conn, RemoteQueryState *combiner) case 'I': /* EmptyQuery */ default: /* sync lost? */ + elog(WARNING, "Received unsupported message type: %c", msg_type); conn->state = DN_CONNECTION_STATE_ERROR_FATAL; return RESPONSE_EOF; } } - /* Keep compiler quiet */ + return RESPONSE_EOF; } /* + * Like handle_response, but for consuming the messages, + * in case we of an error to clean the data node connection. + * Return values: + * RESPONSE_EOF - need to receive more data for the connection + * RESPONSE_COMPLETE - done with the connection, or done trying (error) + */ +static int +handle_response_clear(DataNodeHandle * conn) +{ + char *msg; + int msg_len; + char msg_type; + + for (;;) + { + /* No data available, exit */ + if (conn->state == DN_CONNECTION_STATE_QUERY) + return RESPONSE_EOF; + + /* + * If we are in the process of shutting down, we + * may be rolling back, and the buffer may contain other messages. + * We want to avoid a procarray exception + * as well as an error stack overflow. 
+ */ + if (proc_exit_inprogress) + { + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + return RESPONSE_COMPLETE; + } + + msg_type = get_message(conn, &msg_len, &msg); + switch (msg_type) + { + case '\0': /* Not enough data in the buffer */ + case 'c': /* CopyToCommandComplete */ + case 'C': /* CommandComplete */ + case 'T': /* RowDescription */ + case 'D': /* DataRow */ + case 'H': /* CopyOutResponse */ + case 'd': /* CopyOutDataRow */ + case 'A': /* NotificationResponse */ + case 'N': /* NoticeResponse */ + break; + case 'E': /* ErrorResponse */ + conn->state = DN_CONNECTION_STATE_ERROR_NOT_READY; + /* + * Do not return with an error, we still need to consume Z, + * ready-for-query + */ + break; + case 'Z': /* ReadyForQuery */ + conn->transaction_status = msg[0]; + conn->state = DN_CONNECTION_STATE_IDLE; + return RESPONSE_COMPLETE; + case 'I': /* EmptyQuery */ + default: + /* sync lost? */ + elog(WARNING, "Received unsupported message type: %c", msg_type); + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + return RESPONSE_COMPLETE; + } + } + + return RESPONSE_EOF; +} + + +/* * Send BEGIN command to the Data nodes and receive responses */ static int @@ -1150,13 +1231,13 @@ finish: if (!autocommit) stat_transaction(tran_count); if (!PersistentConnections) - release_handles(); + release_handles(false); autocommit = true; clear_write_node_list(); if (res != 0) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Could not commit connection on data nodes"))); + errmsg("Could not commit (or autocommit) data node connection"))); } @@ -1271,6 +1352,7 @@ finish: /* * Rollback current transaction + * This will happen */ int DataNodeRollback(void) @@ -1279,6 +1361,10 @@ DataNodeRollback(void) int tran_count; DataNodeHandle *connections[NumDataNodes]; + + /* Consume any messages on the data nodes first if necessary */ + DataNodeConsumeMessages(); + /* gather connections to rollback */ tran_count = get_transaction_nodes(connections); @@ -1296,7 +1382,7 @@ finish: if 
(!autocommit) stat_transaction(tran_count); if (!PersistentConnections) - release_handles(); + release_handles(true); autocommit = true; clear_write_node_list(); return res; @@ -1313,11 +1399,19 @@ data_node_rollback(int conn_count, DataNodeHandle ** connections) struct timeval *timeout = NULL; RemoteQueryState *combiner; + + /* + * Rollback is a special case, being issued because of an error. + * We try to read and throw away any extra data on the connection before + * issuing our rollbacks so that we did not read the results of the + * previous command. + */ + for (i = 0; i < conn_count; i++) + clear_socket_data(connections[i]); + /* Send ROLLBACK - */ for (i = 0; i < conn_count; i++) - { data_node_send_query(connections[i], "ROLLBACK"); - } combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE); /* Receive responses */ @@ -1487,7 +1581,7 @@ DataNodeCopyBegin(const char *query, List *nodelist, Snapshot snapshot, bool is_ if (need_tran) DataNodeCopyFinish(connections, 0, COMBINE_TYPE_NONE); else if (!PersistentConnections) - release_handles(); + release_handles(false); } pfree(connections); @@ -1711,7 +1805,7 @@ DataNodeCopyOut(Exec_Nodes *exec_nodes, DataNodeHandle** copy_connections, FILE* if (!ValidateAndCloseCombiner(combiner)) { if (autocommit && !PersistentConnections) - release_handles(); + release_handles(false); pfree(copy_connections); ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), @@ -2136,8 +2230,10 @@ ExecRemoteQuery(RemoteQueryState *node) if (connections[i]->transaction_status != 'T') new_connections[new_count++] = connections[i]; - if (new_count) - data_node_begin(new_count, new_connections, gxid); + if (new_count && data_node_begin(new_count, new_connections, gxid)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on data nodes."))); } /* Get the SQL string */ @@ -2292,7 +2388,7 @@ ExecRemoteQuery(RemoteQueryState *node) { ExecSetSlotDescriptor(scanslot, node->tuple_desc); /* - * Now tuple 
table slot is responcible for freeing the + * Now tuple table slot is responsible for freeing the * descriptor */ node->tuple_desc = NULL; @@ -2492,9 +2588,88 @@ ExecRemoteQuery(RemoteQueryState *node) return resultslot; } +/* + * End the remote query + */ void ExecEndRemoteQuery(RemoteQueryState *node) { + + /* + * If processing was interrupted, (ex: client did not consume all the data, + * or a subquery with LIMIT) we may still have data on the nodes. Try and consume. + * We do not simply call DataNodeConsumeMessages, because the same + * connection could be used for multiple RemoteQuery steps. + * + * It seems most stable checking command_complete_count + * and only then working with conn_count + * + * PGXCTODO: Change in the future when we remove materialization nodes. + */ + if (node->command_complete_count < node->node_count) + { + elog(WARNING, "Extra data node messages when ending remote query step"); + + while (node->conn_count > 0) + { + int i = 0; + int res; + + /* + * Just consume the rest of the messages + */ + if ((i = node->current_conn + 1) == node->conn_count) + i = 0; + + for (;;) + { + /* throw away message */ + if (node->msg) + { + pfree(node->msg); + node->msg = NULL; + } + + res = handle_response(node->connections[i], node); + + if (res == RESPONSE_COMPLETE || + node->connections[i]->state == DN_CONNECTION_STATE_ERROR_FATAL || + node->connections[i]->state == DN_CONNECTION_STATE_ERROR_NOT_READY) + { + if (--node->conn_count == 0) + break; + if (i == node->conn_count) + i = 0; + else + node->connections[i] = node->connections[node->conn_count]; + if (node->current_conn == node->conn_count) + node->current_conn = i; + } + else if (res == RESPONSE_EOF) + { + /* go to next connection */ + if (++i == node->conn_count) + i = 0; + + /* if we cycled over all connections we need to receive more */ + if (i == node->current_conn) + { + struct timeval timeout; + timeout.tv_sec = END_QUERY_TIMEOUT; + timeout.tv_usec = 0; + + if 
(data_node_receive(node->conn_count, node->connections, &timeout)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to read response from data nodes when ending query"))); + } + } + } + } + elog(WARNING, "Data node connection buffers cleaned"); + } + + /* * Release tuplesort resources */ @@ -2517,6 +2692,64 @@ ExecEndRemoteQuery(RemoteQueryState *node) CloseCombiner(node); } +/* + * Consume any remaining messages on the connections. + * This is useful for calling after ereport() + */ +void +DataNodeConsumeMessages(void) +{ + int i; + int active_count = 0; + int res; + struct timeval timeout; + DataNodeHandle *connection = NULL; + DataNodeHandle **connections = NULL; + DataNodeHandle *active_connections[NumDataNodes]; + + + active_count = get_active_nodes(active_connections); + + /* Iterate through handles in use and try and clean */ + for (i = 0; i < active_count; i++) + { + elog(WARNING, "Consuming data node messages after error."); + + connection = active_connections[i]; + + res = RESPONSE_EOF; + + while (res != RESPONSE_COMPLETE) + { + int res = handle_response_clear(connection); + + if (res == RESPONSE_EOF) + { + if (!connections) + connections = (DataNodeHandle **) palloc(sizeof(DataNodeHandle*)); + + connections[0] = connection; + + /* Use a timeout so we do not wait forever */ + timeout.tv_sec = CLEAR_TIMEOUT; + timeout.tv_usec = 0; + if (data_node_receive(1, connections, &timeout)) + { + /* Mark this as bad, move on to next one */ + connection->state = DN_CONNECTION_STATE_ERROR_FATAL; + break; + } + } + if (connection->state == DN_CONNECTION_STATE_ERROR_FATAL + || connection->state == DN_CONNECTION_STATE_IDLE) + break; + } + } + + if (connections) + pfree(connections); +} + /* ---------------------------------------------------------------- * ExecRemoteQueryReScan @@ -2609,8 +2842,11 @@ ExecRemoteUtility(RemoteQuery *node) if (connections[i]->transaction_status != 'T') new_connections[new_count++] = connections[i]; - if (new_count) - 
data_node_begin(new_count, new_connections, gxid); + if (new_count && data_node_begin(new_count, new_connections, gxid)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Could not begin transaction on data nodes"))); + } /* See if we have a primary nodes, execute on it first before the others */ @@ -2760,10 +2996,11 @@ DataNodeCleanAndRelease(int code, Datum arg) /* Rollback on GTM if transaction id opened. */ RollbackTranGTM((GlobalTransactionId) GetCurrentTransactionIdIfAny()); - } - /* Release data node connections */ - release_handles(); + release_handles(true); + } else + /* Release data node connections */ + release_handles(false); /* Close connection with GTM */ CloseGTM(); diff --git a/src/include/pgxc/datanode.h b/src/include/pgxc/datanode.h index 4202e2e..4039c45 100644 --- a/src/include/pgxc/datanode.h +++ b/src/include/pgxc/datanode.h @@ -23,6 +23,9 @@ #include "utils/snapshot.h" #include <unistd.h> +#define NO_SOCKET -1 + + /* Connection to data node maintained by Pool Manager */ typedef struct PGconn NODE_CONNECTION; @@ -80,8 +83,9 @@ extern int DataNodeConnClean(NODE_CONNECTION * conn); extern void DataNodeCleanAndRelease(int code, Datum arg); extern DataNodeHandle **get_handles(List *nodelist); -extern void release_handles(void); +extern void release_handles(bool force_drop); extern int get_transaction_nodes(DataNodeHandle ** connections); +extern int get_active_nodes(DataNodeHandle ** connections); extern int ensure_in_buffer_capacity(size_t bytes_needed, DataNodeHandle * handle); extern int ensure_out_buffer_capacity(size_t bytes_needed, DataNodeHandle * handle); @@ -100,5 +104,6 @@ extern int data_node_flush(DataNodeHandle *handle); extern char get_message(DataNodeHandle *conn, int *len, char **msg); extern void add_error_message(DataNodeHandle * handle, const char *message); +extern void clear_socket_data (DataNodeHandle *conn); #endif diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 143c8fa..fbc4db0 
100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -96,6 +96,7 @@ extern int handle_response(DataNodeHandle * conn, RemoteQueryState *combiner); extern bool FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot); extern void ExecRemoteQueryReScan(RemoteQueryState *node, ExprContext *exprCtxt); +extern void DataNodeConsumeMessages(void); extern int primary_data_node; #endif ----------------------------------------------------------------------- Summary of changes: src/backend/pgxc/pool/datanode.c | 62 ++++++++- src/backend/pgxc/pool/execRemote.c | 275 +++++++++++++++++++++++++++++++++--- src/include/pgxc/datanode.h | 7 +- src/include/pgxc/execRemote.h | 1 + 4 files changed, 320 insertions(+), 25 deletions(-) hooks/post-receive -- Postgres-XC |