You can subscribe to this list here.
2010 |
Jan
|
Feb
|
Mar
|
Apr
(4) |
May
(28) |
Jun
(12) |
Jul
(11) |
Aug
(12) |
Sep
(5) |
Oct
(19) |
Nov
(14) |
Dec
(12) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2011 |
Jan
(18) |
Feb
(30) |
Mar
(115) |
Apr
(89) |
May
(50) |
Jun
(44) |
Jul
(22) |
Aug
(13) |
Sep
(11) |
Oct
(30) |
Nov
(28) |
Dec
(39) |
2012 |
Jan
(38) |
Feb
(18) |
Mar
(43) |
Apr
(91) |
May
(108) |
Jun
(46) |
Jul
(37) |
Aug
(44) |
Sep
(33) |
Oct
(29) |
Nov
(36) |
Dec
(15) |
2013 |
Jan
(35) |
Feb
(611) |
Mar
(5) |
Apr
(55) |
May
(30) |
Jun
(28) |
Jul
(458) |
Aug
(34) |
Sep
(9) |
Oct
(39) |
Nov
(22) |
Dec
(32) |
2014 |
Jan
(16) |
Feb
(16) |
Mar
(42) |
Apr
(179) |
May
(7) |
Jun
(6) |
Jul
(9) |
Aug
|
Sep
(4) |
Oct
|
Nov
(3) |
Dec
|
2015 |
Jan
|
Feb
|
Mar
|
Apr
(2) |
May
(4) |
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
S | M | T | W | T | F | S |
---|---|---|---|---|---|---|
|
|
|
1
(2) |
2
(3) |
3
|
4
|
5
|
6
(2) |
7
(1) |
8
|
9
(4) |
10
|
11
|
12
|
13
|
14
(1) |
15
(1) |
16
(1) |
17
|
18
|
19
|
20
|
21
(1) |
22
|
23
|
24
(2) |
25
|
26
|
27
(2) |
28
(1) |
29
(1) |
30
(22) |
|
|
From: Ashutosh B. <ash...@us...> - 2011-06-27 06:44:54
|
Project "Postgres-XC". The branch, master has been updated via 805bb9aeb9aeddc8db1283e4724c108941785e8b (commit) from ff7be6e332b36fc7aad99876bf107e258264a7f1 (commit) - Log ----------------------------------------------------------------- commit 805bb9aeb9aeddc8db1283e4724c108941785e8b Author: Ashutosh Bapat <ash...@en...> Date: Mon Jun 27 12:05:43 2011 +0530 For every aggregate, we expect that the collection function is such that, it being applied on the transition results from datanode, should produce the same result as that produced by only transition function being applied on all qualifying rows. This condition allows us to apply final function directly on transition result (in case transition phase takes place on coordinator) or collection result (in case transition phase takes place on datanodes and collection phase is applied on the coordinator), producing same aggregation results at the end. This also means that, on a given node (data or coordinator) only one of the transition or collection phases is applied not both. Hence, we don't need to apply collection after transition, when transition phase takes place on coordinator. This also renders aggcollecttype from pg_aggregate useless, hence removed the column aggcollecttype from catalog pg_aggregate. The input and output type of collection phase is aggtranstype. 
There is no need for separate collecttypeLen and collecttypeByVal members in AggStatePerAggData, that purpose is served by transtypeLen and transtypeByVal diff --git a/src/backend/catalog/pg_aggregate.c b/src/backend/catalog/pg_aggregate.c index f80874d..6971dc1 100644 --- a/src/backend/catalog/pg_aggregate.c +++ b/src/backend/catalog/pg_aggregate.c @@ -54,7 +54,6 @@ AggregateCreate(const char *aggName, List *aggsortopName, Oid aggTransType, #ifdef PGXC - Oid aggCollectType, const char *agginitval, const char *agginitcollect) #else @@ -175,30 +174,25 @@ AggregateCreate(const char *aggName, #ifdef PGXC /* - * Collection function must be of two arguments - * First must be of aggCollectType, second must be of aggTransType - * Return value must be of aggCollectType + * Collection function must be of two arguments, both of type aggTransType + * and return type is also aggTransType */ - fnArgs[0] = aggCollectType; + fnArgs[0] = aggTransType; fnArgs[1] = aggTransType; collectfn = lookup_agg_function(aggcollectfnName, 2, fnArgs, &rettype); - if (rettype != aggCollectType) + if (rettype != aggTransType) ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("return type of collection function %s is not %s", NameListToString(aggcollectfnName), - format_type_be(aggCollectType)))); + format_type_be(aggTransType)))); #endif /* handle finalfn, if supplied */ if (aggfinalfnName) { -#ifdef PGXC - fnArgs[0] = aggCollectType; -#else fnArgs[0] = aggTransType; -#endif finalfn = lookup_agg_function(aggfinalfnName, 1, fnArgs, &finaltype); } @@ -207,11 +201,7 @@ AggregateCreate(const char *aggName, /* * If no finalfn, aggregate result type is type of the state value */ -#ifdef PGXC - finaltype = aggCollectType; -#else finaltype = aggTransType; -#endif } Assert(OidIsValid(finaltype)); @@ -302,7 +292,6 @@ AggregateCreate(const char *aggName, values[Anum_pg_aggregate_aggtranstype - 1] = ObjectIdGetDatum(aggTransType); #ifdef PGXC values[Anum_pg_aggregate_aggcollectfn - 1] = 
ObjectIdGetDatum(collectfn); - values[Anum_pg_aggregate_aggcollecttype - 1] = ObjectIdGetDatum(aggCollectType); #endif if (agginitval) values[Anum_pg_aggregate_agginitval - 1] = CStringGetTextDatum(agginitval); diff --git a/src/backend/commands/aggregatecmds.c b/src/backend/commands/aggregatecmds.c index e73464b..6b9f86e 100644 --- a/src/backend/commands/aggregatecmds.c +++ b/src/backend/commands/aggregatecmds.c @@ -139,6 +139,10 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters) ereport(ERROR, (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), errmsg("aggregate cfunc must be specified"))); + if (collectType != transType) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("aggregate ctype should be same as aggregate stype"))); #endif /* @@ -247,7 +251,6 @@ DefineAggregate(List *name, List *args, bool oldstyle, List *parameters) sortoperatorName, /* sort operator name */ transTypeId, /* transition data type */ #ifdef PGXC - collectTypeId, /* collection data type */ initval, /* initial condition */ initcollect); /* initial condition for collection function */ #else diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index ae60a76..215ed5f 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -176,15 +176,9 @@ typedef struct AggStatePerAggData int16 inputtypeLen, resulttypeLen, transtypeLen; -#ifdef PGXC - int16 collecttypeLen; -#endif /* PGXC */ bool inputtypeByVal, resulttypeByVal, transtypeByVal; -#ifdef PGXC - bool collecttypeByVal; -#endif /* PGXC */ /* * Stuff for evaluation of inputs. We used to just use ExecEvalExpr, but @@ -388,6 +382,7 @@ initialize_aggregates(AggState *aggstate, * * Note that when the initial value is pass-by-ref, we must copy it * (into the aggcontext) since we will pfree the collectValue later. + * collection type is same as transition type. 
*/ if (peraggstate->initCollectValueIsNull) pergroupstate->collectValue = peraggstate->initCollectValue; @@ -397,8 +392,8 @@ initialize_aggregates(AggState *aggstate, oldContext = MemoryContextSwitchTo(aggstate->aggcontext); pergroupstate->collectValue = datumCopy(peraggstate->initCollectValue, - peraggstate->collecttypeByVal, - peraggstate->collecttypeLen); + peraggstate->transtypeByVal, + peraggstate->transtypeLen); MemoryContextSwitchTo(oldContext); } pergroupstate->collectValueIsNull = peraggstate->initCollectValueIsNull; @@ -557,22 +552,16 @@ advance_collection_function(AggState *aggstate, if (pergroupstate->noCollectValue) { /* - * collection result has not been initialized + * collection result has not been initialized. This is the first non-NULL + * transition value. We use it as the initial value for collectValue. + * Aggregate's transition and collection type are same * We must copy the datum into result if it is pass-by-ref. We * do not need to pfree the old result, since it's NULL. - * PGXCTODO: in case the transition result type is different from - * collection result type, this code would not work, since we are - * assigning datum of one type to another. For this code to work the - * input and output of collection function needs to be binary - * compatible which is not. So, either check in AggregateCreate, - * that the input and output of collection function are binary - * coercible or set the initial values something non-null or change - * this code */ oldContext = MemoryContextSwitchTo(aggstate->aggcontext); pergroupstate->collectValue = datumCopy(fcinfo->arg[1], - peraggstate->collecttypeByVal, - peraggstate->collecttypeLen); + peraggstate->transtypeByVal, + peraggstate->transtypeLen); pergroupstate->collectValueIsNull = false; pergroupstate->noCollectValue = false; MemoryContextSwitchTo(oldContext); @@ -606,15 +595,15 @@ advance_collection_function(AggState *aggstate, * pfree the prior transValue. 
But if collectfn returned a pointer to its * first input, we don't need to do anything. */ - if (!peraggstate->collecttypeByVal && + if (!peraggstate->transtypeByVal && DatumGetPointer(newVal) != DatumGetPointer(pergroupstate->collectValue)) { if (!fcinfo->isnull) { MemoryContextSwitchTo(aggstate->aggcontext); newVal = datumCopy(newVal, - peraggstate->collecttypeByVal, - peraggstate->collecttypeLen); + peraggstate->transtypeByVal, + peraggstate->transtypeLen); } if (!pergroupstate->collectValueIsNull) pfree(DatumGetPointer(pergroupstate->collectValue)); @@ -909,51 +898,6 @@ finalize_aggregate(AggState *aggstate, oldContext = MemoryContextSwitchTo(aggstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory); #ifdef PGXC /* - * PGXCTODO: see PGXCTODO item in advance_collect_function - * this step is needed in case the transition function does not produce - * result consumable by final function and need collection function to be - * applied on transition function results. Usually results by both functions - * should be consumable by final function. - * As such this step is meant only to convert transition results into form - * consumable by final function, the step does not actually do any - * collection. Skipping transitionp means, that the collection - * phase is over and we need to apply final function directly. 
- */ - if (OidIsValid(peraggstate->collectfn_oid) && !aggstate->skip_trans) - { - FunctionCallInfoData fcinfo; - int saved_numArguments; - InitFunctionCallInfoData(fcinfo, &(peraggstate->collectfn), 2, - (void *) aggstate, NULL); - /* - * copy the initial datum since it might get changed inside the - * collection function - */ - if (peraggstate->initCollectValueIsNull) - fcinfo.arg[0] = peraggstate->initCollectValue; - else - fcinfo.arg[0] = datumCopy(peraggstate->initCollectValue, - peraggstate->collecttypeByVal, - peraggstate->collecttypeLen); - fcinfo.argnull[0] = peraggstate->initCollectValueIsNull; - fcinfo.arg[1] = pergroupstate->transValue; - fcinfo.argnull[1] = pergroupstate->transValueIsNull; - /* - * For collection function we expect only one argument other than the - * running collection result. The numArguments in peraggstate - * corresponds to the number of arguments to the aggregate, which is not - * correct for collection. Hence while applying collection function - * set numArguments to 1 and switch it back once the purpose is served. - */ - saved_numArguments = peraggstate->numArguments; - peraggstate->numArguments = 1; - advance_collection_function(aggstate, peraggstate, pergroupstate, &fcinfo); - peraggstate->numArguments = saved_numArguments; - pergroupstate->transValue = pergroupstate->collectValue; - pergroupstate->transValueIsNull = pergroupstate->collectValueIsNull; - } - - /* * if we skipped the transition phase, we have the collection result in the * collectValue, move it to transValue for finalization to work on */ @@ -1945,14 +1889,14 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) Expr *dummyexpr; /* * for XC, we need to setup the collection function expression as well. - * Use the same function with invalid final function oid, and collection + * Use build_aggregate_fnexpr() with invalid final function oid, and collection * function information instead of transition function information. 
* PGXCTODO: we should really be adding this step inside * build_aggregate_fnexprs() but this way it becomes easy to merge. */ - build_aggregate_fnexprs(&aggform->aggtranstype, + build_aggregate_fnexprs(&aggtranstype, 1, - aggform->aggcollecttype, + aggtranstype, aggref->aggtype, collectfn_oid, InvalidOid, @@ -1985,11 +1929,6 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) get_typlenbyval(aggtranstype, &peraggstate->transtypeLen, &peraggstate->transtypeByVal); -#ifdef PGXC - get_typlenbyval(aggform->aggcollecttype, - &peraggstate->collecttypeLen, - &peraggstate->collecttypeByVal); -#endif /* PGXC */ /* * initval is potentially null, so don't try to access it as a struct @@ -2019,7 +1958,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags) peraggstate->initCollectValue = (Datum) 0; else peraggstate->initCollectValue = GetAggInitVal(textInitVal, - aggform->aggcollecttype); + aggtranstype); #endif /* PGXC */ /* diff --git a/src/include/catalog/pg_aggregate.h b/src/include/catalog/pg_aggregate.h index d53a632..5c2103c 100644 --- a/src/include/catalog/pg_aggregate.h +++ b/src/include/catalog/pg_aggregate.h @@ -40,9 +40,6 @@ * aggfinalfn final function (0 if none) * aggsortop associated sort operator (0 if none) * aggtranstype type of aggregate's transition (state) data -#ifdef PGXC - * aggcollecttype type of aggregate's collection (state) data -#endif * agginitval initial value for transition state (can be NULL) #ifdef PGXC * agginitcollect initial value for collection state (can be NULL) @@ -58,8 +55,9 @@ CATALOG(pg_aggregate,2600) BKI_WITHOUT_OIDS regproc aggcollectfn; /* PGXC */ regproc aggfinalfn; Oid aggsortop; - Oid aggtranstype; - Oid aggcollecttype; /* PGXC */ + Oid aggtranstype; /* also serves as the input and output type + * of aggcollectfn + */ text agginitval; /* VARIABLE LENGTH FIELD */ text agginitcollect; /* PGXC, VARIABLE LENGTH FIELD */ } FormData_pg_aggregate; @@ -77,16 +75,15 @@ typedef FormData_pg_aggregate *Form_pg_aggregate; */ #ifdef PGXC 
-#define Natts_pg_aggregate 9 +#define Natts_pg_aggregate 8 #define Anum_pg_aggregate_aggfnoid 1 #define Anum_pg_aggregate_aggtransfn 2 #define Anum_pg_aggregate_aggcollectfn 3 #define Anum_pg_aggregate_aggfinalfn 4 #define Anum_pg_aggregate_aggsortop 5 #define Anum_pg_aggregate_aggtranstype 6 -#define Anum_pg_aggregate_aggcollecttype 7 -#define Anum_pg_aggregate_agginitval 8 -#define Anum_pg_aggregate_agginitcollect 9 +#define Anum_pg_aggregate_agginitval 7 +#define Anum_pg_aggregate_agginitcollect 8 #endif #ifdef PGXC //#define Natts_pg_aggregate 6 @@ -106,13 +103,13 @@ typedef FormData_pg_aggregate *Form_pg_aggregate; /* avg */ #ifdef PGXC -DATA(insert ( 2100 int8_avg_accum numeric_avg_collect numeric_avg 0 1231 1231 "{0,0}" "{0,0}" )); -DATA(insert ( 2101 int4_avg_accum int8_avg_collect int8_avg 0 1016 1016 "{0,0}" "{0,0}" )); -DATA(insert ( 2102 int2_avg_accum int8_avg_collect int8_avg 0 1016 1016 "{0,0}" "{0,0}" )); -DATA(insert ( 2103 numeric_avg_accum numeric_avg_collect numeric_avg 0 1231 1231 "{0,0}" "{0,0}" )); -DATA(insert ( 2104 float4_accum float8_collect float8_avg 0 1022 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2105 float8_accum float8_collect float8_avg 0 1022 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2106 interval_accum interval_collect interval_avg 0 1187 1187 "{0 second,0 second}" "{0 second,0 second}" )); +DATA(insert ( 2100 int8_avg_accum numeric_avg_collect numeric_avg 0 1231 "{0,0}" "{0,0}" )); +DATA(insert ( 2101 int4_avg_accum int8_avg_collect int8_avg 0 1016 "{0,0}" "{0,0}" )); +DATA(insert ( 2102 int2_avg_accum int8_avg_collect int8_avg 0 1016 "{0,0}" "{0,0}" )); +DATA(insert ( 2103 numeric_avg_accum numeric_avg_collect numeric_avg 0 1231 "{0,0}" "{0,0}" )); +DATA(insert ( 2104 float4_accum float8_collect float8_avg 0 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2105 float8_accum float8_collect float8_avg 0 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2106 interval_accum interval_collect interval_avg 0 1187 "{0 second,0 second}" "{0 
second,0 second}" )); #endif #ifdef PGXC //DATA(insert ( 2100 int8_avg_accum numeric_avg 0 1231 "{0,0}" )); @@ -126,14 +123,14 @@ DATA(insert ( 2106 interval_accum interval_collect interval_avg 0 1187 1187 "{0 /* sum */ #ifdef PGXC -DATA(insert ( 2107 int8_sum numeric_add - 0 1700 1700 _null_ "0" )); -DATA(insert ( 2108 int4_sum int8_sum_to_int8 - 0 20 20 _null_ _null_ )); -DATA(insert ( 2109 int2_sum int8_sum_to_int8 - 0 20 20 _null_ _null_ )); -DATA(insert ( 2110 float4pl float4pl - 0 700 700 _null_ "0" )); -DATA(insert ( 2111 float8pl float8pl - 0 701 701 _null_ "0" )); -DATA(insert ( 2112 cash_pl cash_pl - 0 790 790 _null_ _null_ )); -DATA(insert ( 2113 interval_pl interval_pl - 0 1186 1186 _null_ _null_ )); -DATA(insert ( 2114 numeric_add numeric_add - 0 1700 1700 _null_ "0" )); +DATA(insert ( 2107 int8_sum numeric_add - 0 1700 _null_ "0" )); +DATA(insert ( 2108 int4_sum int8_sum_to_int8 - 0 20 _null_ _null_ )); +DATA(insert ( 2109 int2_sum int8_sum_to_int8 - 0 20 _null_ _null_ )); +DATA(insert ( 2110 float4pl float4pl - 0 700 _null_ "0" )); +DATA(insert ( 2111 float8pl float8pl - 0 701 _null_ "0" )); +DATA(insert ( 2112 cash_pl cash_pl - 0 790 _null_ _null_ )); +DATA(insert ( 2113 interval_pl interval_pl - 0 1186 _null_ _null_ )); +DATA(insert ( 2114 numeric_add numeric_add - 0 1700 _null_ "0" )); #endif #ifdef PGXC //DATA(insert ( 2107 int8_sum - 0 1700 _null_ )); @@ -148,26 +145,26 @@ DATA(insert ( 2114 numeric_add numeric_add - 0 1700 1700 _null_ "0" )); /* max */ #ifdef PGXC -DATA(insert ( 2115 int8larger int8larger - 413 20 20 _null_ _null_ )); -DATA(insert ( 2116 int4larger int4larger - 521 23 23 _null_ _null_ )); -DATA(insert ( 2117 int2larger int2larger - 520 21 21 _null_ _null_ )); -DATA(insert ( 2118 oidlarger oidlarger - 610 26 26 _null_ _null_ )); -DATA(insert ( 2119 float4larger float4larger - 623 700 700 _null_ _null_ )); -DATA(insert ( 2120 float8larger float8larger - 674 701 701 _null_ _null_ )); -DATA(insert ( 2121 int4larger int4larger - 563 
702 702 _null_ _null_ )); -DATA(insert ( 2122 date_larger date_larger - 1097 1082 1082 _null_ _null_ )); -DATA(insert ( 2123 time_larger time_larger - 1112 1083 1083 _null_ _null_ )); -DATA(insert ( 2124 timetz_larger timetz_larger - 1554 1266 1266 _null_ _null_ )); -DATA(insert ( 2125 cashlarger cashlarger - 903 790 790 _null_ _null_ )); -DATA(insert ( 2126 timestamp_larger timestamp_larger - 2064 1114 1114 _null_ _null_ )); -DATA(insert ( 2127 timestamptz_larger timestamptz_larger - 1324 1184 1184 _null_ _null_ )); -DATA(insert ( 2128 interval_larger interval_larger - 1334 1186 1186 _null_ _null_ )); -DATA(insert ( 2129 text_larger text_larger - 666 25 25 _null_ _null_ )); -DATA(insert ( 2130 numeric_larger numeric_larger - 1756 1700 1700 _null_ _null_ )); -DATA(insert ( 2050 array_larger array_larger - 1073 2277 2277 _null_ _null_ )); -DATA(insert ( 2244 bpchar_larger bpchar_larger - 1060 1042 1042 _null_ _null_ )); -DATA(insert ( 2797 tidlarger tidlarger - 2800 27 27 _null_ _null_ )); -DATA(insert ( 3526 enum_larger enum_larger - 3519 3500 3500 _null_ _null_ )); +DATA(insert ( 2115 int8larger int8larger - 413 20 _null_ _null_ )); +DATA(insert ( 2116 int4larger int4larger - 521 23 _null_ _null_ )); +DATA(insert ( 2117 int2larger int2larger - 520 21 _null_ _null_ )); +DATA(insert ( 2118 oidlarger oidlarger - 610 26 _null_ _null_ )); +DATA(insert ( 2119 float4larger float4larger - 623 700 _null_ _null_ )); +DATA(insert ( 2120 float8larger float8larger - 674 701 _null_ _null_ )); +DATA(insert ( 2121 int4larger int4larger - 563 702 _null_ _null_ )); +DATA(insert ( 2122 date_larger date_larger - 1097 1082 _null_ _null_ )); +DATA(insert ( 2123 time_larger time_larger - 1112 1083 _null_ _null_ )); +DATA(insert ( 2124 timetz_larger timetz_larger - 1554 1266 _null_ _null_ )); +DATA(insert ( 2125 cashlarger cashlarger - 903 790 _null_ _null_ )); +DATA(insert ( 2126 timestamp_larger timestamp_larger - 2064 1114 _null_ _null_ )); +DATA(insert ( 2127 timestamptz_larger 
timestamptz_larger - 1324 1184 _null_ _null_ )); +DATA(insert ( 2128 interval_larger interval_larger - 1334 1186 _null_ _null_ )); +DATA(insert ( 2129 text_larger text_larger - 666 25 _null_ _null_ )); +DATA(insert ( 2130 numeric_larger numeric_larger - 1756 1700 _null_ _null_ )); +DATA(insert ( 2050 array_larger array_larger - 1073 2277 _null_ _null_ )); +DATA(insert ( 2244 bpchar_larger bpchar_larger - 1060 1042 _null_ _null_ )); +DATA(insert ( 2797 tidlarger tidlarger - 2800 27 _null_ _null_ )); +DATA(insert ( 3526 enum_larger enum_larger - 3519 3500 _null_ _null_ )); #endif #ifdef PGXC //DATA(insert ( 2115 int8larger - 413 20 _null_ )); @@ -194,26 +191,26 @@ DATA(insert ( 3526 enum_larger enum_larger - 3519 3500 3500 _null_ _null_ ) /* min */ #ifdef PGXC -DATA(insert ( 2131 int8smaller int8smaller - 412 20 20 _null_ _null_ )); -DATA(insert ( 2132 int4smaller int4smaller - 97 23 23 _null_ _null_ )); -DATA(insert ( 2133 int2smaller int2smaller - 95 21 21 _null_ _null_ )); -DATA(insert ( 2134 oidsmaller oidsmaller - 609 26 26 _null_ _null_ )); -DATA(insert ( 2135 float4smaller float4smaller - 622 700 700 _null_ _null_ )); -DATA(insert ( 2136 float8smaller float8smaller - 672 701 701 _null_ _null_ )); -DATA(insert ( 2137 int4smaller int4smaller - 562 702 702 _null_ _null_ )); -DATA(insert ( 2138 date_smaller date_smaller - 1095 1082 1082 _null_ _null_ )); -DATA(insert ( 2139 time_smaller time_smaller - 1110 1083 1083 _null_ _null_ )); -DATA(insert ( 2140 timetz_smaller timetz_smaller - 1552 1266 1266 _null_ _null_ )); -DATA(insert ( 2141 cashsmaller cashsmaller - 902 790 790 _null_ _null_ )); -DATA(insert ( 2142 timestamp_smaller timestamp_smaller - 2062 1114 1114 _null_ _null_ )); -DATA(insert ( 2143 timestamptz_smaller timestamptz_smaller - 1322 1184 1184 _null_ _null_ )); -DATA(insert ( 2144 interval_smaller interval_smaller - 1332 1186 1186 _null_ _null_ )); -DATA(insert ( 2145 text_smaller text_smaller - 664 25 25 _null_ _null_ )); -DATA(insert ( 2146 
numeric_smaller numeric_smaller - 1754 1700 1700 _null_ _null_ )); -DATA(insert ( 2051 array_smaller array_smaller - 1072 2277 2277 _null_ _null_ )); -DATA(insert ( 2245 bpchar_smaller bpchar_smaller - 1058 1042 1042 _null_ _null_ )); -DATA(insert ( 2798 tidsmaller tidsmaller - 2799 27 27 _null_ _null_ )); -DATA(insert ( 3527 enum_smaller enum_smaller - 3518 3500 3500 _null_ _null_ )); +DATA(insert ( 2131 int8smaller int8smaller - 412 20 _null_ _null_ )); +DATA(insert ( 2132 int4smaller int4smaller - 97 23 _null_ _null_ )); +DATA(insert ( 2133 int2smaller int2smaller - 95 21 _null_ _null_ )); +DATA(insert ( 2134 oidsmaller oidsmaller - 609 26 _null_ _null_ )); +DATA(insert ( 2135 float4smaller float4smaller - 622 700 _null_ _null_ )); +DATA(insert ( 2136 float8smaller float8smaller - 672 701 _null_ _null_ )); +DATA(insert ( 2137 int4smaller int4smaller - 562 702 _null_ _null_ )); +DATA(insert ( 2138 date_smaller date_smaller - 1095 1082 _null_ _null_ )); +DATA(insert ( 2139 time_smaller time_smaller - 1110 1083 _null_ _null_ )); +DATA(insert ( 2140 timetz_smaller timetz_smaller - 1552 1266 _null_ _null_ )); +DATA(insert ( 2141 cashsmaller cashsmaller - 902 790 _null_ _null_ )); +DATA(insert ( 2142 timestamp_smaller timestamp_smaller - 2062 1114 _null_ _null_ )); +DATA(insert ( 2143 timestamptz_smaller timestamptz_smaller - 1322 1184 _null_ _null_ )); +DATA(insert ( 2144 interval_smaller interval_smaller - 1332 1186 _null_ _null_ )); +DATA(insert ( 2145 text_smaller text_smaller - 664 25 _null_ _null_ )); +DATA(insert ( 2146 numeric_smaller numeric_smaller - 1754 1700 _null_ _null_ )); +DATA(insert ( 2051 array_smaller array_smaller - 1072 2277 _null_ _null_ )); +DATA(insert ( 2245 bpchar_smaller bpchar_smaller - 1058 1042 _null_ _null_ )); +DATA(insert ( 2798 tidsmaller tidsmaller - 2799 27 _null_ _null_ )); +DATA(insert ( 3527 enum_smaller enum_smaller - 3518 3500 _null_ _null_ )); #endif #ifdef PGXC //DATA(insert ( 2131 int8smaller - 412 20 _null_ )); @@ -241,8 
+238,8 @@ DATA(insert ( 3527 enum_smaller enum_smaller - 3518 3500 3500 _null_ _null_ ) /* count */ /* Final function is data type conversion function numeric_int8 is refernced by OID because of ambiguous defininition in pg_proc */ #ifdef PGXC -DATA(insert ( 2147 int8inc_any int8_sum_to_int8 - 0 20 20 "0" "0" )); -DATA(insert ( 2803 int8inc int8_sum_to_int8 - 0 20 20 "0" "0" )); +DATA(insert ( 2147 int8inc_any int8_sum_to_int8 - 0 20 "0" "0" )); +DATA(insert ( 2803 int8inc int8_sum_to_int8 - 0 20 "0" "0" )); #endif #ifdef PGXC //DATA(insert ( 2147 int8inc_any - 0 20 "0" )); @@ -251,12 +248,12 @@ DATA(insert ( 2803 int8inc int8_sum_to_int8 - 0 20 20 "0" "0" )); /* var_pop */ #ifdef PGXC -DATA(insert ( 2718 int8_accum numeric_collect numeric_var_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2719 int4_accum numeric_collect numeric_var_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2720 int2_accum numeric_collect numeric_var_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2721 float4_accum float8_collect float8_var_pop 0 1022 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2722 float8_accum float8_collect float8_var_pop 0 1022 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2723 numeric_accum numeric_collect numeric_var_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2718 int8_accum numeric_collect numeric_var_pop 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2719 int4_accum numeric_collect numeric_var_pop 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2720 int2_accum numeric_collect numeric_var_pop 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2721 float4_accum float8_collect float8_var_pop 0 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2722 float8_accum float8_collect float8_var_pop 0 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2723 numeric_accum numeric_collect numeric_var_pop 0 1231 "{0,0,0}" "{0,0,0}" )); #endif #ifdef PGXC //DATA(insert ( 2718 int8_accum numeric_var_pop 0 1231 "{0,0,0}" )); @@ -269,12 +266,12 @@ DATA(insert ( 2723 
numeric_accum numeric_collect numeric_var_pop 0 1231 1231 "{ /* var_samp */ #ifdef PGXC -DATA(insert ( 2641 int8_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2642 int4_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2643 int2_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2644 float4_accum float8_collect float8_var_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2645 float8_accum float8_collect float8_var_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2646 numeric_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2641 int8_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2642 int4_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2643 int2_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2644 float4_accum float8_collect float8_var_samp 0 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2645 float8_accum float8_collect float8_var_samp 0 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2646 numeric_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); #endif #ifdef PGXC //DATA(insert ( 2641 int8_accum numeric_var_samp 0 1231 "{0,0,0}" )); @@ -287,12 +284,12 @@ DATA(insert ( 2646 numeric_accum numeric_collect numeric_var_samp 0 1231 1231 "{ /* variance: historical Postgres syntax for var_samp */ #ifdef PGXC -DATA(insert ( 2148 int8_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2149 int4_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2150 int2_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2151 float4_accum float8_collect float8_var_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2152 float8_accum float8_collect float8_var_samp 0 1022 1022 
"{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2153 numeric_accum numeric_collect numeric_var_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2148 int8_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2149 int4_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2150 int2_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2151 float4_accum float8_collect float8_var_samp 0 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2152 float8_accum float8_collect float8_var_samp 0 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2153 numeric_accum numeric_collect numeric_var_samp 0 1231 "{0,0,0}" "{0,0,0}" )); #endif #ifdef PGXC //DATA(insert ( 2148 int8_accum numeric_var_samp 0 1231 "{0,0,0}" )); @@ -305,12 +302,12 @@ DATA(insert ( 2153 numeric_accum numeric_collect numeric_var_samp 0 1231 1231 "{ /* stddev_pop */ #ifdef PGXC -DATA(insert ( 2724 int8_accum numeric_collect numeric_stddev_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2725 int4_accum numeric_collect numeric_stddev_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2726 int2_accum numeric_collect numeric_stddev_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2727 float4_accum float8_collect float8_stddev_pop 0 1022 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2728 float8_accum float8_collect float8_stddev_pop 0 1022 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2729 numeric_accum numeric_collect numeric_stddev_pop 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2724 int8_accum numeric_collect numeric_stddev_pop 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2725 int4_accum numeric_collect numeric_stddev_pop 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2726 int2_accum numeric_collect numeric_stddev_pop 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2727 float4_accum float8_collect float8_stddev_pop 0 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2728 float8_accum float8_collect float8_stddev_pop 0 1022 
"{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2729 numeric_accum numeric_collect numeric_stddev_pop 0 1231 "{0,0,0}" "{0,0,0}" )); #endif #ifdef PGXC //DATA(insert ( 2724 int8_accum numeric_stddev_pop 0 1231 "{0,0,0}" )); @@ -323,12 +320,12 @@ DATA(insert ( 2729 numeric_accum numeric_collect numeric_stddev_pop 0 1231 1231 /* stddev_samp */ #ifdef PGXC -DATA(insert ( 2712 int8_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2713 int4_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2714 int2_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2715 float4_accum float8_collect float8_stddev_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2716 float8_accum float8_collect float8_stddev_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2717 numeric_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2712 int8_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2713 int4_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2714 int2_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2715 float4_accum float8_collect float8_stddev_samp 0 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2716 float8_accum float8_collect float8_stddev_samp 0 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2717 numeric_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); #endif #ifdef PGXC //DATA(insert ( 2712 int8_accum numeric_stddev_samp 0 1231 "{0,0,0}" )); @@ -341,12 +338,12 @@ DATA(insert ( 2717 numeric_accum numeric_collect numeric_stddev_samp 0 1231 1231 /* stddev: historical Postgres syntax for stddev_samp */ #ifdef PGXC -DATA(insert ( 2154 int8_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2155 int4_accum numeric_collect numeric_stddev_samp 0 1231 
1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2156 int2_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2157 float4_accum float8_collect float8_stddev_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2158 float8_accum float8_collect float8_stddev_samp 0 1022 1022 "{0,0,0}" "{0,0,0}" )); -DATA(insert ( 2159 numeric_accum numeric_collect numeric_stddev_samp 0 1231 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2154 int8_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2155 int4_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2156 int2_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2157 float4_accum float8_collect float8_stddev_samp 0 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2158 float8_accum float8_collect float8_stddev_samp 0 1022 "{0,0,0}" "{0,0,0}" )); +DATA(insert ( 2159 numeric_accum numeric_collect numeric_stddev_samp 0 1231 "{0,0,0}" "{0,0,0}" )); #endif #ifdef PGXC //DATA(insert ( 2154 int8_accum numeric_stddev_samp 0 1231 "{0,0,0}" )); @@ -359,18 +356,18 @@ DATA(insert ( 2159 numeric_accum numeric_collect numeric_stddev_samp 0 1231 1231 /* SQL2003 binary regression aggregates */ #ifdef PGXC -DATA(insert ( 2818 int8inc_float8_float8 int8_sum_to_int8 - 0 20 20 "0" _null_ )); -DATA(insert ( 2819 float8_regr_accum float8_regr_collect float8_regr_sxx 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2820 float8_regr_accum float8_regr_collect float8_regr_syy 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2821 float8_regr_accum float8_regr_collect float8_regr_sxy 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2822 float8_regr_accum float8_regr_collect float8_regr_avgx 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2823 float8_regr_accum float8_regr_collect float8_regr_avgy 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 
2824 float8_regr_accum float8_regr_collect float8_regr_r2 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2825 float8_regr_accum float8_regr_collect float8_regr_slope 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2826 float8_regr_accum float8_regr_collect float8_regr_intercept 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2827 float8_regr_accum float8_regr_collect float8_covar_pop 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2828 float8_regr_accum float8_regr_collect float8_covar_samp 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); -DATA(insert ( 2829 float8_regr_accum float8_regr_collect float8_corr 0 1022 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2818 int8inc_float8_float8 int8_sum_to_int8 - 0 20 "0" _null_ )); +DATA(insert ( 2819 float8_regr_accum float8_regr_collect float8_regr_sxx 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2820 float8_regr_accum float8_regr_collect float8_regr_syy 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2821 float8_regr_accum float8_regr_collect float8_regr_sxy 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2822 float8_regr_accum float8_regr_collect float8_regr_avgx 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2823 float8_regr_accum float8_regr_collect float8_regr_avgy 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2824 float8_regr_accum float8_regr_collect float8_regr_r2 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2825 float8_regr_accum float8_regr_collect float8_regr_slope 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2826 float8_regr_accum float8_regr_collect float8_regr_intercept 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2827 float8_regr_accum float8_regr_collect float8_covar_pop 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert ( 2828 float8_regr_accum float8_regr_collect float8_covar_samp 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); +DATA(insert 
( 2829 float8_regr_accum float8_regr_collect float8_corr 0 1022 "{0,0,0,0,0,0}" "{0,0,0,0,0,0}" )); #endif #ifdef PGXC //DATA(insert ( 2818 int8inc_float8_float8 - 0 20 "0" )); @@ -389,9 +386,9 @@ DATA(insert ( 2829 float8_regr_accum float8_regr_collect float8_corr 0 1022 1 /* boolean-and and boolean-or */ #ifdef PGXC -DATA(insert ( 2517 booland_statefunc booland_statefunc - 0 16 16 _null_ _null_ )); -DATA(insert ( 2518 boolor_statefunc boolor_statefunc - 0 16 16 _null_ _null_ )); -DATA(insert ( 2519 booland_statefunc booland_statefunc - 0 16 16 _null_ _null_ )); +DATA(insert ( 2517 booland_statefunc booland_statefunc - 0 16 _null_ _null_ )); +DATA(insert ( 2518 boolor_statefunc boolor_statefunc - 0 16 _null_ _null_ )); +DATA(insert ( 2519 booland_statefunc booland_statefunc - 0 16 _null_ _null_ )); #endif #ifdef PGXC //DATA(insert ( 2517 booland_statefunc - 0 16 _null_ )); @@ -401,14 +398,14 @@ DATA(insert ( 2519 booland_statefunc booland_statefunc - 0 16 16 _null_ _null /* bitwise integer */ #ifdef PGXC -DATA(insert ( 2236 int2and int2and - 0 21 21 _null_ _null_ )); -DATA(insert ( 2237 int2or int2or - 0 21 21 _null_ _null_ )); -DATA(insert ( 2238 int4and int4and - 0 23 23 _null_ _null_ )); -DATA(insert ( 2239 int4or int4or - 0 23 23 _null_ _null_ )); -DATA(insert ( 2240 int8and int8and - 0 20 20 _null_ _null_ )); -DATA(insert ( 2241 int8or int8or - 0 20 20 _null_ _null_ )); -DATA(insert ( 2242 bitand bitand - 0 1560 1560 _null_ _null_ )); -DATA(insert ( 2243 bitor bitor - 0 1560 1560 _null_ _null_ )); +DATA(insert ( 2236 int2and int2and - 0 21 _null_ _null_ )); +DATA(insert ( 2237 int2or int2or - 0 21 _null_ _null_ )); +DATA(insert ( 2238 int4and int4and - 0 23 _null_ _null_ )); +DATA(insert ( 2239 int4or int4or - 0 23 _null_ _null_ )); +DATA(insert ( 2240 int8and int8and - 0 20 _null_ _null_ )); +DATA(insert ( 2241 int8or int8or - 0 20 _null_ _null_ )); +DATA(insert ( 2242 bitand bitand - 0 1560 _null_ _null_ )); +DATA(insert ( 2243 bitor bitor - 0 1560 _null_ 
_null_ )); #endif #ifdef PGXC //DATA(insert ( 2236 int2and - 0 21 _null_ )); @@ -423,7 +420,7 @@ DATA(insert ( 2243 bitor bitor - 0 1560 1560 _null_ _null_ )); /* xml */ #ifdef PGXC -DATA(insert ( 2901 xmlconcat2 xmlconcat2 - 0 142 142 _null_ _null_ )); +DATA(insert ( 2901 xmlconcat2 xmlconcat2 - 0 142 _null_ _null_ )); #endif #ifdef PGXC //DATA(insert ( 2901 xmlconcat2 - 0 142 _null_ )); @@ -459,7 +456,6 @@ extern void AggregateCreate(const char *aggName, List *aggsortopName, Oid aggTransType, #ifdef PGXC - Oid aggCollectType, const char *agginitval, const char *agginitcollect); #else ----------------------------------------------------------------------- Summary of changes: src/backend/catalog/pg_aggregate.c | 21 +-- src/backend/commands/aggregatecmds.c | 5 +- src/backend/executor/nodeAgg.c | 91 ++---------- src/include/catalog/pg_aggregate.h | 250 +++++++++++++++++----------------- 4 files changed, 147 insertions(+), 220 deletions(-) hooks/post-receive -- Postgres-XC |
From: Abbas B. <ga...@us...> - 2011-06-24 18:01:56
|
Project "Postgres-XC". The branch, master has been updated via ff7be6e332b36fc7aad99876bf107e258264a7f1 (commit) from d56caa5e2ac517b83595586987794337c9dea357 (commit) - Log ----------------------------------------------------------------- commit ff7be6e332b36fc7aad99876bf107e258264a7f1 Author: Abbas <abb...@en...> Date: Fri Jun 24 22:59:57 2011 +0500 This patch adds a system in XC to cancel a running query, and flush network buffers of any results data nodes might have sent before cancelling the query. This was required to fix certain issues where coordinator encounters an error while processing rows from data nodes and quits row processing. It then issues a new query and finds an old row description in the network buffer. This can and was crashing the server. To cancel a query a new pooler command 'h' is added. This command is sent to the pooler by the coordinator and the pooler issues PQcancel to the respective data nodes. Cancel request is sent every time coordinator raises an error of level more than ERROR. 
This commit fixes bug 3306801 diff --git a/src/backend/access/common/heaptuple.c b/src/backend/access/common/heaptuple.c index c0103b8..d34f002 100644 --- a/src/backend/access/common/heaptuple.c +++ b/src/backend/access/common/heaptuple.c @@ -1130,7 +1130,7 @@ slot_deform_tuple(TupleTableSlot *slot, int natts) static void slot_deform_datarow(TupleTableSlot *slot) { - int attnum = slot->tts_tupleDescriptor->natts; + int attnum; int i; int col_count; char *cur = slot->tts_dataRow; @@ -1138,6 +1138,11 @@ slot_deform_datarow(TupleTableSlot *slot) uint16 n16; uint32 n32; + if (slot->tts_tupleDescriptor == NULL || slot->tts_dataRow == NULL) + return; + + attnum = slot->tts_tupleDescriptor->natts; + /* fastpath: exit if values already extracted */ if (slot->tts_nvalid == attnum) return; diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index ea30453..2f77f5e 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -1195,8 +1195,8 @@ pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, int handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner) { - char *msg; - int msg_len; + char *msg; + int msg_len; char msg_type; bool suspended = false; @@ -1327,6 +1327,64 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner) /* + * Has the data node sent Ready For Query + */ + +bool +is_data_node_ready(PGXCNodeHandle * conn) +{ + char *msg; + int msg_len; + char msg_type; + bool suspended = false; + + for (;;) + { + /* + * If we are in the process of shutting down, we + * may be rolling back, and the buffer may contain other messages. + * We want to avoid a procarray exception + * as well as an error stack overflow. 
+ */ + if (proc_exit_inprogress) + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; + + /* don't read from from the connection if there is a fatal error */ + if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) + return true; + + /* No data available, exit */ + if (!HAS_MESSAGE_BUFFERED(conn)) + return false; + + msg_type = get_message(conn, &msg_len, &msg); + switch (msg_type) + { + case 's': /* PortalSuspended */ + suspended = true; + break; + + case 'Z': /* ReadyForQuery */ + { + /* + * Return result depends on previous connection state. + * If it was PORTAL_SUSPENDED coordinator want to send down + * another EXECUTE to fetch more rows, otherwise it is done + * with the connection + */ + int result = suspended ? RESPONSE_SUSPENDED : RESPONSE_COMPLETE; + conn->transaction_status = msg[0]; + conn->state = DN_CONNECTION_STATE_IDLE; + conn->combiner = NULL; + return true; + } + } + } + /* never happen, but keep compiler quiet */ + return false; +} + +/* * Send BEGIN command to the Datanodes or Coordinators and receive responses */ static int @@ -2453,7 +2511,7 @@ DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle** if (bytes_needed > COPY_BUFFER_SIZE) { /* First look if data node has sent a error message */ - int read_status = pgxc_node_read_data(primary_handle); + int read_status = pgxc_node_read_data(primary_handle, true); if (read_status == EOF || read_status < 0) { add_error_message(primary_handle, "failed to read data from data node"); @@ -2514,7 +2572,7 @@ DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle** int to_send = handle->outEnd; /* First look if data node has sent a error message */ - int read_status = pgxc_node_read_data(handle); + int read_status = pgxc_node_read_data(handle, true); if (read_status == EOF || read_status < 0) { add_error_message(handle, "failed to read data from data node"); @@ -2615,7 +2673,7 @@ DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, FILE* if 
(handle_response(handle,combiner) == RESPONSE_EOF) { /* read some extra-data */ - read_status = pgxc_node_read_data(handle); + read_status = pgxc_node_read_data(handle, true); if (read_status < 0) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), @@ -2679,30 +2737,9 @@ DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_data_node, if (primary_handle) { + error = true; if (primary_handle->state == DN_CONNECTION_STATE_COPY_IN || primary_handle->state == DN_CONNECTION_STATE_COPY_OUT) - { - /* msgType + msgLen */ - if (ensure_out_buffer_capacity(primary_handle->outEnd + 1 + 4, primary_handle) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - } - - primary_handle->outBuffer[primary_handle->outEnd++] = 'c'; - memcpy(primary_handle->outBuffer + primary_handle->outEnd, &nLen, 4); - primary_handle->outEnd += 4; - - /* We need response right away, so send immediately */ - if (pgxc_node_flush(primary_handle) < 0) - { - error = true; - } - } - else - { - error = true; - } + error = DataNodeCopyEnd(primary_handle, false); combiner = CreateResponseCombiner(conn_count + 1, combine_type); error = (pgxc_node_receive_responses(1, &primary_handle, timeout, combiner) != 0) || error; @@ -2712,30 +2749,9 @@ DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_data_node, { PGXCNodeHandle *handle = connections[i]; + error = true; if (handle->state == DN_CONNECTION_STATE_COPY_IN || handle->state == DN_CONNECTION_STATE_COPY_OUT) - { - /* msgType + msgLen */ - if (ensure_out_buffer_capacity(handle->outEnd + 1 + 4, handle) != 0) - { - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - } - - handle->outBuffer[handle->outEnd++] = 'c'; - memcpy(handle->outBuffer + handle->outEnd, &nLen, 4); - handle->outEnd += 4; - - /* We need response right away, so send immediately */ - if (pgxc_node_flush(handle) < 0) - { - error = true; - } - } - else - { - error = true; - } + error = 
DataNodeCopyEnd(handle, false); } need_tran = !autocommit || primary_handle || conn_count > 1; @@ -2750,6 +2766,36 @@ DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_data_node, errmsg("Error while running COPY"))); } +/* + * End copy process on a connection + */ +bool +DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error) +{ + int nLen = htonl(4); + + if (handle == NULL) + return true; + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + 4, handle) != 0) + return true; + + if (is_error) + handle->outBuffer[handle->outEnd++] = 'f'; + else + handle->outBuffer[handle->outEnd++] = 'c'; + + memcpy(handle->outBuffer + handle->outEnd, &nLen, 4); + handle->outEnd += 4; + + /* We need response right away, so send immediately */ + if (pgxc_node_flush(handle) < 0) + return true; + + return false; +} + RemoteQueryState * ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags) { @@ -3296,7 +3342,9 @@ do_query(RemoteQueryState *node) while (true) { int res; - pgxc_node_receive(1, &primaryconnection, NULL); + if (pgxc_node_receive(1, &primaryconnection, NULL)) + break; + res = handle_response(primaryconnection, node); if (res == RESPONSE_COMPLETE) break; @@ -4248,7 +4296,8 @@ ExecRemoteUtility(RemoteQuery *node) { int i = 0; - pgxc_node_receive(dn_conn_count, pgxc_connections->datanode_handles, NULL); + if (pgxc_node_receive(dn_conn_count, pgxc_connections->datanode_handles, NULL)) + break; /* * Handle input from the data nodes. 
* We do not expect data nodes returning tuples when running utility @@ -4296,7 +4345,9 @@ ExecRemoteUtility(RemoteQuery *node) { int i = 0; - pgxc_node_receive(co_conn_count, pgxc_connections->coord_handles, NULL); + if (pgxc_node_receive(co_conn_count, pgxc_connections->coord_handles, NULL)) + break; + while (i < co_conn_count) { int res = handle_response(pgxc_connections->coord_handles[i], remotestate); diff --git a/src/backend/pgxc/pool/pgxcnode.c b/src/backend/pgxc/pool/pgxcnode.c index a0b8da4..a2e90ce 100644 --- a/src/backend/pgxc/pool/pgxcnode.c +++ b/src/backend/pgxc/pool/pgxcnode.c @@ -20,6 +20,7 @@ #include <sys/select.h> #include <sys/time.h> #include <sys/types.h> +#include <sys/ioctl.h> #include <stdlib.h> #include <string.h> #include <unistd.h> @@ -279,21 +280,35 @@ pgxc_node_init(PGXCNodeHandle *handle, int sock, int nodenum) * Wait while at least one of specified connections has data available and read * the data into the buffer */ -int +bool pgxc_node_receive(const int conn_count, PGXCNodeHandle ** connections, struct timeval * timeout) { +#define ERROR_OCCURED true +#define NO_ERROR_OCCURED false int i, res_select, nfds = 0; - fd_set readfds; + fd_set readfds; + bool is_msg_buffered; FD_ZERO(&readfds); + + is_msg_buffered = false; + for (i = 0; i < conn_count; i++) + { + /* If connection has a buffered message */ + if (HAS_MESSAGE_BUFFERED(connections[i])) + { + is_msg_buffered = true; + break; + } + } + for (i = 0; i < conn_count; i++) { /* If connection finished sending do not wait input from it */ - if (connections[i]->state == DN_CONNECTION_STATE_IDLE - || HAS_MESSAGE_BUFFERED(connections[i])) + if (connections[i]->state == DN_CONNECTION_STATE_IDLE || HAS_MESSAGE_BUFFERED(connections[i])) continue; /* prepare select params */ @@ -313,7 +328,11 @@ pgxc_node_receive(const int conn_count, * Return if we do not have connections to receive input */ if (nfds == 0) - return 0; + { + if (is_msg_buffered) + return NO_ERROR_OCCURED; + return 
ERROR_OCCURED; + } retry: res_select = select(nfds + 1, &readfds, NULL, NULL, timeout); @@ -328,14 +347,16 @@ retry: elog(WARNING, "select() bad file descriptor set"); } elog(WARNING, "select() error: %d", errno); - return errno; + if (errno) + return ERROR_OCCURED; + return NO_ERROR_OCCURED; } if (res_select == 0) { /* Handle timeout */ elog(WARNING, "timeout while waiting for response"); - return EOF; + return ERROR_OCCURED; } /* read data */ @@ -345,7 +366,7 @@ retry: if (FD_ISSET(conn->sock, &readfds)) { - int read_status = pgxc_node_read_data(conn); + int read_status = pgxc_node_read_data(conn, true); if (read_status == EOF || read_status < 0) { @@ -354,26 +375,46 @@ retry: add_error_message(conn, "unexpected EOF on datanode connection"); elog(WARNING, "unexpected EOF on datanode connection"); /* Should we read from the other connections before returning? */ - return EOF; + return ERROR_OCCURED; } } } - return 0; + return NO_ERROR_OCCURED; } +/* + * Is there any data enqueued in the TCP input buffer waiting + * to be read sent by the PGXC node connection + */ + +int +pgxc_node_is_data_enqueued(PGXCNodeHandle *conn) +{ + int ret; + int enqueued; + + if (conn->sock < 0) + return 0; + ret = ioctl(conn->sock, FIONREAD, &enqueued); + if (ret != 0) + return 0; + + return enqueued; +} /* * Read up incoming messages from the PGXC node connection */ int -pgxc_node_read_data(PGXCNodeHandle *conn) +pgxc_node_read_data(PGXCNodeHandle *conn, bool close_if_error) { int someread = 0; int nread; if (conn->sock < 0) { - add_error_message(conn, "bad socket"); + if (close_if_error) + add_error_message(conn, "bad socket"); return EOF; } @@ -412,7 +453,8 @@ pgxc_node_read_data(PGXCNodeHandle *conn) */ if (conn->inSize - conn->inEnd < 100) { - add_error_message(conn, "can not allocate buffer"); + if (close_if_error) + add_error_message(conn, "can not allocate buffer"); return -1; } } @@ -424,7 +466,8 @@ retry: if (nread < 0) { - elog(DEBUG1, "dnrd errno = %d", errno); + if 
(close_if_error) + elog(DEBUG1, "dnrd errno = %d", errno); if (errno == EINTR) goto retry; /* Some systems return EAGAIN/EWOULDBLOCK for no data */ @@ -444,19 +487,22 @@ retry: * OK, we are getting a zero read even though select() says ready. This * means the connection has been closed. Cope. */ - add_error_message(conn, - "data node closed the connection unexpectedly\n" - "\tThis probably means the data node terminated abnormally\n" - "\tbefore or while processing the request.\n"); - conn->state = DN_CONNECTION_STATE_ERROR_FATAL; /* No more connection to - * backend */ - closesocket(conn->sock); - conn->sock = NO_SOCKET; - + if (close_if_error) + { + add_error_message(conn, + "data node closed the connection unexpectedly\n" + "\tThis probably means the data node terminated abnormally\n" + "\tbefore or while processing the request.\n"); + conn->state = DN_CONNECTION_STATE_ERROR_FATAL; /* No more connection to + * backend */ + closesocket(conn->sock); + conn->sock = NO_SOCKET; + } return -1; } #endif - add_error_message(conn, "could not receive data from server"); + if (close_if_error) + add_error_message(conn, "could not receive data from server"); return -1; } @@ -488,7 +534,8 @@ retry: if (nread == 0) { - elog(DEBUG1, "nread returned 0"); + if (close_if_error) + elog(DEBUG1, "nread returned 0"); return EOF; } @@ -661,6 +708,102 @@ release_handles(void) coord_count = 0; } +/* + * cancel a running query due to error while processing rows + */ +void +cancel_query(void) +{ + int i; + int dn_cancel[NumDataNodes]; + int co_cancel[NumCoords]; + int dn_count = 0; + int co_count = 0; + + if (datanode_count == 0 && coord_count == 0) + return; + + /* Collect Data Nodes handles */ + for (i = 0; i < NumDataNodes; i++) + { + PGXCNodeHandle *handle = &dn_handles[i]; + + if (handle->sock != NO_SOCKET) + { + if (handle->state == DN_CONNECTION_STATE_COPY_IN || handle->state == DN_CONNECTION_STATE_COPY_OUT) + { + DataNodeCopyEnd(handle, true); + } + else + { + if (handle->state != 
DN_CONNECTION_STATE_IDLE) + { + dn_cancel[dn_count++] = handle->nodenum; + } + } + } + } + + /* Collect Coordinator handles */ + for (i = 0; i < NumCoords; i++) + { + PGXCNodeHandle *handle = &co_handles[i]; + + if (handle->sock != NO_SOCKET) + { + if (handle->state == DN_CONNECTION_STATE_COPY_IN || handle->state == DN_CONNECTION_STATE_COPY_OUT) + { + DataNodeCopyEnd(handle, true); + } + else + { + if (handle->state != DN_CONNECTION_STATE_IDLE) + { + co_cancel[dn_count++] = handle->nodenum; + } + } + } + } + + PoolManagerCancelQuery(dn_count, dn_cancel, co_count, co_cancel); +} + +/* + * This method won't return until all network buffers are empty + * To ensure all data in all network buffers is read and wasted + */ +void +clear_all_data(void) +{ + int i; + + if (datanode_count == 0 && coord_count == 0) + return; + + /* Collect Data Nodes handles */ + for (i = 0; i < NumDataNodes; i++) + { + PGXCNodeHandle *handle = &dn_handles[i]; + + if (handle->sock != NO_SOCKET && handle->state != DN_CONNECTION_STATE_IDLE) + { + pgxc_node_flush_read(handle); + handle->state = DN_CONNECTION_STATE_IDLE; + } + } + + /* Collect Coordinator handles */ + for (i = 0; i < NumCoords; i++) + { + PGXCNodeHandle *handle = &co_handles[i]; + + if (handle->sock != NO_SOCKET && handle->state != DN_CONNECTION_STATE_IDLE) + { + pgxc_node_flush_read(handle); + handle->state = DN_CONNECTION_STATE_IDLE; + } + } +} /* * Ensure specified amount of data can fit to the incoming buffer and @@ -1224,6 +1367,31 @@ pgxc_node_flush(PGXCNodeHandle *handle) } /* + * This method won't return until network buffer is empty or error occurs + * To ensure all data in network buffers is read and wasted + */ +void +pgxc_node_flush_read(PGXCNodeHandle *handle) +{ + bool is_ready; + int read_result; + + if (handle == NULL) + return; + + while(true) + { + is_ready = is_data_node_ready(handle); + if (is_ready == true) + break; + + read_result = pgxc_node_read_data(handle, false); + if (read_result < 0) + break; + } +} + 
+/* * Send specified statement down to the PGXC node */ int diff --git a/src/backend/pgxc/pool/poolcomm.c b/src/backend/pgxc/pool/poolcomm.c index 79a3776..22dc813 100644 --- a/src/backend/pgxc/pool/poolcomm.c +++ b/src/backend/pgxc/pool/poolcomm.c @@ -435,9 +435,7 @@ pool_flush(PoolPort *port) * If shutting down already, do not call. */ if (!proc_exit_inprogress) - ereport(ERROR, - (errcode_for_socket_access(), - errmsg("could not send data to client: %m"))); + return 0; } /* diff --git a/src/backend/pgxc/pool/poolmgr.c b/src/backend/pgxc/pool/poolmgr.c index 1b2c4bf..463bd5a 100644 --- a/src/backend/pgxc/pool/poolmgr.c +++ b/src/backend/pgxc/pool/poolmgr.c @@ -104,6 +104,7 @@ static DatabasePool *find_database_pool_to_clean(const char *database, List *co_list); static DatabasePool *remove_database_pool(const char *database, const char *user_name); static int *agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist); +static int cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist); static PGXCNodePoolSlot *acquire_connection(DatabasePool *dbPool, int node, char client_conn_type); static void agent_release_connections(PoolAgent *agent, List *dn_discard, List *co_discard); static void agent_reset_params(PoolAgent *agent, List *dn_list, List *co_list); @@ -878,17 +879,17 @@ agent_handle_input(PoolAgent * agent, StringInfo s) */ for (;;) { - const char *database = NULL; - const char *user_name = NULL; - const char *set_command; + const char *database = NULL; + const char *user_name = NULL; + const char *set_command; bool is_local; - int datanodecount; - int coordcount; - List *datanodelist = NIL; - List *coordlist = NIL; - int *fds; - int *pids; - int i, len, res; + int datanodecount; + int coordcount; + List *datanodelist = NIL; + List *coordlist = NIL; + int *fds; + int *pids; + int i, len, res; /* * During a pool cleaning, Abort, Connect and Get Connections messages @@ -1001,6 +1002,32 @@ 
agent_handle_input(PoolAgent * agent, StringInfo s) if (fds) pfree(fds); break; + + case 'h': /* Cancel SQL Command in progress on specified connections */ + /* + * Length of message is caused by: + * - Message header = 4bytes + * - List of datanodes = NumDataNodes * 4bytes (max) + * - List of coordinators = NumCoords * 4bytes (max) + * - Number of Datanodes sent = 4bytes + * - Number of Coordinators sent = 4bytes + */ + pool_getmessage(&agent->port, s, 4 * NumDataNodes + 4 * NumCoords + 12); + datanodecount = pq_getmsgint(s, 4); + for (i = 0; i < datanodecount; i++) + datanodelist = lappend_int(datanodelist, pq_getmsgint(s, 4)); + coordcount = pq_getmsgint(s, 4); + /* It is possible that no Coordinators are involved in the transaction */ + for (i = 0; i < coordcount; i++) + coordlist = lappend_int(coordlist, pq_getmsgint(s, 4)); + pq_getmsgend(s); + + cancel_query_on_connections(agent, datanodelist, coordlist); + list_free(datanodelist); + list_free(coordlist); + + break; + case 'r': /* RELEASE CONNECTIONS */ pool_getmessage(&agent->port, s, 4 * NumDataNodes + 4 * NumCoords + 12); datanodecount = pq_getmsgint(s, 4); @@ -1245,6 +1272,61 @@ agent_acquire_connections(PoolAgent *agent, List *datanodelist, List *coordlist) return result; } +/* + * Cancel query + */ +static int +cancel_query_on_connections(PoolAgent *agent, List *datanodelist, List *coordlist) +{ + int i; + ListCell *nodelist_item; + char errbuf[256]; + int nCount; + bool bRet; + + nCount = 0; + + if (agent == NULL) + return nCount; + + /* Send cancel on Data nodes first */ + foreach(nodelist_item, datanodelist) + { + int node = lfirst_int(nodelist_item); + + if(node <= 0 || node > NumDataNodes) + continue; + + if (agent->dn_connections == NULL) + break; + + bRet = PQcancel((PGcancel *) agent->dn_connections[node - 1]->xc_cancelConn, errbuf, sizeof(errbuf)); + if (bRet != false) + { + nCount++; + } + } + + /* Send cancel to Coordinators too, e.g. 
if DDL was in progress */ + foreach(nodelist_item, coordlist) + { + int node = lfirst_int(nodelist_item); + + if(node <= 0 || node > NumDataNodes) + continue; + + if (agent->coord_connections == NULL) + break; + + bRet = PQcancel((PGcancel *) agent->coord_connections[node - 1]->xc_cancelConn, errbuf, sizeof(errbuf)); + if (bRet != false) + { + nCount++; + } + } + + return nCount; +} /* * Return connections back to the pool @@ -1262,6 +1344,9 @@ PoolManagerReleaseConnections(int dn_ndisc, int* dn_discard, int co_ndisc, int* Assert(Handle); + if (dn_ndisc == 0 && co_ndisc == 0) + return; + /* Insert the list of Datanodes in buffer */ n32 = htonl((uint32) dn_ndisc); buf[0] = n32; @@ -1290,6 +1375,52 @@ PoolManagerReleaseConnections(int dn_ndisc, int* dn_discard, int co_ndisc, int* pool_flush(&Handle->port); } +/* + * Cancel Query + */ +void +PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list) +{ + uint32 n32; + /* + * Buffer contains the list of both Coordinator and Datanodes, as well + * as the number of connections + */ + uint32 buf[2 + dn_count + co_count]; + int i; + + if (Handle == NULL || dn_list == NULL || co_list == NULL) + return; + + if (dn_count == 0 && co_count == 0) + return; + + /* Insert the list of Datanodes in buffer */ + n32 = htonl((uint32) dn_count); + buf[0] = n32; + + for (i = 0; i < dn_count;) + { + n32 = htonl((uint32) dn_list[i++]); + buf[i] = n32; + } + + /* Insert the list of Coordinators in buffer */ + n32 = htonl((uint32) co_count); + buf[dn_count + 1] = n32; + + /* Not necessary to send to pooler a request if there is no Coordinator */ + if (co_count != 0) + { + for (i = dn_count + 1; i < (dn_count + co_count + 1);) + { + n32 = htonl((uint32) co_list[i - (dn_count + 1)]); + buf[++i] = n32; + } + } + pool_putmessage(&Handle->port, 'h', (char *) buf, (2 + dn_count + co_count) * sizeof(uint32)); + pool_flush(&Handle->port); +} /* * Release connections for Datanodes and Coordinators @@ -1950,6 +2081,8 @@ 
grow_pool(DatabasePool * dbPool, int index, char client_conn_type) break; } + slot->xc_cancelConn = PQgetCancel(slot->conn); + /* Insert at the end of the pool */ nodePool->slot[(nodePool->freeSize)++] = slot; @@ -1968,6 +2101,7 @@ grow_pool(DatabasePool * dbPool, int index, char client_conn_type) static void destroy_slot(PGXCNodePoolSlot *slot) { + PQfreeCancel(slot->xc_cancelConn); PGXCNodeClose(slot->conn); pfree(slot); } diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index b2fab35..60e9cac 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -71,6 +71,9 @@ #include "utils/guc.h" #include "utils/memutils.h" #include "utils/ps_status.h" +#ifdef PGXC +#include "pgxc/pgxc.h" +#endif #undef _ @@ -221,6 +224,13 @@ errstart(int elevel, const char *filename, int lineno, */ if (elevel >= ERROR) { +#ifdef PGXC + if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + { + cancel_query(); + clear_all_data(); + } +#endif /* * If we are inside a critical section, all errors become PANIC * errors. See miscadmin.h. @@ -1121,6 +1131,14 @@ elog_finish(int elevel, const char *fmt,...) CHECK_STACK_DEPTH(); +#ifdef PGXC + if (elevel >= ERROR && IS_PGXC_COORDINATOR && !IsConnFromCoord()) + { + cancel_query(); + clear_all_data(); + } +#endif + /* * Do errstart() to see if we actually want to report the message. 
*/ diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 7cdb0f6..d864470 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -234,5 +234,4 @@ extern int xactGetCommittedChildren(TransactionId **ptr); extern void xact_redo(XLogRecPtr lsn, XLogRecord *record); extern void xact_desc(StringInfo buf, uint8 xl_info, char *rec); - #endif /* XACT_H */ diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index c2fe884..48d23ca 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -141,6 +141,7 @@ extern PGXCNodeHandle** DataNodeCopyBegin(const char *query, List *nodelist, Sna extern int DataNodeCopyIn(char *data_row, int len, ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections); extern uint64 DataNodeCopyOut(ExecNodes *exec_nodes, PGXCNodeHandle** copy_connections, FILE* copy_file); extern void DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int primary_data_node, CombineType combine_type); +extern bool DataNodeCopyEnd(PGXCNodeHandle *handle, bool is_error); extern int DataNodeCopyInBinaryForAll(char *msg_buf, int len, PGXCNodeHandle** copy_connections); extern int ExecCountSlotsRemoteQuery(RemoteQuery *node); @@ -150,10 +151,8 @@ extern void ExecEndRemoteQuery(RemoteQueryState *step); extern void ExecRemoteUtility(RemoteQuery *node); extern int handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner); -#ifdef PGXC -extern void HandleCmdComplete(CmdType commandType, CombineTag *combine, const char *msg_body, - size_t len); -#endif +extern bool is_data_node_ready(PGXCNodeHandle * conn); +extern void HandleCmdComplete(CmdType commandType, CombineTag *combine, const char *msg_body, size_t len); extern bool FetchTuple(RemoteQueryState *combiner, TupleTableSlot *slot); extern void BufferConnection(PGXCNodeHandle *conn); diff --git a/src/include/pgxc/pgxcnode.h b/src/include/pgxc/pgxcnode.h index 8f1eb54..4b66a75 100644 --- a/src/include/pgxc/pgxcnode.h +++ 
b/src/include/pgxc/pgxcnode.h @@ -28,6 +28,7 @@ /* Connection to data node maintained by Pool Manager */ typedef struct PGconn NODE_CONNECTION; +typedef struct PGcancel NODE_CANCEL; /* Helper structure to access data node from Session */ typedef enum @@ -105,6 +106,9 @@ extern void PGXCNodeCleanAndRelease(int code, Datum arg); extern PGXCNodeAllHandles *get_handles(List *datanodelist, List *coordlist, bool is_query_coord_only); extern void release_handles(void); +extern void cancel_query(void); +extern void clear_all_data(void); + extern int get_transaction_nodes(PGXCNodeHandle ** connections, char client_conn_type, @@ -130,11 +134,14 @@ extern int pgxc_node_send_gxid(PGXCNodeHandle * handle, GlobalTransactionId gxid extern int pgxc_node_send_snapshot(PGXCNodeHandle * handle, Snapshot snapshot); extern int pgxc_node_send_timestamp(PGXCNodeHandle * handle, TimestampTz timestamp); -extern int pgxc_node_receive(const int conn_count, +extern bool pgxc_node_receive(const int conn_count, PGXCNodeHandle ** connections, struct timeval * timeout); -extern int pgxc_node_read_data(PGXCNodeHandle * conn); +extern int pgxc_node_read_data(PGXCNodeHandle * conn, bool close_if_error); +extern int pgxc_node_is_data_enqueued(PGXCNodeHandle *conn); + extern int send_some(PGXCNodeHandle * handle, int len); extern int pgxc_node_flush(PGXCNodeHandle *handle); +extern void pgxc_node_flush_read(PGXCNodeHandle *handle); extern int pgxc_all_handles_send_gxid(PGXCNodeAllHandles *pgxc_handles, GlobalTransactionId gxid, bool stop_at_error); extern int pgxc_all_handles_send_query(PGXCNodeAllHandles *pgxc_handles, const char *buffer, bool stop_at_error); diff --git a/src/include/pgxc/poolmgr.h b/src/include/pgxc/poolmgr.h index 4de9e4a..7939768 100644 --- a/src/include/pgxc/poolmgr.h +++ b/src/include/pgxc/poolmgr.h @@ -35,6 +35,7 @@ typedef struct { struct timeval released; NODE_CONNECTION *conn; + NODE_CANCEL *xc_cancelConn; } PGXCNodePoolSlot; /* Pool of connections to specified pgxc node */ 
@@ -149,4 +150,7 @@ extern int PoolManagerAbortTransactions(char *dbname, char *username, int **proc /* Return connections back to the pool, for both Coordinator and Datanode connections */ extern void PoolManagerReleaseConnections(int dn_ndisc, int* dn_discard, int co_ndisc, int* co_discard); +/* Cancel a running query on data nodes as well as on other coordinators */ +extern void PoolManagerCancelQuery(int dn_count, int* dn_list, int co_count, int* co_list); + #endif diff --git a/src/test/regress/expected/domain_1.out b/src/test/regress/expected/domain_1.out index 07808af..02f1556 100644 --- a/src/test/regress/expected/domain_1.out +++ b/src/test/regress/expected/domain_1.out @@ -48,8 +48,7 @@ ERROR: value too long for type character varying(5) INSERT INTO basictest values ('88', 'haha', 'short', '123.1212'); -- Truncate numeric -- Test copy COPY basictest (testvarchar) FROM stdin; -- fail -ERROR: value too long for type character varying(5) -CONTEXT: COPY basictest, line 1, column testvarchar: "notsoshorttext" +ERROR: Error while running COPY COPY basictest (testvarchar) FROM stdin; select * from basictest order by 1, 2, 3, 4; testint4 | testtext | testvarchar | testnumeric @@ -129,8 +128,7 @@ select testint4arr[1], testchar4arr[2:2] from domarrtest order by 1, 2; COPY domarrtest FROM stdin; COPY domarrtest FROM stdin; -- fail -ERROR: value too long for type character varying(4) -CONTEXT: COPY domarrtest, line 1, column testchar4arr: "{qwerty,w,e}" +ERROR: Error while running COPY select * from domarrtest order by 1, 2; testint4arr | testchar4arr ---------------+--------------------- @@ -174,8 +172,7 @@ INSERT INTO nulltest values ('a', 'b', 'c', NULL, 'd'); -- Good COPY nulltest FROM stdin; --fail ERROR: Error while running COPY COPY nulltest FROM stdin; --fail -ERROR: domain dcheck does not allow null values -CONTEXT: COPY nulltest, line 1, column col5: null input +ERROR: Error while running COPY -- Last row is bad COPY nulltest FROM stdin; ERROR: Error while 
running COPY diff --git a/src/test/regress/expected/xc_distkey.out b/src/test/regress/expected/xc_distkey.out index d050b27..819952a 100644 --- a/src/test/regress/expected/xc_distkey.out +++ b/src/test/regress/expected/xc_distkey.out @@ -451,15 +451,15 @@ select * from ts_tab order by a; (2 rows) select * from ts_tab where a = 'May 10, 2011 00:01:02.03'; - a ------------------------- - 2011-05-10 00:01:02.03 + a +----------------------------- + Tue May 10 00:01:02.03 2011 (1 row) select * from ts_tab where a = 'August 14, 2001 23:59:59.99'; - a ------------------------- - 2001-08-14 23:59:59.99 + a +----------------------------- + Tue Aug 14 23:59:59.99 2001 (1 row) create table in_tab(a interval) distribute by modulo(a); @@ -517,15 +517,15 @@ select * from atim_tab order by a; (2 rows) select * from atim_tab where a = abstime('May 10, 2011 00:01:02.03'); - a ------------------------- - 2011-05-10 12:01:02+05 + a +------------------------------ + Tue May 10 00:01:02 2011 PDT (1 row) select * from atim_tab where a = abstime('Jun 23, 2001 23:59:59.99'); - a ------------------------- - 2001-06-24 11:59:59+05 + a +------------------------------ + Sat Jun 23 23:59:59 2001 PDT (1 row) create table rtim_tab(a reltime) distribute by modulo(a); @@ -563,13 +563,13 @@ select * from date_tab order by a; select * from date_tab where a = 'May 10, 2011'; a ------------ - 2011-05-10 + 05-10-2011 (1 row) select * from date_tab where a = 'August 23, 2001'; a ------------ - 2001-08-23 + 08-23-2001 (1 row) create table tstz_tab(a timestamp with time zone) distribute by modulo(a); @@ -583,15 +583,15 @@ select * from tstz_tab order by a; (2 rows) select * from tstz_tab where a = 'May 10, 2011 00:01:02.03 PST'; - a ---------------------------- - 2011-05-10 13:01:02.03+05 + a +--------------------------------- + Tue May 10 01:01:02.03 2011 PDT (1 row) select * from tstz_tab where a = 'Jun 23, 2001 23:59:59.99 PST'; - a ---------------------------- - 2001-06-24 12:59:59.99+05 + a 
+--------------------------------- + Sun Jun 24 00:59:59.99 2001 PDT (1 row) create table tstz_tab_h(a timestamp with time zone) distribute by hash(a); @@ -605,14 +605,14 @@ select * from tstz_tab_h order by a; (2 rows) select * from tstz_tab_h where a = 'May 10, 2011 00:01:02.03 PST'; - a ---------------------------- - 2011-05-10 13:01:02.03+05 + a +--------------------------------- + Tue May 10 01:01:02.03 2011 PDT (1 row) select * from tstz_tab_h where a = 'Jun 23, 2001 23:59:59.99 PST'; - a ---------------------------- - 2001-06-24 12:59:59.99+05 + a +--------------------------------- + Sun Jun 24 00:59:59.99 2001 PDT (1 row) diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule index 658f930..6b58aa7 100644 --- a/src/test/regress/serial_schedule +++ b/src/test/regress/serial_schedule @@ -42,7 +42,7 @@ test: comments test: geometry #After supporting other data types as distribution key, this test case crashes the server #Bug ID 3306801 tracks this crash -#test: horology +test: horology test: oidjoins test: type_sanity test: opr_sanity ----------------------------------------------------------------------- Summary of changes: src/backend/access/common/heaptuple.c | 7 +- src/backend/pgxc/pool/execRemote.c | 159 ++++++++++++++-------- src/backend/pgxc/pool/pgxcnode.c | 218 ++++++++++++++++++++++++++---- src/backend/pgxc/pool/poolcomm.c | 4 +- src/backend/pgxc/pool/poolmgr.c | 154 ++++++++++++++++++++-- src/backend/utils/error/elog.c | 18 +++ src/include/access/xact.h | 1 - src/include/pgxc/execRemote.h | 7 +- src/include/pgxc/pgxcnode.h | 11 ++- src/include/pgxc/poolmgr.h | 4 + src/test/regress/expected/domain_1.out | 9 +- src/test/regress/expected/xc_distkey.out | 52 ++++---- src/test/regress/serial_schedule | 2 +- 13 files changed, 513 insertions(+), 133 deletions(-) hooks/post-receive -- Postgres-XC |
From: Pavan D. <pa...@us...> - 2011-06-24 08:30:26
|
Project "Postgres-XC". The branch, master has been updated via d56caa5e2ac517b83595586987794337c9dea357 (commit) via 097c3c3816c410c6b570c6ef9aa656d4e1f9da2e (commit) via bc8d2c2e0127a90a7f5b01eb3a9be2673c2b4c04 (commit) via a6b077003d974ba5ab612d557eab811d9efc934b (commit) via ad889ead370a7061b9fa57d3b8ce8816b8c251f4 (commit) via b72426e3b1c0c13cd710781a1ff6fd65a96e82d8 (commit) via 9db03183fe491e60dda1a6a5b36b44c55149e077 (commit) via 3cbf503a660e19f5c48c57d3ecd4a746a468cd68 (commit) via 6bbdc5b5befa3ef1f6fbb7a5548b8aa7891873d6 (commit) via 246072c6301bf3e38331ee49e4ff9bd4bd42b9a4 (commit) from 2a828017d88ff64453b37771337646454316269c (commit) - Log ----------------------------------------------------------------- commit d56caa5e2ac517b83595586987794337c9dea357 Merge: 2a82801 097c3c3 Author: Pavan Deolasee <pav...@gm...> Date: Fri Jun 24 13:47:34 2011 +0530 Merge branch 'pgxc-barrier-rebase' into PGXC-master commit 097c3c3816c410c6b570c6ef9aa656d4e1f9da2e Author: Pavan Deolasee <pav...@gm...> Date: Fri Jun 24 13:44:20 2011 +0530 Change the recovery_barrier_id parameter in recovery.conf to recovery_target_barrier to be consistent with other names. Add a sample "recovery_target_barrier" to recovery.conf.sample diff --git a/src/backend/access/transam/recovery.conf.sample b/src/backend/access/transam/recovery.conf.sample index 722c7d6..855c113 100644 --- a/src/backend/access/transam/recovery.conf.sample +++ b/src/backend/access/transam/recovery.conf.sample @@ -68,15 +68,18 @@ # If you want to stop rollforward at a specific point, you # must set a recovery target. # -# You may set a recovery target either by transactionId, or -# by timestamp. Recovery may either include or exclude the -# transaction(s) with the recovery target value (ie, stop either -# just after or just before the given target, respectively). +# You may set a recovery target either by transactionId, +# by timestamp or by barrier id. 
Recovery may either include or exclude the +# transaction(s) with the recovery target value in case of timestamp or +# transactionId (ie, stop either just after or just before the given target, +# respectively). In case of barrier, the recovery stops exactly at that point # #recovery_target_time = '' # e.g. '2004-07-14 22:39:00 EST' # #recovery_target_xid = '' # +#recovery_target_barrier = '' +# #recovery_target_inclusive = 'true' # # diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index ebbe6f0..71ee729 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -5237,6 +5237,13 @@ readRecoveryCommandFile(void) (errmsg("recovery_target_time = '%s'", timestamptz_to_str(recoveryTargetTime)))); } +#ifdef PGXC + else if (strcmp(tok1, "recovery_target_barrier") == 0) + { + recoveryTarget = RECOVERY_TARGET_BARRIER; + recoveryTargetBarrierId = pstrdup(tok2); + } +#endif else if (strcmp(tok1, "recovery_target_inclusive") == 0) { /* @@ -5249,13 +5256,6 @@ readRecoveryCommandFile(void) ereport(DEBUG2, (errmsg("recovery_target_inclusive = %s", tok2))); } -#ifdef PGXC - else if (strcmp(tok1, "recovery_barrier_id") == 0) - { - recoveryTarget = RECOVERY_TARGET_BARRIER; - recoveryTargetBarrierId = pstrdup(tok2); - } -#endif else if (strcmp(tok1, "standby_mode") == 0) { if (!parse_bool(tok2, &StandbyMode)) @@ -5279,13 +5279,6 @@ readRecoveryCommandFile(void) (errmsg("trigger_file = '%s'", TriggerFile))); } -#ifdef PGXC - else if (strcmp(tok1, "recovery_barrier_id") == 0) - { - recoveryTarget = true; - recoveryTargetBarrierId = pstrdup(tok2); - } -#endif else ereport(FATAL, (errmsg("unrecognized recovery parameter \"%s\"", commit bc8d2c2e0127a90a7f5b01eb3a9be2673c2b4c04 Author: Pavan Deolasee <pav...@gm...> Date: Thu Jun 23 13:28:10 2011 +0530 If the barrier id not specified in the CREATE BARRIER command, auto-generate a barrier id based on node_id and the current timestamp of the coordinator diff --git 
a/src/backend/pgxc/barrier/barrier.c b/src/backend/pgxc/barrier/barrier.c index 32ff484..9512cbc 100644 --- a/src/backend/pgxc/barrier/barrier.c +++ b/src/backend/pgxc/barrier/barrier.c @@ -15,6 +15,7 @@ */ #include "postgres.h" +#include "access/gtm.h" #include "libpq/libpq.h" #include "libpq/pqformat.h" #include "pgxc/barrier.h" @@ -139,18 +140,25 @@ ProcessCreateBarrierExecute(const char *id) static const char * generate_barrier_id(const char *id) { + char genid[1024]; + TimestampTz ts; + /* - * TODO If the caller can passed a NULL value, generate an id which is - * guaranteed to be unique across the cluster. We can use a combination of - * the coordinator node id and a timestamp. This may not be complete if we - * support changing coordinator ids without initdb or the system clocks are - * modified. - * - * Another option would be to let the GTM issue globally unique barrier - * IDs (GTM-timestamp based). For the time being, we leave it to the user - * to come up with an unique identifier. + * If the caller can passed a NULL value, generate an id which is + * guaranteed to be unique across the cluster. We use a combination of + * the coordinator node id and current timestamp. */ - return id ? id : pstrdup("dummy_barrier_id"); + + if (id) + return id; + + ts = GetCurrentTimestamp(); +#ifdef HAVE_INT64_TIMESTAMP + sprintf(genid, "%d_"INT64_FORMAT, PGXCNodeId, ts); +#else + sprintf(genid, "%d_%.0f", PGXCNodeId, ts); +#endif + return pstrdup(genid); } static PGXCNodeAllHandles * commit a6b077003d974ba5ab612d557eab811d9efc934b Author: Michael P <mic...@us...> Date: Mon Jun 6 16:35:26 2011 +0900 Correction of spelling mistakes Addition of a couple of compilation flags forgotten. 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 6c93c21..ebbe6f0 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -39,7 +39,9 @@ #include "funcapi.h" #include "libpq/pqsignal.h" #include "miscadmin.h" +#ifdef PGXC #include "pgxc/barrier.h" +#endif #include "pgstat.h" #include "postmaster/bgwriter.h" #include "replication/walreceiver.h" @@ -4371,6 +4373,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, xlogfname, recoveryStopAfter ? "after" : "before", timestamptz_to_str(recoveryStopTime)); +#ifdef PGXC else if (recoveryTarget == RECOVERY_TARGET_BARRIER) snprintf(buffer, sizeof(buffer), "%s%u\t%s\t%s %s\n", @@ -4379,6 +4382,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, xlogfname, recoveryStopAfter ? "after" : "before", recoveryTargetBarrierId); +#endif else snprintf(buffer, sizeof(buffer), "%s%u\t%s\tno recovery target specified\n", @@ -5492,24 +5496,26 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) return false; record_info = record->xl_info & ~XLR_INFO_MASK; +#ifdef PGXC if (record->xl_rmid == RM_XACT_ID) { - if (record_info == XLOG_XACT_COMMIT) - { - xl_xact_commit *recordXactCommitData; +#endif + if (record_info == XLOG_XACT_COMMIT) + { + xl_xact_commit *recordXactCommitData; - recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record); - recordXtime = recordXactCommitData->xact_time; - } - else if (record_info == XLOG_XACT_ABORT) - { - xl_xact_abort *recordXactAbortData; + recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record); + recordXtime = recordXactCommitData->xact_time; + } + else if (record_info == XLOG_XACT_ABORT) + { + xl_xact_abort *recordXactAbortData; - recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); - recordXtime = recordXactAbortData->xact_time; - } + recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); + recordXtime = recordXactAbortData->xact_time; } #ifdef PGXC + } /* end 
if (record->xl_rmid == RM_XACT_ID) */ else if (record->xl_rmid == RM_BARRIER_ID) { if (record_info == XLOG_BARRIER_CREATE) @@ -5883,10 +5889,12 @@ StartupXLOG(void) ereport(LOG, (errmsg("starting point-in-time recovery to %s", timestamptz_to_str(recoveryTargetTime)))); +#ifdef PGXC else if (recoveryTarget == RECOVERY_TARGET_BARRIER) ereport(LOG, (errmsg("starting point-in-time recovery to barrier %s", (recoveryTargetBarrierId)))); +#endif else ereport(LOG, (errmsg("starting archive recovery"))); diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index d1dad1c..01b0f51 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2362,6 +2362,9 @@ _equalValue(Value *a, Value *b) } #ifdef PGXC +/* + * stuff from barrier.h + */ static bool _equalBarrierStmt(BarrierStmt *a, BarrierStmt *b) diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 57a5c2b..94b2cd8 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -7008,7 +7008,6 @@ opt_barrier_id: $$ = NULL; } ; - /* PGXC_END */ /***************************************************************************** diff --git a/src/backend/pgxc/barrier/Makefile b/src/backend/pgxc/barrier/Makefile index d80bbec..9505889 100644 --- a/src/backend/pgxc/barrier/Makefile +++ b/src/backend/pgxc/barrier/Makefile @@ -1,7 +1,7 @@ #------------------------------------------------------------------------- # # Makefile-- -# Makefile for pool +# Makefile for barrier # # Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation # diff --git a/src/backend/pgxc/barrier/barrier.c b/src/backend/pgxc/barrier/barrier.c index 1b44f36..32ff484 100644 --- a/src/backend/pgxc/barrier/barrier.c +++ b/src/backend/pgxc/barrier/barrier.c @@ -44,7 +44,7 @@ extern void ProcessCreateBarrierExecute(const char *id); * while all other backend starting a 2PC will grab the lock in shared * mode. 
So as long as we hold the exclusive lock, no other backend start a * new 2PC and there can not be any 2PC in-progress. This technique would - * rely on assumption that an exclsuive lock requester is not starved by + * rely on assumption that an exclusive lock requester is not starved by * share lock requesters. * * Note: To ensure that the 2PC are not blocked for a long time, we should @@ -76,7 +76,7 @@ ProcessCreateBarrierPrepare(const char *id) } /* - * Mark the completetion of an on-going barrier. We must have remembered the + * Mark the completion of an on-going barrier. We must have remembered the * barrier ID when we received the CREATE BARRIER PREPARE command */ void @@ -103,7 +103,7 @@ ProcessCreateBarrierEnd(const char *id) } /* - * Execute the CREATE BARRIER comamnd. Write a BARRIER WAL record and flush the + * Execute the CREATE BARRIER command. Write a BARRIER WAL record and flush the * WAL buffers to disk before returning to the caller. Writing the WAL record * does not guarantee successful completion of the barrier command. */ @@ -140,15 +140,15 @@ static const char * generate_barrier_id(const char *id) { /* - * TODO If the caller can passeed a NULL value, generate an id which is + * TODO If the caller can passed a NULL value, generate an id which is * guaranteed to be unique across the cluster. We can use a combination of * the coordinator node id and a timestamp. This may not be complete if we * support changing coordinator ids without initdb or the system clocks are * modified. * * Another option would be to let the GTM issue globally unique barrier - * IDs. For the time being, we leave it to the user to come up with an - * unique identifier + * IDs (GTM-timestamp based). For the time being, we leave it to the user + * to come up with an unique identifier. */ return id ? 
id : pstrdup("dummy_barrier_id"); } @@ -326,7 +326,7 @@ PrepareBarrier(const char *id) */ LWLockAcquire(BarrierLock, LW_EXCLUSIVE); - elog(DEBUG2, "Disabled 2PC commits origniating at the diriving coordinator"); + elog(DEBUG2, "Disabled 2PC commits originating at the driving coordinator"); /* * TODO Start a timer to cancel the barrier request in case of a timeout @@ -375,7 +375,7 @@ ExecuteBarrier(const char *id) if (handle->state != DN_CONNECTION_STATE_IDLE) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send CREATE BARRIER PREPARE request " + errmsg("Failed to send CREATE BARRIER EXECUTE request " "to the node"))); barrier_idlen = strlen(id) + 1; diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 441a625..f8a5c17 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -1931,7 +1931,7 @@ PGXCNodeImplicitCommitPrepared(GlobalTransactionId prepare_xid, * We should acquire the BarrierLock in SHARE mode here to ensure that * there are no in-progress barrier at this point. This mechanism would * work as long as LWLock mechanism does not starve a EXCLUSIVE lock - * requesster + * requester */ LWLockAcquire(BarrierLock, LW_SHARED); res = pgxc_node_implicit_commit_prepared(prepare_xid, commit_xid, diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index d12e81b..ad3eb2e 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -489,7 +489,7 @@ standard_ProcessUtility(Node *parsetree, * * XXX We call FinishPreparedTransaction inside * PGXCNodeCommitPrepared if we are doing a local - * operation. This is convinient because we want to + * operation. This is convenient because we want to * hold on to the BarrierLock until local transaction * is committed too. 
* diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index f276a1f..591fb53 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -184,8 +184,11 @@ typedef enum { RECOVERY_TARGET_UNSET, RECOVERY_TARGET_XID, - RECOVERY_TARGET_TIME, + RECOVERY_TARGET_TIME +#ifdef PGXC + , RECOVERY_TARGET_BARRIER +#endif } RecoveryTargetType; extern XLogRecPtr XactLastRecEnd; diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 7ba9208..22b3b75 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2302,7 +2302,6 @@ typedef struct BarrierStmt NodeTag type; const char *id; /* User supplied barrier id, if any */ } BarrierStmt; - #endif /* ---------------------- commit ad889ead370a7061b9fa57d3b8ce8816b8c251f4 Author: Pavan Deolasee <pav...@gm...> Date: Thu May 5 17:00:04 2011 +0530 Stop at the appropriate berrier record and set the last time correctly diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 6800e68..6c93c21 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -5548,14 +5548,16 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) #ifdef PGXC else if (recoveryTarget == RECOVERY_TARGET_BARRIER) { - if ((record->xl_rmid != RM_BARRIER_ID) || - (record_info != XLOG_BARRIER_CREATE)) - return false; - - ereport(DEBUG2, - (errmsg("checking if barrier record matches the target barrier"))); - if (strcmp(recoveryTargetBarrierId, recordBarrierId) == 0) - stopsAtThisBarrier = true; + stopsHere = false; + if ((record->xl_rmid == RM_BARRIER_ID) && + (record_info == XLOG_BARRIER_CREATE)) + { + ereport(DEBUG2, + (errmsg("checking if barrier record matches the target " + "barrier"))); + if (strcmp(recoveryTargetBarrierId, recordBarrierId) == 0) + stopsAtThisBarrier = true; + } } #endif else commit b72426e3b1c0c13cd710781a1ff6fd65a96e82d8 Author: Pavan Deolasee <pav...@gm...> Date: Wed May 4 16:43:02 2011 +0530 WAL log 
the barrier creation activity on the local coordinator. Also fix some of the bugs in the recovery code. This code was not tasted previously and there were some changes after the 9.0 merge diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 2a465e3..6800e68 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4371,6 +4371,14 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, xlogfname, recoveryStopAfter ? "after" : "before", timestamptz_to_str(recoveryStopTime)); + else if (recoveryTarget == RECOVERY_TARGET_BARRIER) + snprintf(buffer, sizeof(buffer), + "%s%u\t%s\t%s %s\n", + (srcfd < 0) ? "" : "\n", + parentTLI, + xlogfname, + recoveryStopAfter ? "after" : "before", + recoveryTargetBarrierId); else snprintf(buffer, sizeof(buffer), "%s%u\t%s\tno recovery target specified\n", @@ -5240,7 +5248,7 @@ readRecoveryCommandFile(void) #ifdef PGXC else if (strcmp(tok1, "recovery_barrier_id") == 0) { - recoveryTarget = true; + recoveryTarget = RECOVERY_TARGET_BARRIER; recoveryTargetBarrierId = pstrdup(tok2); } #endif @@ -5468,7 +5476,7 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) { bool stopsHere; #ifdef PGXC - bool stopsAtThisBarrier; + bool stopsAtThisBarrier = false; char *recordBarrierId; #endif uint8 record_info; @@ -5482,25 +5490,34 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) if (record->xl_rmid != RM_XACT_ID) #endif return false; + record_info = record->xl_info & ~XLR_INFO_MASK; - if (record_info == XLOG_XACT_COMMIT) + if (record->xl_rmid == RM_XACT_ID) { - xl_xact_commit *recordXactCommitData; + if (record_info == XLOG_XACT_COMMIT) + { + xl_xact_commit *recordXactCommitData; - recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record); - recordXtime = recordXactCommitData->xact_time; - } - else if (record_info == XLOG_XACT_ABORT) - { - xl_xact_abort *recordXactAbortData; + recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record); + 
recordXtime = recordXactCommitData->xact_time; + } + else if (record_info == XLOG_XACT_ABORT) + { + xl_xact_abort *recordXactAbortData; - recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); - recordXtime = recordXactAbortData->xact_time; + recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); + recordXtime = recordXactAbortData->xact_time; + } } #ifdef PGXC - else if (record_info == XLOG_BARRIER_CREATE) + else if (record->xl_rmid == RM_BARRIER_ID) { - recordBarrierId = (char *) XLogRecGetData(record); + if (record_info == XLOG_BARRIER_CREATE) + { + recordBarrierId = (char *) XLogRecGetData(record); + ereport(DEBUG2, + (errmsg("processing barrier xlog record for %s", recordBarrierId))); + } } #endif else @@ -5529,8 +5546,14 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) *includeThis = recoveryTargetInclusive; } #ifdef PGXC - else if (recoveryTargetBarrierId) + else if (recoveryTarget == RECOVERY_TARGET_BARRIER) { + if ((record->xl_rmid != RM_BARRIER_ID) || + (record_info != XLOG_BARRIER_CREATE)) + return false; + + ereport(DEBUG2, + (errmsg("checking if barrier record matches the target barrier"))); if (strcmp(recoveryTargetBarrierId, recordBarrierId) == 0) stopsAtThisBarrier = true; } @@ -5858,6 +5881,10 @@ StartupXLOG(void) ereport(LOG, (errmsg("starting point-in-time recovery to %s", timestamptz_to_str(recoveryTargetTime)))); + else if (recoveryTarget == RECOVERY_TARGET_BARRIER) + ereport(LOG, + (errmsg("starting point-in-time recovery to barrier %s", + (recoveryTargetBarrierId)))); else ereport(LOG, (errmsg("starting archive recovery"))); diff --git a/src/backend/pgxc/barrier/barrier.c b/src/backend/pgxc/barrier/barrier.c index 3e1d7cc..1b44f36 100644 --- a/src/backend/pgxc/barrier/barrier.c +++ b/src/backend/pgxc/barrier/barrier.c @@ -414,6 +414,18 @@ ExecuteBarrier(const char *id) /* * Also WAL log the BARRIER locally and flush the WAL buffers to disk */ + { + XLogRecData rdata[1]; + XLogRecPtr recptr; + + rdata[0].data = 
(char *) id; + rdata[0].len = strlen(id) + 1; + rdata[0].buffer = InvalidBuffer; + rdata[0].next = NULL; + + recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE, rdata); + XLogFlush(recptr); + } } /* diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 27e7f40..f276a1f 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -184,7 +184,8 @@ typedef enum { RECOVERY_TARGET_UNSET, RECOVERY_TARGET_XID, - RECOVERY_TARGET_TIME + RECOVERY_TARGET_TIME, + RECOVERY_TARGET_BARRIER } RecoveryTargetType; extern XLogRecPtr XactLastRecEnd; commit 9db03183fe491e60dda1a6a5b36b44c55149e077 Author: Pavan Deolasee <pav...@gm...> Date: Tue Apr 26 19:54:09 2011 +0530 Rearrange the 2PC commit code so that we can commit the local transaction after releasing the barrier lock. diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 5ee876d..441a625 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -2035,7 +2035,7 @@ finish: * This avoid to have any additional interaction with GTM when making a 2PC transaction. */ void -PGXCNodeCommitPrepared(char *gid, bool isTopLevel) +PGXCNodeCommitPrepared(char *gid) { int res = 0; int res_gtm = 0; @@ -2136,17 +2136,11 @@ finish: * If remote connection is a Coordinator type, the commit prepared has to be done locally * if and only if the Coordinator number was in the node list received from GTM. */ - if (operation_local || IsConnFromCoord()) - { - PreventTransactionChain(isTopLevel, "COMMIT PREPARED"); + if (operation_local) FinishPreparedTransaction(gid, true); - } - /* - * Release the barrier lock now so that pending barriers can get moving - */ LWLockRelease(BarrierLock); - return; + return; } /* @@ -2191,9 +2185,11 @@ finish: /* * Rollback prepared transaction on Datanodes involved in the current transaction + * + * Return whether or not a local operation required. 
*/ -void -PGXCNodeRollbackPrepared(char *gid, bool isTopLevel) +bool +PGXCNodeRollbackPrepared(char *gid) { int res = 0; int res_gtm = 0; @@ -2273,17 +2269,7 @@ finish: (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not rollback prepared transaction on Datanodes"))); - /* - * Local coordinator rollbacks if involved in PREPARE - * If remote connection is a Coordinator type, the commit prepared has to be done locally also. - * This works for both Datanodes and Coordinators. - */ - if (operation_local || IsConnFromCoord()) - { - PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED"); - FinishPreparedTransaction(gid, false); - } - return; + return operation_local; } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index bc6c630..d12e81b 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -59,6 +59,7 @@ #ifdef PGXC #include "pgxc/barrier.h" +#include "pgxc/execRemote.h" #include "pgxc/locator.h" #include "pgxc/pgxc.h" #include "pgxc/planner.h" @@ -479,32 +480,58 @@ standard_ProcessUtility(Node *parsetree, break; case TRANS_STMT_COMMIT_PREPARED: + PreventTransactionChain(isTopLevel, "COMMIT PREPARED"); + PreventCommandDuringRecovery("COMMIT PREPARED"); #ifdef PGXC /* * If a COMMIT PREPARED message is received from another Coordinator, * Don't send it down to Datanodes. + * + * XXX We call FinishPreparedTransaction inside + * PGXCNodeCommitPrepared if we are doing a local + * operation. This is convinient because we want to + * hold on to the BarrierLock until local transaction + * is committed too. + * */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) - PGXCNodeCommitPrepared(stmt->gid, isTopLevel); -#else - PreventTransactionChain(isTopLevel, "COMMIT PREPARED"); - PreventCommandDuringRecovery("COMMIT PREPARED"); + PGXCNodeCommitPrepared(stmt->gid); + else if (IsConnFromCoord()) + { + /* + * A local Coordinator always commits if involved in Prepare. 
+ * 2PC file is created and flushed if a DDL has been involved in the transaction. + * If remote connection is a Coordinator type, the commit prepared has to be done locally + * if and only if the Coordinator number was in the node list received from GTM. + */ +#endif FinishPreparedTransaction(stmt->gid, true); +#ifdef PGXC + } #endif break; case TRANS_STMT_ROLLBACK_PREPARED: + PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED"); + PreventCommandDuringRecovery("ROLLBACK PREPARED"); #ifdef PGXC /* * If a ROLLBACK PREPARED message is received from another Coordinator, * Don't send it down to Datanodes. */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) - PGXCNodeRollbackPrepared(stmt->gid, isTopLevel); -#else - PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED"); - PreventCommandDuringRecovery("ROLLBACK PREPARED"); - FinishPreparedTransaction(gid, false); + operation_local = PGXCNodeRollbackPrepared(stmt->gid); + /* + * Local coordinator rollbacks if involved in PREPARE + * If remote connection is a Coordinator type, the commit prepared has to be done locally also. + * This works for both Datanodes and Coordinators. 
+ */ + if (operation_local || IsConnFromCoord()) + { +#endif + FinishPreparedTransaction(stmt->gid, false); +#ifdef PGXC + } #endif break; diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 9765632..983a126 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -128,8 +128,8 @@ extern void PGXCNodeSetBeginQuery(char *query_string); extern void PGXCNodeCommit(bool bReleaseHandles); extern int PGXCNodeRollback(void); extern bool PGXCNodePrepare(char *gid); -extern void PGXCNodeRollbackPrepared(char *gid, bool isTopLevel); -extern void PGXCNodeCommitPrepared(char *gid, bool isTopLevel); +extern bool PGXCNodeRollbackPrepared(char *gid); +extern void PGXCNodeCommitPrepared(char *gid); extern bool PGXCNodeIsImplicit2PC(bool *prepare_local_coord); extern int PGXCNodeImplicitPrepare(GlobalTransactionId prepare_xid, char *gid); extern void PGXCNodeImplicitCommitPrepared(GlobalTransactionId prepare_xid, commit 3cbf503a660e19f5c48c57d3ecd4a746a468cd68 Author: Pavan Deolasee <pav...@gm...> Date: Mon Apr 25 17:22:01 2011 +0530 Merge branch 'PGXC-master' into pgxc-barrier Conflicts: src/backend/access/transam/xlog.c src/backend/parser/gram.y src/backend/pgxc/pool/execRemote.c src/backend/tcop/utility.c diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 368cd69..2a465e3 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -5237,6 +5237,13 @@ readRecoveryCommandFile(void) ereport(DEBUG2, (errmsg("recovery_target_inclusive = %s", tok2))); } +#ifdef PGXC + else if (strcmp(tok1, "recovery_barrier_id") == 0) + { + recoveryTarget = true; + recoveryTargetBarrierId = pstrdup(tok2); + } +#endif else if (strcmp(tok1, "standby_mode") == 0) { if (!parse_bool(tok2, &StandbyMode)) diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 7ec9491..57a5c2b 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -472,7 +472,8 @@ 
static TypeName *TableFuncTypeName(List *columns); */ /* ordinary key words in alphabetical order */ -/* PGXC - added REPLICATION, DISTRIBUTE, MODULO, BARRIER and HASH */ +/* PGXC - added DISTRIBUTE, DIRECT, HASH, REPLICATION, ROUND ROBIN, + * COORDINATOR, CLEAN, MODULO, NODE, BARRIER */ %token <keyword> ABORT_P ABSOLUTE_P ACCESS ACTION ADD_P ADMIN AFTER AGGREGATE ALL ALSO ALTER ALWAYS ANALYSE ANALYZE AND ANY ARRAY AS ASC ASSERTION ASSIGNMENT ASYMMETRIC AT AUTHORIZATION @@ -11022,7 +11023,8 @@ ColLabel: IDENT { $$ = $1; } /* "Unreserved" keywords --- available for use as any kind of name. */ -/* PGXC - added DISTRIBUTE, HASH, REPLICATION, MODULO, BARRIER */ +/* PGXC - added DISTRIBUTE, DIRECT, HASH, REPLICATION, ROUND ROBIN, + * COORDINATOR, CLEAN, MODULO, NODE, BARRIER */ unreserved_keyword: ABORT_P | ABSOLUTE_P commit 6bbdc5b5befa3ef1f6fbb7a5548b8aa7891873d6 Author: Pavan Deolasee <pav...@gm...> Date: Mon Apr 18 13:41:47 2011 +0530 Add synchrnization at the commit time diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index c297003..5ee876d 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -17,6 +17,7 @@ #include <time.h> #include "postgres.h" +#include "access/twophase.h" #include "access/gtm.h" #include "access/xact.h" #include "catalog/pg_type.h" @@ -1924,9 +1925,23 @@ PGXCNodeImplicitCommitPrepared(GlobalTransactionId prepare_xid, goto finish; } + /* + * Barrier: + * + * We should acquire the BarrierLock in SHARE mode here to ensure that + * there are no in-progress barrier at this point. This mechanism would + * work as long as LWLock mechanism does not starve a EXCLUSIVE lock + * requesster + */ + LWLockAcquire(BarrierLock, LW_SHARED); res = pgxc_node_implicit_commit_prepared(prepare_xid, commit_xid, pgxc_connections, gid, is_commit); + /* + * Release the BarrierLock. 
+ */ + LWLockRelease(BarrierLock); + finish: /* Clear nodes, signals are clear */ if (!autocommit) @@ -2019,8 +2034,8 @@ finish: * or not but send the message to all of them. * This avoid to have any additional interaction with GTM when making a 2PC transaction. */ -bool -PGXCNodeCommitPrepared(char *gid) +void +PGXCNodeCommitPrepared(char *gid, bool isTopLevel) { int res = 0; int res_gtm = 0; @@ -2070,7 +2085,15 @@ PGXCNodeCommitPrepared(char *gid) /* * Commit here the prepared transaction to all Datanodes and Coordinators * If necessary, local Coordinator Commit is performed after this DataNodeCommitPrepared. + * + * BARRIER: + * + * Take the BarrierLock in SHARE mode to synchronize on in-progress + * barriers. We should hold on to the lock until the local prepared + * transaction is also committed */ + LWLockAcquire(BarrierLock, LW_SHARED); + res = pgxc_node_commit_prepared(gxid, prepared_gxid, pgxc_handles, gid); finish: @@ -2096,6 +2119,7 @@ finish: free(coordinators); pfree_pgxc_all_handles(pgxc_handles); + if (res_gtm < 0) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), @@ -2106,7 +2130,23 @@ finish: (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not commit prepared transaction on data nodes"))); - return operation_local; + /* + * A local Coordinator always commits if involved in Prepare. + * 2PC file is created and flushed if a DDL has been involved in the transaction. + * If remote connection is a Coordinator type, the commit prepared has to be done locally + * if and only if the Coordinator number was in the node list received from GTM. 
+ */ + if (operation_local || IsConnFromCoord()) + { + PreventTransactionChain(isTopLevel, "COMMIT PREPARED"); + FinishPreparedTransaction(gid, true); + } + + /* + * Release the barrier lock now so that pending barriers can get moving + */ + LWLockRelease(BarrierLock); + return; } /* @@ -2151,11 +2191,9 @@ finish: /* * Rollback prepared transaction on Datanodes involved in the current transaction - * - * Return whether or not a local operation required. */ -bool -PGXCNodeRollbackPrepared(char *gid) +void +PGXCNodeRollbackPrepared(char *gid, bool isTopLevel) { int res = 0; int res_gtm = 0; @@ -2235,7 +2273,17 @@ finish: (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not rollback prepared transaction on Datanodes"))); - return operation_local; + /* + * Local coordinator rollbacks if involved in PREPARE + * If remote connection is a Coordinator type, the commit prepared has to be done locally also. + * This works for both Datanodes and Coordinators. + */ + if (operation_local || IsConnFromCoord()) + { + PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED"); + FinishPreparedTransaction(gid, false); + } + return; } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index d8f697e..bc6c630 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -485,23 +485,11 @@ standard_ProcessUtility(Node *parsetree, * Don't send it down to Datanodes. */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) - operation_local = PGXCNodeCommitPrepared(stmt->gid); -#endif + PGXCNodeCommitPrepared(stmt->gid, isTopLevel); +#else PreventTransactionChain(isTopLevel, "COMMIT PREPARED"); PreventCommandDuringRecovery("COMMIT PREPARED"); -#ifdef PGXC - /* - * A local Coordinator always commits if involved in Prepare. - * 2PC file is created and flushed if a DDL has been involved in the transaction. 
- * If remote connection is a Coordinator type, the commit prepared has to be done locally - * if and only if the Coordinator number was in the node list received from GTM. - */ - if (operation_local || IsConnFromCoord()) - { -#endif FinishPreparedTransaction(stmt->gid, true); -#ifdef PGXC - } #endif break; @@ -512,22 +500,11 @@ standard_ProcessUtility(Node *parsetree, * Don't send it down to Datanodes. */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) - operation_local = PGXCNodeRollbackPrepared(stmt->gid); -#endif + PGXCNodeRollbackPrepared(stmt->gid, isTopLevel); +#else PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED"); PreventCommandDuringRecovery("ROLLBACK PREPARED"); -#ifdef PGXC - /* - * Local coordinator rollbacks if involved in PREPARE - * If remote connection is a Coordinator type, the commit prepared has to be done locally also. - * This works for both Datanodes and Coordinators. - */ - if (operation_local || IsConnFromCoord()) - { -#endif - FinishPreparedTransaction(stmt->gid, false); -#ifdef PGXC - } + FinishPreparedTransaction(gid, false); #endif break; diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 076100b..9765632 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -128,8 +128,8 @@ extern void PGXCNodeSetBeginQuery(char *query_string); extern void PGXCNodeCommit(bool bReleaseHandles); extern int PGXCNodeRollback(void); extern bool PGXCNodePrepare(char *gid); -extern bool PGXCNodeRollbackPrepared(char *gid); -extern bool PGXCNodeCommitPrepared(char *gid); +extern void PGXCNodeRollbackPrepared(char *gid, bool isTopLevel); +extern void PGXCNodeCommitPrepared(char *gid, bool isTopLevel); extern bool PGXCNodeIsImplicit2PC(bool *prepare_local_coord); extern int PGXCNodeImplicitPrepare(GlobalTransactionId prepare_xid, char *gid); extern void PGXCNodeImplicitCommitPrepared(GlobalTransactionId prepare_xid, commit 246072c6301bf3e38331ee49e4ff9bd4bd42b9a4 Author: Pavan Deolasee 
<pav...@gm...> Date: Tue Mar 8 16:45:12 2011 +0530 First cut implementation of BARRIER for PITR and global consistent recovery diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 8038b25..d989a59 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -20,11 +20,13 @@ #include "commands/dbcommands.h" #include "commands/sequence.h" #include "commands/tablespace.h" +#ifdef PGXC +#include "pgxc/barrier.h" +#endif #include "storage/freespace.h" #include "storage/standby.h" #include "utils/relmapper.h" - const RmgrData RmgrTable[RM_MAX_ID + 1] = { {"XLOG", xlog_redo, xlog_desc, NULL, NULL, NULL}, {"Transaction", xact_redo, xact_desc, NULL, NULL, NULL}, @@ -42,4 +44,8 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = { {"Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, gin_safe_restartpoint}, {"Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, gist_safe_restartpoint}, {"Sequence", seq_redo, seq_desc, NULL, NULL, NULL} +#ifdef PGXC + , + {"Barrier", barrier_redo, barrier_desc, NULL, NULL, NULL} +#endif }; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 7fbccc5..368cd69 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -39,6 +39,7 @@ #include "funcapi.h" #include "libpq/pqsignal.h" #include "miscadmin.h" +#include "pgxc/barrier.h" #include "pgstat.h" #include "postmaster/bgwriter.h" #include "replication/walreceiver.h" @@ -184,6 +185,7 @@ static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET; static bool recoveryTargetInclusive = true; static TransactionId recoveryTargetXid; static TimestampTz recoveryTargetTime; +static char *recoveryTargetBarrierId; /* options taken from recovery.conf for XLOG streaming */ static bool StandbyMode = false; @@ -5258,6 +5260,13 @@ readRecoveryCommandFile(void) (errmsg("trigger_file = '%s'", TriggerFile))); } +#ifdef PGXC + else if (strcmp(tok1, 
"recovery_barrier_id") == 0) + { + recoveryTarget = true; + recoveryTargetBarrierId = pstrdup(tok2); + } +#endif else ereport(FATAL, (errmsg("unrecognized recovery parameter \"%s\"", @@ -5451,11 +5460,20 @@ static bool recoveryStopsHere(XLogRecord *record, bool *includeThis) { bool stopsHere; +#ifdef PGXC + bool stopsAtThisBarrier; + char *recordBarrierId; +#endif uint8 record_info; TimestampTz recordXtime; +#ifdef PGXC + /* We only consider stoppping at COMMIT, ABORT or BARRIER records */ + if ((record->xl_rmid != RM_XACT_ID) && (record->xl_rmid != RM_BARRIER_ID)) +#else /* We only consider stopping at COMMIT or ABORT records */ if (record->xl_rmid != RM_XACT_ID) +#endif return false; record_info = record->xl_info & ~XLR_INFO_MASK; if (record_info == XLOG_XACT_COMMIT) @@ -5472,6 +5490,12 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); recordXtime = recordXactAbortData->xact_time; } +#ifdef PGXC + else if (record_info == XLOG_BARRIER_CREATE) + { + recordBarrierId = (char *) XLogRecGetData(record); + } +#endif else return false; @@ -5497,6 +5521,13 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) if (stopsHere) *includeThis = recoveryTargetInclusive; } +#ifdef PGXC + else if (recoveryTargetBarrierId) + { + if (strcmp(recoveryTargetBarrierId, recordBarrierId) == 0) + stopsAtThisBarrier = true; + } +#endif else { /* @@ -5548,6 +5579,17 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) if (recoveryStopAfter) SetLatestXTime(recordXtime); } +#ifdef PGXC + else if (stopsAtThisBarrier) + { + recoveryStopTime = recordXtime; + ereport(LOG, + (errmsg("recovery stopping at barrier %s, time %s", + recoveryTargetBarrierId, + timestamptz_to_str(recoveryStopTime)))); + return true; + } +#endif else SetLatestXTime(recordXtime); diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 2c7fee4..c9581e1 100644 --- a/src/backend/nodes/copyfuncs.c +++ 
b/src/backend/nodes/copyfuncs.c @@ -3726,6 +3726,18 @@ _copyValue(Value *from) return newnode; } +#ifdef PGXC +static BarrierStmt * +_copyBarrierStmt(BarrierStmt *from) +{ + BarrierStmt *newnode = makeNode(BarrierStmt); + + COPY_STRING_FIELD(id); + + return newnode; +} +#endif + /* * copyObject * @@ -4307,6 +4319,11 @@ copyObject(void *from) case T_CheckPointStmt: retval = (void *) makeNode(CheckPointStmt); break; +#ifdef PGXC + case T_BarrierStmt: + retval = _copyBarrierStmt(from); + break; +#endif case T_CreateSchemaStmt: retval = _copyCreateSchemaStmt(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index c5b46bb..d1dad1c 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2361,6 +2361,16 @@ _equalValue(Value *a, Value *b) return true; } +#ifdef PGXC + +static bool +_equalBarrierStmt(BarrierStmt *a, BarrierStmt *b) +{ + COMPARE_STRING_FIELD(id); + return true; +} +#endif + /* * equal * returns whether two nodes are equal @@ -2811,6 +2821,11 @@ equal(void *a, void *b) case T_CheckPointStmt: retval = true; break; +#ifdef PGXC + case T_BarrierStmt: + retval = _equalBarrierStmt(a, b); + break; +#endif case T_CreateSchemaStmt: retval = _equalCreateSchemaStmt(a, b); break; diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 8ac6002..7ec9491 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -216,6 +216,7 @@ static TypeName *TableFuncTypeName(List *columns); DeallocateStmt PrepareStmt ExecuteStmt DropOwnedStmt ReassignOwnedStmt AlterTSConfigurationStmt AlterTSDictionaryStmt + BarrierStmt %type <node> select_no_parens select_with_parens select_clause simple_select values_clause @@ -445,6 +446,7 @@ static TypeName *TableFuncTypeName(List *columns); opt_frame_clause frame_extent frame_bound %type <str> opt_existing_window_name /* PGXC_BEGIN */ +%type <str> opt_barrier_id %type <distby> OptDistributeBy /* PGXC_END */ @@ -470,12 +472,12 @@ static TypeName 
*TableFuncTypeName(List *columns); */ /* ordinary key words in alphabetical order */ -/* PGXC - added REPLICATION, DISTRIBUTE, MODULO and HASH */ +/* PGXC - added REPLICATION, DISTRIBUTE, MODULO, BARRIER and HASH */ %token <keyword> ABORT_P ABSOLUTE_P ACCESS ACTION ADD_P ADMIN AFTER AGGREGATE ALL ALSO ALTER ALWAYS ANALYSE ANALYZE AND ANY ARRAY AS ASC ASSERTION ASSIGNMENT ASYMMETRIC AT AUTHORIZATION - BACKWARD BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT + BACKWARD BARRIER BEFORE BEGIN_P BETWEEN BIGINT BINARY BIT BOOLEAN_P BOTH BY CACHE CALLED CASCADE CASCADED CASE CAST CATALOG_P CHAIN CHAR_P @@ -683,6 +685,7 @@ stmt : | AlterUserSetStmt | AlterUserStmt | AnalyzeStmt + | BarrierStmt | CheckPointStmt | CleanConnStmt | ClosePortalStmt @@ -6985,6 +6988,28 @@ opt_name_list: ; +/* PGXC_BEGIN */ +BarrierStmt: CREATE BARRIER opt_barrier_id + { + BarrierStmt *n = makeNode(BarrierStmt); + n->id = $3; + $$ = (Node *)n; + } + ; + +opt_barrier_id: + Sconst + { + $$ = pstrdup($1); + } + | /* EMPTY */ + { + $$ = NULL; + } + ; + +/* PGXC_END */ + /***************************************************************************** * * QUERY: @@ -10997,7 +11022,7 @@ ColLabel: IDENT { $$ = $1; } /* "Unreserved" keywords --- available for use as any kind of name. */ -/* PGXC - added DISTRIBUTE, HASH, REPLICATION, MODULO */ +/* PGXC - added DISTRIBUTE, HASH, REPLICATION, MODULO, BARRIER */ unreserved_keyword: ABORT_P | ABSOLUTE_P @@ -11014,6 +11039,9 @@ unreserved_keyword: | ASSIGNMENT | AT | BACKWARD +/* PGXC_BEGIN */ + | BARRIER +/* PGXC_END */ | BEFORE | BEGIN_P | BY diff --git a/src/backend/pgxc/Makefile b/src/backend/pgxc/Makefile index eecac20..ad6bb64 100644 --- a/src/backend/pgxc/Makefile +++ b/src/backend/pgxc/Makefile @@ -11,6 +11,6 @@ subdir = src/backend/pgxc top_builddir = ../../.. 
include $(top_builddir)/src/Makefile.global -SUBDIRS = locator plan pool +SUBDIRS = locator plan pool barrier include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/barrier/Makefile b/src/backend/pgxc/barrier/Makefile new file mode 100644 index 0000000..d80bbec --- /dev/null +++ b/src/backend/pgxc/barrier/Makefile @@ -0,0 +1,19 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for pool +# +# Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation +# +# IDENTIFICATION +# $PostgreSQL$ +# +#------------------------------------------------------------------------- + +subdir = src/backend/pgxc/barrier +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = barrier.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/pgxc/barrier/barrier.c b/src/backend/pgxc/barrier/barrier.c new file mode 100644 index 0000000..3e1d7cc --- /dev/null +++ b/src/backend/pgxc/barrier/barrier.c @@ -0,0 +1,493 @@ +/*------------------------------------------------------------------------- + * + * barrier.c + * + * Barrier handling for PITR + * + * + * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group + * Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation + * + * IDENTIFICATION + * $$ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "libpq/libpq.h" +#include "libpq/pqformat.h" +#include "pgxc/barrier.h" +#include "pgxc/execRemote.h" +#include "pgxc/locator.h" +#include "pgxc/pgxc.h" +#include "pgxc/pgxcnode.h" +#include "storage/lwlock.h" +#include "tcop/dest.h" + +static const char *generate_barrier_id(const char *id); +static PGXCNodeAllHandles *PrepareBarrier(const char *id); +static void ExecuteBarrier(const char *id); +static void EndBarrier(PGXCNodeAllHandles *handles, const char *id); + +extern void 
ProcessCreateBarrierPrepare(const char *id); +extern void ProcessCreateBarrierEnd(const char *id); +extern void ProcessCreateBarrierExecute(const char *id); + +/* + * Prepare ourselves for an incoming BARRIER. We must disable all new 2PC + * commits and let the ongoing commits to finish. We then remember the + * barrier id (so that it can be matched with the final END message) and + * tell the driving coordinator to proceed with the next step. + * + * A simple way to implement this is to grab a lock in an exclusive mode + * while all other backend starting a 2PC will grab the lock in shared + * mode. So as long as we hold the exclusive lock, no other backend start a + * new 2PC and there can not be any 2PC in-progress. This technique would + * rely on assumption that an exclsuive lock requester is not starved by + * share lock requesters. + * + * Note: To ensure that the 2PC are not blocked for a long time, we should + * set a timeout. The lock should be release after the timeout and the + * barrier should be canceled. + */ +void +ProcessCreateBarrierPrepare(const char *id) +{ + StringInfoData buf; + + if (!IS_PGXC_COORDINATOR || !IsConnFromCoord()) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("The CREATE BARRIER PREPARE message is expected to " + "arrive at a coordinator from another coordinator"))); + + LWLockAcquire(BarrierLock, LW_EXCLUSIVE); + + pq_beginmessage(&buf, 'b'); + pq_sendstring(&buf, id); + pq_endmessage(&buf); + pq_flush(); + + /* + * TODO Start a timer to terminate the pending barrier after a specified + * timeout + */ +} + +/* + * Mark the completetion of an on-going barrier. 
We must have remembered the + * barrier ID when we received the CREATE BARRIER PREPARE command + */ +void +ProcessCreateBarrierEnd(const char *id) +{ + StringInfoData buf; + + if (!IS_PGXC_COORDINATOR || !IsConnFromCoord()) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("The CREATE BARRIER END message is expected to " + "arrive at a coordinator from another coordinator"))); + + LWLockRelease(BarrierLock); + + pq_beginmessage(&buf, 'b'); + pq_sendstring(&buf, id); + pq_endmessage(&buf); + pq_flush(); + + /* + * TODO Stop the timer + */ +} + +/* + * Execute the CREATE BARRIER comamnd. Write a BARRIER WAL record and flush the + * WAL buffers to disk before returning to the caller. Writing the WAL record + * does not guarantee successful completion of the barrier command. + */ +void +ProcessCreateBarrierExecute(const char *id) +{ + StringInfoData buf; + + if (!IsConnFromCoord()) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("The CREATE BARRIER EXECUTE message is expected to " + "arrive from a coordinator"))); + { + XLogRecData rdata[1]; + XLogRecPtr recptr; + + rdata[0].data = (char *) id; + rdata[0].len = strlen(id) + 1; + rdata[0].buffer = InvalidBuffer; + rdata[0].next = NULL; + + recptr = XLogInsert(RM_BARRIER_ID, XLOG_BARRIER_CREATE, rdata); + XLogFlush(recptr); + } + + pq_beginmessage(&buf, 'b'); + pq_sendstring(&buf, id); + pq_endmessage(&buf); + pq_flush(); +} + +static const char * +generate_barrier_id(const char *id) +{ + /* + * TODO If the caller can passeed a NULL value, generate an id which is + * guaranteed to be unique across the cluster. We can use a combination of + * the coordinator node id and a timestamp. This may not be complete if we + * support changing coordinator ids without initdb or the system clocks are + * modified. + * + * Another option would be to let the GTM issue globally unique barrier + * IDs. For the time being, we leave it to the user to come up with an + * unique identifier + */ + return id ? 
id : pstrdup("dummy_barrier_id"); +} + +static PGXCNodeAllHandles * +SendBarrierPrepareRequest(List *coords, const char *id) +{ + PGXCNodeAllHandles *coord_handles; + int conn; + int msglen; + int barrier_idlen; + + coord_handles = get_handles(NIL, coords, true); + + for (conn = 0; conn < coord_handles->co_conn_count; conn++) + { + PGXCNodeHandle *handle = coord_handles->coord_handles[conn]; + + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send CREATE BARRIER PREPARE request " + "to the node"))); + + barrier_idlen = strlen(id) + 1; + + msglen = 4; /* for the length itself */ + msglen += barrier_idlen; + msglen += 1; /* for barrier command itself */ + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Out of memory"))); + } + + handle->outBuffer[handle->outEnd++] = 'b'; + msglen = htonl(msglen); + memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); + handle->outEnd += 4; + + handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_PREPARE; + + memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); + handle->outEnd += barrier_idlen; + + handle->state = DN_CONNECTION_STATE_QUERY; + + pgxc_node_flush(handle); + + /* FIXME Use the right context */ + handle->barrier_id = strdup(id); + } + + return coord_handles; +} + +static void +CheckBarrierCommandStatus(PGXCNodeAllHandles *conn_handles, const char *id, + const char *command) +{ + int conn; + int count = conn_handles->co_conn_count + conn_handles->dn_conn_count; + + elog(DEBUG2, "Check CREATE BARRIER <%s> %s command status", id, command); + + for (conn = 0; conn < count; conn++) + { + PGXCNodeHandle *handle; + + if (conn < conn_handles->co_conn_count) + handle = conn_handles->coord_handles[conn]; + else + handle = conn_handles->datanode_handles[conn - 
conn_handles->co_conn_count]; + + if (pgxc_node_receive(1, &handle, NULL)) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to receive response from the remote side"))); + + if (handle_response(handle, NULL) != RESPONSE_BARRIER_OK) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("CREATE BARRIER PREPARE command failed " + "with error %s", handle->error))); + } + + elog(DEBUG2, "Successfully completed CREATE BARRIER <%s> %s command on " + "all nodes", id, command); +} + +static void +SendBarrierEndRequest(PGXCNodeAllHandles *coord_handles, const char *id) +{ + int conn; + int msglen; + int barrier_idlen; + + elog(DEBUG2, "Sending CREATE BARRIER <%s> END command to all coordinators", id); + + for (conn = 0; conn < coord_handles->co_conn_count; conn++) + { + PGXCNodeHandle *handle = coord_handles->coord_handles[conn]; + + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send CREATE BARRIER PREPARE request " + "to the node"))); + + barrier_idlen = strlen(id) + 1; + + msglen = 4; /* for the length itself */ + msglen += barrier_idlen; + msglen += 1; /* for barrier command itself */ + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Out of memory"))); + } + + handle->outBuffer[handle->outEnd++] = 'b'; + msglen = htonl(msglen); + memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); + handle->outEnd += 4; + + handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_END; + + memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); + handle->outEnd += barrier_idlen; + + handle->state = DN_CONNECTION_STATE_QUERY; + pgxc_node_flush(handle); + + /* FIXME Use the right context */ + handle->barrier_id = strdup(id); + } + +} + +/* + * Prepare all coordinators for barrier. 
During this step all the coordinators + * are informed to suspend any new 2PC transactions. The coordinators should + * disable new 2PC transactions and then wait for the existing transactions to + * complete. Once all "in-flight" 2PC transactions are over, the coordinators + * respond back. + * + * That completes the first step in barrier generation + * + * Any errors will be reported via ereport. + */ +static PGXCNodeAllHandles * +PrepareBarrier(const char *id) +{ + PGXCNodeAllHandles *coord_handles; + + elog(DEBUG2, "Preparing coordinators for BARRIER"); + + /* + * Send a CREATE BARRIER PREPARE message to all the coordinators. We should + * send an asynchronous request so that we can disable local commits and + * then wait for the remote coordinators to finish the work + */ + coord_handles = SendBarrierPrepareRequest(GetAllCoordNodes(), id); + + /* + * Disable local commits + */ + LWLockAcquire(BarrierLock, LW_EXCLUSIVE); + + elog(DEBUG2, "Disabled 2PC commits origniating at the diriving coordinator"); + + /* + * TODO Start a timer to cancel the barrier request in case of a timeout + */ + + /* + * Local in-flight commits are now over. Check status of the remote + * coordinators + */ + CheckBarrierCommandStatus(coord_handles, id, "PREPARE"); + + return coord_handles; +} + +/* + * Execute the barrier command on all the components, including data nodes and + * coordinators. 
+ */ +static void +ExecuteBarrier(const char *id) +{ + List *barrierDataNodeList = GetAllDataNodes(); + List *barrierCoordList = GetAllCoordNodes(); + PGXCNodeAllHandles *conn_handles; + int conn; + int msglen; + int barrier_idlen; + + conn_handles = get_handles(barrierDataNodeList, barrierCoordList, false); + + elog(DEBUG2, "Sending CREATE BARRIER <%s> EXECUTE message to " + "data nodes and coordinator", id); + /* + * Send a CREATE BARRIER request to all the data nodes and the coordinators + */ + for (conn = 0; conn < conn_handles->co_conn_count + conn_handles->dn_conn_count; conn++) + { + PGXCNodeHandle *handle; + + if (conn < conn_handles->co_conn_count) + handle = conn_handles->coord_handles[conn]; + else + handle = conn_handles->datanode_handles[conn - conn_handles->co_conn_count]; + + /* Invalid connection state, return error */ + if (handle->state != DN_CONNECTION_STATE_IDLE) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Failed to send CREATE BARRIER PREPARE request " + "to the node"))); + + barrier_idlen = strlen(id) + 1; + + msglen = 4; /* for the length itself */ + msglen += barrier_idlen; + msglen += 1; /* for barrier command itself */ + + /* msgType + msgLen */ + if (ensure_out_buffer_capacity(handle->outEnd + 1 + msglen, handle) != 0) + { + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("Out of memory"))); + } + + handle->outBuffer[handle->outEnd++] = 'b'; + msglen = htonl(msglen); + memcpy(handle->outBuffer + handle->outEnd, &msglen, 4); + handle->outEnd += 4; + + handle->outBuffer[handle->outEnd++] = CREATE_BARRIER_EXECUTE; + + memcpy(handle->outBuffer + handle->outEnd, id, barrier_idlen); + handle->outEnd += barrier_idlen; + + handle->state = DN_CONNECTION_STATE_QUERY; + pgxc_node_flush(handle); + + /* FIXME Use the right context */ + handle->barrier_id = strdup(id); + } + + CheckBarrierCommandStatus(conn_handles, id, "EXECUTE"); + + /* + * Also WAL log the BARRIER locally and flush the WAL buffers to disk + */ +} + +/* 
+ * Resume 2PC commits on the local as well as remote coordinators. + */ +static void +EndBarrier(PGXCNodeAllHandles *prepared_handles, const char *id) +{ + /* Resume 2PC locally */ + LWLockRelease(BarrierLock); + + SendBarrierEndRequest(prepared_handles, id); + + CheckBarrierCommandStatus(prepared_handles, id, "END"); +} + +void +RequestBarrier(const char *id, char *completionTag) +{ + PGXCNodeAllHandles *prepared_handles; + const char *barrier_id; + + elog(DEBUG2, "CREATE BARRIER request received"); + /* + * Ensure that we are a coordinator and the request is not from another + * coordinator + */ + if (!IS_PGXC_COORDINATOR) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("CREATE BARRIER command must be sent to a coordinator"))); + + if (IsConnFromCoord()) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("CREATE BARRIER command is not expected from another coordinator"))); + + /* + * Get a barrier id if the user has not supplied it + */ + barrier_id = generate_barrier_id(id); + + elog(DEBUG2, "CREATE BARRIER <%s>", barrier_id); + + /* + * Step One. Prepare all coordinators for upcoming barrier request + */ + prepared_handles = PrepareBarrier(barrier_id); + + /* + * Step two. Issue BARRIER command to all involved components, including + * coordinators and data nodes + */ + ExecuteBarrier(barrier_id); + + /* + * Step three. 
Inform coordinators about a successfully completed barrier + */ + EndBarrier(prepared_handles, barrier_id); + + if (completionTag) + snprintf(completionTag, COMPLETION_TAG_BUFSIZE, "BARRIER %s", barrier_id); +} + +void +barrier_redo(XLogRecPtr lsn, XLogRecord *record) +{ + /* Nothing to do */ + return; +} + +void +barrier_desc(StringInfo buf, uint8 xl_info, char *rec) +{ + Assert(xl_info == XLOG_BARRIER_CREATE); + appendStringInfo(buf, "BARRIER %s", rec); +} diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 03482a0..c297003 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -1389,6 +1389,7 @@ pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle ** connections, * RESPONSE_TUPLEDESC - got tuple description * RESPONSE_DATAROW - got data row * RESPONSE_COPY - got copy response + * RESPONSE_BARRIER_OK - barrier command completed successfully */ int handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner) @@ -1500,6 +1501,16 @@ handle_response(PGXCNodeHandle * conn, RemoteQueryState *combiner) #endif return result; } + +#ifdef PGXC + case 'b': + { + Assert((strncmp(msg, conn->barrier_id, msg_len) == 0)); + conn->state = DN_CONNECTION_STATE_IDLE; + return RESPONSE_BARRIER_OK; + } +#endif + case 'I': /* EmptyQuery */ default: /* sync lost? 
*/ diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 71b1398..c54bd60 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -80,6 +80,7 @@ #include "access/gtm.h" /* PGXC_COORD */ #include "pgxc/execRemote.h" +#include "pgxc/barrier.h" #include "pgxc/planner.h" #include "pgxc/pgxcnode.h" #include "commands/copy.h" @@ -447,6 +448,7 @@ SocketBackend(StringInfo inBuf) case 'g': /* GXID */ case 's': /* Snapshot */ case 't': /* Timestamp */ + case 'b': /* Barrier */ break; #endif @@ -4290,6 +4292,37 @@ PostgresMain(int argc, char *argv[], const char *username) */ SetCurrentGTMDeltaTimestamp(timestamp); break; + + case 'b': /* barrier */ + { + int command; + char *id; + + command = pq_getmsgbyte(&input_message); + id = pq_getmsgstring(&input_message); + pq_... [truncated message content] |
From: Ashutosh B. <ash...@us...> - 2011-06-21 06:59:49
|
Project "Postgres-XC". The branch, master has been updated via 2a828017d88ff64453b37771337646454316269c (commit) from 88a19b42f3b599927b74e0aef2e19b9161b3a7eb (commit) - Log ----------------------------------------------------------------- commit 2a828017d88ff64453b37771337646454316269c Author: Ashutosh Bapat <ash...@en...> Date: Tue Jun 21 12:19:07 2011 +0530 The patch has following changes: 1. Any query with aggregates in it, is planned through standard planner, instead of directly creating a RemoteQuery node for it in pgxc_planner(). Before this fix, any query which involved aggregates and no grouping clause, was directly converted into a RemoteQuery node. RemoteQuery had no mechanism to handle aggregates involved in expressions properly (3237756). It could only handle aggregates without any covering node. Also, aggregates with order by clause would cause aggregates grouped according to datanodes storing the rows (3237712, 3147936). With the fix, such queries are planned through standard planner and optimized in grouping_planner to push the aggregates and/or group by clauses to the datanodes. This change made the aggregation mechanism under RemoteQuery node unnecessary. Hence cleaned up the same. 2. If there are any qualifications in the query, which need to be evaluated on the coordinator only, those are considered as local quals and are stuck in RemoteQuery node. There was no mechanism to handle these quals in ExecRemoteQuery(). Added that mechanism. 3. Fixed a bug in create_remotegrouping_planner() - When the RemoteQuery node has quals in it, (i.e. there are local quals), we can not push the aggregates and GROUP BY clause to the data node, because before aggregating, we need to apply those scan clauses, which can happen only on coordinator. 4. There are couple of expected output changes, related to the explain verbose (in create_index test). The patch fixes bugs 3237712, 3147936 and 3237756. 
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index ccf5079..ae60a76 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c @@ -921,7 +921,8 @@ finalize_aggregate(AggState *aggstate, */ if (OidIsValid(peraggstate->collectfn_oid) && !aggstate->skip_trans) { - FunctionCallInfoData fcinfo; + FunctionCallInfoData fcinfo; + int saved_numArguments; InitFunctionCallInfoData(fcinfo, &(peraggstate->collectfn), 2, (void *) aggstate, NULL); /* @@ -937,24 +938,19 @@ finalize_aggregate(AggState *aggstate, fcinfo.argnull[0] = peraggstate->initCollectValueIsNull; fcinfo.arg[1] = pergroupstate->transValue; fcinfo.argnull[1] = pergroupstate->transValueIsNull; - if (fcinfo.flinfo->fn_strict && - (pergroupstate->transValueIsNull || peraggstate->initCollectValueIsNull)) - { - pergroupstate->transValue = (Datum)0; - pergroupstate->transValueIsNull = true; - } - else - { - Datum newVal = FunctionCallInvoke(&fcinfo); - - /* - * set the result of collection function to the transValue so that code - * below invoking final function does not change - */ - /* PGXCTODO: worry about the memory management here? */ - pergroupstate->transValue = newVal; - pergroupstate->transValueIsNull = fcinfo.isnull; - } + /* + * For collection function we expect only one argument other than the + * running collection result. The numArguments in peraggstate + * corresponds to the number of arguments to the aggregate, which is not + * correct for collection. Hence while applying collection function + * set numArguments to 1 and switch it back once the purpose is served. 
+ */ + saved_numArguments = peraggstate->numArguments; + peraggstate->numArguments = 1; + advance_collection_function(aggstate, peraggstate, pergroupstate, &fcinfo); + peraggstate->numArguments = saved_numArguments; + pergroupstate->transValue = pergroupstate->collectValue; + pergroupstate->transValueIsNull = pergroupstate->collectValueIsNull; } /* diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 2c7fee4..607adf5 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -905,7 +905,6 @@ _copyRemoteQuery(RemoteQuery *from) COPY_STRING_FIELD(sql_statement); COPY_NODE_FIELD(exec_nodes); COPY_SCALAR_FIELD(combine_type); - COPY_NODE_FIELD(simple_aggregates); COPY_NODE_FIELD(sort); COPY_NODE_FIELD(distinct); COPY_SCALAR_FIELD(read_only); @@ -953,40 +952,6 @@ _copyExecNodes(ExecNodes *from) } /* - * _copySimpleAgg - */ -static SimpleAgg * -_copySimpleAgg(SimpleAgg *from) -{ - SimpleAgg *newnode = makeNode(SimpleAgg); - - COPY_SCALAR_FIELD(column_pos); - COPY_NODE_FIELD(aggref); - COPY_SCALAR_FIELD(transfn_oid); - COPY_SCALAR_FIELD(finalfn_oid); - COPY_SCALAR_FIELD(arginputfn); - COPY_SCALAR_FIELD(argioparam); - COPY_SCALAR_FIELD(resoutputfn); - COPY_SCALAR_FIELD(transfn); - COPY_SCALAR_FIELD(finalfn); - if (!from->initValueIsNull) - newnode->initValue = datumCopy(from->initValue, from->transtypeByVal, - from->transtypeLen); - COPY_SCALAR_FIELD(initValueIsNull); - COPY_SCALAR_FIELD(inputtypeLen); - COPY_SCALAR_FIELD(resulttypeLen); - COPY_SCALAR_FIELD(transtypeLen); - COPY_SCALAR_FIELD(inputtypeByVal); - COPY_SCALAR_FIELD(resulttypeByVal); - COPY_SCALAR_FIELD(transtypeByVal); - /* No need to copy runtime info, just init */ - newnode->collectValueNull = true; - initStringInfo(&newnode->valuebuf); - - return newnode; -} - -/* * _copySimpleSort */ static SimpleSort * @@ -3860,9 +3825,6 @@ copyObject(void *from) case T_ExecNodes: retval = _copyExecNodes(from); break; - case T_SimpleAgg: - retval = _copySimpleAgg(from); - 
break; case T_SimpleSort: retval = _copySimpleSort(from); break; diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index a666914..a96619b 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -5158,11 +5158,20 @@ create_remotegrouping_plan(PlannerInfo *root, Plan *local_plan) return local_plan; /* * for Group plan we expect Sort under the Group, which is always the case, - * the condition below is really for some possible non-existent case + * the condition below is really for some possibly non-existent case. */ if (IsA(local_plan, Group) && !sort_plan) return local_plan; - + /* + * If the remote_scan has any quals on it, those need to be executed before + * doing anything. Hence we won't be able to push any aggregates or grouping + * to the data node. + * If it has any SimpleSort in it, then sorting is intended to be applied + * before doing anything. Hence can not push any aggregates or grouping to + * the data node. + */ + if (remote_scan->scan.plan.qual || remote_scan->sort) + return local_plan; /* * Grouping_planner may add Sort node to sort the rows @@ -5183,6 +5192,11 @@ create_remotegrouping_plan(PlannerInfo *root, Plan *local_plan) } } + /* + * At last we find the plan underneath is reducible into a single + * RemoteQuery node. 
+ */ + /* find all the relations referenced by targetlist of Grouping node */ temp_vars = pull_var_clause((Node *)local_plan->targetlist, PVC_REJECT_PLACEHOLDERS); diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 89d3a39..e67cc5e 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -2117,7 +2117,6 @@ transformExecDirectStmt(ParseState *pstate, ExecDirectStmt *stmt) step->sql_statement = NULL; step->exec_nodes = NULL; step->combine_type = COMBINE_TYPE_NONE; - step->simple_aggregates = NIL; step->sort = NULL; step->distinct = NULL; step->read_only = true; diff --git a/src/backend/pgxc/plan/planner.c b/src/backend/pgxc/plan/planner.c index 652008b..52f3a0a 100644 --- a/src/backend/pgxc/plan/planner.c +++ b/src/backend/pgxc/plan/planner.c @@ -1903,7 +1903,6 @@ makeRemoteQuery(void) result->sql_statement = NULL; result->exec_nodes = NULL; result->combine_type = COMBINE_TYPE_NONE; - result->simple_aggregates = NIL; result->sort = NULL; result->distinct = NULL; result->read_only = true; @@ -2018,192 +2017,6 @@ get_plan_combine_type(Query *query, char baselocatortype) return COMBINE_TYPE_NONE; } - -/* - * Get list of simple aggregates used. 
- */ -static List * -get_simple_aggregates(Query * query) -{ - List *simple_agg_list = NIL; - - /* Check for simple multi-node aggregate */ - if (query->hasAggs) - { - ListCell *lc; - int column_pos = 0; - - foreach (lc, query->targetList) - { - TargetEntry *tle = (TargetEntry *) lfirst(lc); - - if (IsA(tle->expr, Aggref)) - { - /*PGXC borrowed this code from nodeAgg.c, see ExecInitAgg()*/ - SimpleAgg *simple_agg; - Aggref *aggref = (Aggref *) tle->expr; - HeapTuple aggTuple; - Form_pg_aggregate aggform; - Oid aggcollecttype; - AclResult aclresult; - Oid transfn_oid, - finalfn_oid; - Expr *transfnexpr, - *finalfnexpr; - Datum textInitVal; - - simple_agg = makeNode(SimpleAgg); - simple_agg->column_pos = column_pos; - initStringInfo(&simple_agg->valuebuf); - simple_agg->aggref = aggref; - - aggTuple = SearchSysCache(AGGFNOID, - ObjectIdGetDatum(aggref->aggfnoid), - 0, 0, 0); - if (!HeapTupleIsValid(aggTuple)) - elog(ERROR, "cache lookup failed for aggregate %u", - aggref->aggfnoid); - aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple); - - /* Check permission to call aggregate function */ - aclresult = pg_proc_aclcheck(aggref->aggfnoid, GetUserId(), - ACL_EXECUTE); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, ACL_KIND_PROC, - get_func_name(aggref->aggfnoid)); - - simple_agg->transfn_oid = transfn_oid = aggform->aggcollectfn; - simple_agg->finalfn_oid = finalfn_oid = aggform->aggfinalfn; - - /* Check that aggregate owner has permission to call component fns */ - { - HeapTuple procTuple; - Oid aggOwner; - - procTuple = SearchSysCache(PROCOID, - ObjectIdGetDatum(aggref->aggfnoid), - 0, 0, 0); - if (!HeapTupleIsValid(procTuple)) - elog(ERROR, "cache lookup failed for function %u", - aggref->aggfnoid); - aggOwner = ((Form_pg_proc) GETSTRUCT(procTuple))->proowner; - ReleaseSysCache(procTuple); - - aclresult = pg_proc_aclcheck(transfn_oid, aggOwner, - ACL_EXECUTE); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, ACL_KIND_PROC, - 
get_func_name(transfn_oid)); - if (OidIsValid(finalfn_oid)) - { - aclresult = pg_proc_aclcheck(finalfn_oid, aggOwner, - ACL_EXECUTE); - if (aclresult != ACLCHECK_OK) - aclcheck_error(aclresult, ACL_KIND_PROC, - get_func_name(finalfn_oid)); - } - } - - /* resolve actual type of transition state, if polymorphic */ - aggcollecttype = aggform->aggcollecttype; - - /* build expression trees using actual argument & result types */ - build_aggregate_fnexprs(&aggform->aggtranstype, - 1, - aggcollecttype, - aggref->aggtype, - transfn_oid, - finalfn_oid, - &transfnexpr, - &finalfnexpr); - - /* Get InputFunction info for transition result */ - { - Oid typinput; - - getTypeInputInfo(aggform->aggtranstype, &typinput, &simple_agg->argioparam); - fmgr_info(typinput, &simple_agg->arginputfn); - } - - /* Get InputFunction info for result */ - { - Oid typoutput; - bool typvarlena; - - getTypeOutputInfo(simple_agg->aggref->aggtype, &typoutput, &typvarlena); - fmgr_info(typoutput, &simple_agg->resoutputfn); - } - - fmgr_info(transfn_oid, &simple_agg->transfn); - simple_agg->transfn.fn_expr = (Node *) transfnexpr; - - if (OidIsValid(finalfn_oid)) - { - fmgr_info(finalfn_oid, &simple_agg->finalfn); - simple_agg->finalfn.fn_expr = (Node *) finalfnexpr; - } - - get_typlenbyval(aggref->aggtype, - &simple_agg->resulttypeLen, - &simple_agg->resulttypeByVal); - get_typlenbyval(aggcollecttype, - &simple_agg->transtypeLen, - &simple_agg->transtypeByVal); - - /* - * initval is potentially null, so don't try to access it as a struct - * field. Must do it the hard way with SysCacheGetAttr. 
- */ - textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple, - Anum_pg_aggregate_agginitcollect, - &simple_agg->initValueIsNull); - - if (simple_agg->initValueIsNull) - simple_agg->initValue = (Datum) 0; - else - { - Oid typinput, - typioparam; - char *strInitVal; - Datum initVal; - - getTypeInputInfo(aggcollecttype, &typinput, &typioparam); - strInitVal = TextDatumGetCString(textInitVal); - initVal = OidInputFunctionCall(typinput, strInitVal, - typioparam, -1); - pfree(strInitVal); - simple_agg->initValue = initVal; - } - - /* - * If the transfn is strict and the initval is NULL, make sure trans - * type and collect type are the same (or at least binary-compatible), - * so that it's OK to use the first input value as the initial - * transValue. This should have been checked at agg definition time, - * but just in case... - */ - if (simple_agg->transfn.fn_strict && simple_agg->initValueIsNull) - { - if (!IsBinaryCoercible(aggform->aggtranstype, aggcollecttype)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), - errmsg("aggregate %u needs to have compatible transition type and collection type", - aggref->aggfnoid))); - } - - /* PGXCTODO distinct support */ - - ReleaseSysCache(aggTuple); - - simple_agg_list = lappend(simple_agg_list, simple_agg); - } - column_pos++; - } - } - return simple_agg_list; -} - - /* * add_sort_column --- utility subroutine for building sort info arrays * @@ -2966,14 +2779,6 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) query_step->combine_type = get_plan_combine_type( query, query_step->exec_nodes->baselocatortype); - /* Set up simple aggregates */ - /* PGXCTODO - we should detect what types of aggregates are used. - * in some cases we can avoid the final step and merely proxy results - * (when there is only one data node involved) instead of using - * coordinator consolidation. 
At the moment this is needed for AVG() - */ - query_step->simple_aggregates = get_simple_aggregates(query); - /* * Add sorting to the step */ @@ -3000,7 +2805,7 @@ pgxc_planner(Query *query, int cursorOptions, ParamListInfo boundParams) * distribution the table has. */ if (query->commandType == CMD_SELECT - && (query->groupClause || query->hasWindowFuncs || query->hasRecursive)) + && (query->hasAggs || query->groupClause || query->hasWindowFuncs || query->hasRecursive)) { result = standard_planner(query, cursorOptions, boundParams); return result; @@ -3092,8 +2897,6 @@ free_query_step(RemoteQuery *query_step) if (query_step->exec_nodes->primarynodelist) list_free(query_step->exec_nodes->primarynodelist); } - if (query_step->simple_aggregates != NULL) - list_free_deep(query_step->simple_aggregates); pfree(query_step); } diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 03482a0..ae0d813 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -223,7 +223,6 @@ CreateResponseCombiner(int node_count, CombineType combine_type) combiner->rowBuffer = NIL; combiner->tapenodes = NULL; combiner->initAggregates = true; - combiner->simple_aggregates = NULL; combiner->copy_file = NULL; return combiner; @@ -257,146 +256,6 @@ parse_row_count(const char *message, size_t len, uint64 *rowcount) } /* - * Initialize the collection value, when agregation is first set up, or for a - * new group (grouping support is not implemented yet) - */ -static void -initialize_collect_aggregates(SimpleAgg *simple_agg) -{ - if (simple_agg->initValueIsNull) - simple_agg->collectValue = simple_agg->initValue; - else - simple_agg->collectValue = datumCopy(simple_agg->initValue, - simple_agg->transtypeByVal, - simple_agg->transtypeLen); - simple_agg->noCollectValue = simple_agg->initValueIsNull; - simple_agg->collectValueNull = simple_agg->initValueIsNull; -} - -/* - * Finalize the aggregate after current group or entire relation 
is processed - * (grouping support is not implemented yet) - */ -static void -finalize_collect_aggregates(SimpleAgg *simple_agg, Datum *resultVal, bool *resultIsNull) -{ - /* - * Apply the agg's finalfn if one is provided, else return collectValue. - */ - if (OidIsValid(simple_agg->finalfn_oid)) - { - FunctionCallInfoData fcinfo; - - InitFunctionCallInfoData(fcinfo, &(simple_agg->finalfn), 1, - (void *) simple_agg, NULL); - fcinfo.arg[0] = simple_agg->collectValue; - fcinfo.argnull[0] = simple_agg->collectValueNull; - if (fcinfo.flinfo->fn_strict && simple_agg->collectValueNull) - { - /* don't call a strict function with NULL inputs */ - *resultVal = (Datum) 0; - *resultIsNull = true; - } - else - { - *resultVal = FunctionCallInvoke(&fcinfo); - *resultIsNull = fcinfo.isnull; - } - } - else - { - *resultVal = simple_agg->collectValue; - *resultIsNull = simple_agg->collectValueNull; - } -} - -/* - * Given new input value(s), advance the transition function of an aggregate. - * - * The new values (and null flags) have been preloaded into argument positions - * 1 and up in fcinfo, so that we needn't copy them again to pass to the - * collection function. No other fields of fcinfo are assumed valid. - * - * It doesn't matter which memory context this is called in. - */ -static void -advance_collect_function(SimpleAgg *simple_agg, FunctionCallInfoData *fcinfo) -{ - Datum newVal; - - if (simple_agg->transfn.fn_strict) - { - /* - * For a strict transfn, nothing happens when there's a NULL input; we - * just keep the prior transValue. - */ - if (fcinfo->argnull[1]) - return; - if (simple_agg->noCollectValue) - { - /* - * result has not been initialized - * We must copy the datum into result if it is pass-by-ref. We - * do not need to pfree the old result, since it's NULL. - * PGXCTODO: in case the transition result type is different from - * collection result type, this code would not work, since we are - * assigning datum of one type to another. 
For this code to work the - * input and output of collection function needs to be binary - * compatible which is not. So, either check in AggregateCreate, - * that the input and output of collection function are binary - * coercible or set the initial values something non-null or change - * this code - */ - simple_agg->collectValue = datumCopy(fcinfo->arg[1], - simple_agg->transtypeByVal, - simple_agg->transtypeLen); - simple_agg->collectValueNull = false; - simple_agg->noCollectValue = false; - return; - } - if (simple_agg->collectValueNull) - { - /* - * Don't call a strict function with NULL inputs. Note it is - * possible to get here despite the above tests, if the transfn is - * strict *and* returned a NULL on a prior cycle. If that happens - * we will propagate the NULL all the way to the end. - */ - return; - } - } - - /* - * OK to call the transition function - */ - InitFunctionCallInfoData(*fcinfo, &(simple_agg->transfn), 2, (void *) simple_agg, NULL); - fcinfo->arg[0] = simple_agg->collectValue; - fcinfo->argnull[0] = simple_agg->collectValueNull; - newVal = FunctionCallInvoke(fcinfo); - - /* - * If pass-by-ref datatype, must copy the new value into aggcontext and - * pfree the prior transValue. But if transfn returned a pointer to its - * first input, we don't need to do anything. 
- */ - if (!simple_agg->transtypeByVal && - DatumGetPointer(newVal) != DatumGetPointer(simple_agg->collectValue)) - { - if (!fcinfo->isnull) - { - newVal = datumCopy(newVal, - simple_agg->transtypeByVal, - simple_agg->transtypeLen); - } - if (!simple_agg->collectValueNull) - pfree(DatumGetPointer(simple_agg->collectValue)); - } - - simple_agg->collectValue = newVal; - simple_agg->collectValueNull = fcinfo->isnull; -} - -/* * Convert RowDescription message to a TupleDesc */ static TupleDesc @@ -460,64 +319,6 @@ create_tuple_desc(char *msg_body, size_t len) return result; } -static void -exec_simple_aggregates(RemoteQueryState *combiner, TupleTableSlot *slot) -{ - ListCell *lc; - - Assert(combiner->simple_aggregates); - Assert(!TupIsNull(slot)); - - if (combiner->initAggregates) - { - foreach (lc, combiner->simple_aggregates) - initialize_collect_aggregates((SimpleAgg *) lfirst(lc)); - - combiner->initAggregates = false; - } - - foreach (lc, combiner->simple_aggregates) - { - SimpleAgg *simple_agg = (SimpleAgg *) lfirst(lc); - FunctionCallInfoData fcinfo; - int attr = simple_agg->column_pos; - - slot_getsomeattrs(slot, attr + 1); - fcinfo.arg[1] = slot->tts_values[attr]; - fcinfo.argnull[1] = slot->tts_isnull[attr]; - - advance_collect_function(simple_agg, &fcinfo); - } -} - -static void -finish_simple_aggregates(RemoteQueryState *combiner, TupleTableSlot *slot) -{ - ListCell *lc; - ExecClearTuple(slot); - - /* - * Aggregates may not been initialized if no rows has been received - * from the data nodes because of HAVING clause. 
- * In this case finish_simple_aggregates() should return empty slot - */ - if (!combiner->initAggregates) - { - foreach (lc, combiner->simple_aggregates) - { - SimpleAgg *simple_agg = (SimpleAgg *) lfirst(lc); - int attr = simple_agg->column_pos; - - finalize_collect_aggregates(simple_agg, - slot->tts_values + attr, - slot->tts_isnull + attr); - } - ExecStoreVirtualTuple(slot); - /* To prevent aggregates get finalized again */ - combiner->initAggregates = true; - } -} - /* * Handle CopyOutCommandComplete ('c') message from a data node connection */ @@ -1034,7 +835,6 @@ ValidateAndResetCombiner(RemoteQueryState *combiner) combiner->currentRow.msgnode = 0; combiner->rowBuffer = NIL; combiner->tapenodes = NULL; - combiner->simple_aggregates = NULL; combiner->copy_file = NULL; return valid; @@ -2913,7 +2713,6 @@ ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags) remotestate = CreateResponseCombiner(0, node->combine_type); remotestate->ss.ps.plan = (Plan *) node; remotestate->ss.ps.state = estate; - remotestate->simple_aggregates = node->simple_aggregates; remotestate->ss.ps.qual = (List *) ExecInitExpr((Expr *) node->scan.plan.qual, @@ -2990,6 +2789,8 @@ ExecInitRemoteQuery(RemoteQuery *node, EState *estate, int eflags) ExecAssignExprContext(estate, &remotestate->ss.ps); } } + else if (remotestate->ss.ps.qual) + ExecAssignExprContext(estate, &remotestate->ss.ps); if (innerPlan(node)) innerPlanState(remotestate) = ExecInitNode(innerPlan(node), estate, eflags); @@ -3691,6 +3492,8 @@ ExecRemoteQuery(RemoteQueryState *node) TupleTableSlot *resultslot = node->ss.ps.ps_ResultTupleSlot; TupleTableSlot *scanslot = node->ss.ss_ScanTupleSlot; bool have_tuple = false; + List *qual = node->ss.ps.qual; + ExprContext *econtext = node->ss.ps.ps_ExprContext; if (!node->query_Done) { @@ -3757,7 +3560,15 @@ handle_results: while (tuplesort_gettupleslot((Tuplesortstate *) node->tuplesortstate, true, scanslot)) { - have_tuple = true; + if (qual) + econtext->ecxt_scantuple 
= scanslot; + if (!qual || ExecQual(qual, econtext, false)) + have_tuple = true; + else + { + have_tuple = false; + continue; + } /* * If DISTINCT is specified and current tuple matches to * previous skip it and get next one. @@ -3791,16 +3602,9 @@ handle_results: { while (FetchTuple(node, scanslot) && !TupIsNull(scanslot)) { - if (node->simple_aggregates) - { - /* - * Advance aggregate functions and allow to read up next - * data row message and get tuple in the same slot on - * next iteration - */ - exec_simple_aggregates(node, scanslot); - } - else + if (qual) + econtext->ecxt_scantuple = scanslot; + if (!qual || ExecQual(qual, econtext, false)) { /* * Receive current slot and read up next data row @@ -3814,17 +3618,6 @@ handle_results: } } - /* - * We may need to finalize aggregates - */ - if (node->simple_aggregates) - { - finish_simple_aggregates(node, resultslot); - - if (!TupIsNull(resultslot)) - have_tuple = true; - } - if (!have_tuple) /* report end of scan */ ExecClearTuple(resultslot); } diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 5f5b947..ad33074 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -79,7 +79,6 @@ typedef enum NodeTag * TAGS FOR PGXC NODES (planner.h, locator.h) */ T_ExecNodes, - T_SimpleAgg, T_SimpleSort, T_SimpleDistinct, T_RemoteQuery, diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index fb9232f..e522ba9 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -101,13 +101,6 @@ typedef struct RemoteQueryState * to initialize collecting of aggregates from the DNs */ bool initAggregates; - /* - * PGXCTODO - - * we should get rid of the simple_aggregates member, that should work - * through Agg node and grouping_planner should take care of optimizing it - * to the fullest - */ - List *simple_aggregates; /* description of aggregate functions */ void *tuplesortstate; /* for merge sort */ /* Simple DISTINCT support */ FmgrInfo 
*eqfunctions; /* functions to compare tuples */ diff --git a/src/include/pgxc/planner.h b/src/include/pgxc/planner.h index 96a2a68..1a18c9f 100644 --- a/src/include/pgxc/planner.h +++ b/src/include/pgxc/planner.h @@ -93,7 +93,6 @@ typedef struct char *sql_statement; ExecNodes *exec_nodes; /* List of Datanodes where to launch query */ CombineType combine_type; - List *simple_aggregates; /* simple aggregate to combine on this step */ SimpleSort *sort; SimpleDistinct *distinct; bool read_only; /* do not use 2PC when committing read only steps */ @@ -122,73 +121,6 @@ typedef struct char *join_condition; } RemoteQuery; - -/* - * For handling simple aggregates (no group by present) - * For now, only MAX will be supported. - */ -typedef enum -{ - AGG_TYPE_MAX, - AGG_TYPE_MIN, - AGG_TYPE_COUNT, - AGG_TYPE_SUM, - AGG_TYPE_AVG -} SimpleAggType; - - -/* For handling simple aggregates */ -typedef struct -{ - NodeTag type; - int column_pos; /* Only use 1 for now */ - Aggref *aggref; - Oid transfn_oid; - Oid finalfn_oid; - - /* Input Functions, to parse arguments coming from the data nodes */ - FmgrInfo arginputfn; - Oid argioparam; - - /* Output Function, to encode result to present to client */ - FmgrInfo resoutputfn; - - /* - * fmgr lookup data for transfer functions --- only valid when - * corresponding oid is not InvalidOid. Note in particular that fn_strict - * flags are kept here. - */ - FmgrInfo transfn; - FmgrInfo finalfn; - - /* - * initial value from pg_aggregate entry - */ - Datum initValue; - bool initValueIsNull; - - /* - * We need the len and byval info for the agg's input, result, and - * transition data types in order to know how to copy/delete values. 
- */ - int16 inputtypeLen, - resulttypeLen, - transtypeLen; - bool inputtypeByVal, - resulttypeByVal, - transtypeByVal; - - /* - * State of current group - */ - bool noCollectValue; - Datum collectValue; - bool collectValueNull; - - /* a value buffer to avoid multiple allocations */ - StringInfoData valuebuf; -} SimpleAgg; - typedef struct { bool partitioned_replicated; diff --git a/src/test/regress/expected/create_index_1.out b/src/test/regress/expected/create_index_1.out index ab3807c..67d3939 100644 --- a/src/test/regress/expected/create_index_1.out +++ b/src/test/regress/expected/create_index_1.out @@ -244,10 +244,12 @@ LINE 1: SELECT count(*) FROM gcircle_tbl WHERE f1 && '<(500,500),500... ^ EXPLAIN (COSTS OFF) SELECT count(*) FROM point_tbl WHERE f1 <@ box '(0,0,100,100)'; - QUERY PLAN ---------------------------------- - Data Node Scan (Node Count [1]) -(1 row) + QUERY PLAN +--------------------------------------------- + Aggregate + -> Materialize + -> Data Node Scan (Node Count [1]) +(3 rows) SELECT count(*) FROM point_tbl WHERE f1 <@ box '(0,0,100,100)'; count @@ -257,10 +259,12 @@ SELECT count(*) FROM point_tbl WHERE f1 <@ box '(0,0,100,100)'; EXPLAIN (COSTS OFF) SELECT count(*) FROM point_tbl WHERE box '(0,0,100,100)' @> f1; - QUERY PLAN ---------------------------------- - Data Node Scan (Node Count [1]) -(1 row) + QUERY PLAN +--------------------------------------------- + Aggregate + -> Materialize + -> Data Node Scan (Node Count [1]) +(3 rows) SELECT count(*) FROM point_tbl WHERE box '(0,0,100,100)' @> f1; count @@ -270,10 +274,12 @@ SELECT count(*) FROM point_tbl WHERE box '(0,0,100,100)' @> f1; EXPLAIN (COSTS OFF) SELECT count(*) FROM point_tbl WHERE f1 <@ polygon '(0,0),(0,100),(100,100),(50,50),(100,0),(0,0)'; - QUERY PLAN ---------------------------------- - Data Node Scan (Node Count [1]) -(1 row) + QUERY PLAN +--------------------------------------------- + Aggregate + -> Materialize + -> Data Node Scan (Node Count [1]) +(3 rows) SELECT 
count(*) FROM point_tbl WHERE f1 <@ polygon '(0,0),(0,100),(100,100),(50,50),(100,0),(0,0)'; count @@ -283,10 +289,12 @@ SELECT count(*) FROM point_tbl WHERE f1 <@ polygon '(0,0),(0,100),(100,100),(50, EXPLAIN (COSTS OFF) SELECT count(*) FROM point_tbl WHERE f1 <@ circle '<(50,50),50>'; - QUERY PLAN ---------------------------------- - Data Node Scan (Node Count [1]) -(1 row) + QUERY PLAN +--------------------------------------------- + Aggregate + -> Materialize + -> Data Node Scan (Node Count [1]) +(3 rows) SELECT count(*) FROM point_tbl WHERE f1 <@ circle '<(50,50),50>'; count @@ -296,10 +304,12 @@ SELECT count(*) FROM point_tbl WHERE f1 <@ circle '<(50,50),50>'; EXPLAIN (COSTS OFF) SELECT count(*) FROM point_tbl p WHERE p.f1 << '(0.0, 0.0)'; - QUERY PLAN ---------------------------------- - Data Node Scan (Node Count [1]) -(1 row) + QUERY PLAN +--------------------------------------------- + Aggregate + -> Materialize + -> Data Node Scan (Node Count [1]) +(3 rows) SELECT count(*) FROM point_tbl p WHERE p.f1 << '(0.0, 0.0)'; count @@ -309,10 +319,12 @@ SELECT count(*) FROM point_tbl p WHERE p.f1 << '(0.0, 0.0)'; EXPLAIN (COSTS OFF) SELECT count(*) FROM point_tbl p WHERE p.f1 >> '(0.0, 0.0)'; - QUERY PLAN ---------------------------------- - Data Node Scan (Node Count [1]) -(1 row) + QUERY PLAN +--------------------------------------------- + Aggregate + -> Materialize + -> Data Node Scan (Node Count [1]) +(3 rows) SELECT count(*) FROM point_tbl p WHERE p.f1 >> '(0.0, 0.0)'; count @@ -322,10 +334,12 @@ SELECT count(*) FROM point_tbl p WHERE p.f1 >> '(0.0, 0.0)'; EXPLAIN (COSTS OFF) SELECT count(*) FROM point_tbl p WHERE p.f1 <^ '(0.0, 0.0)'; - QUERY PLAN ---------------------------------- - Data Node Scan (Node Count [1]) -(1 row) + QUERY PLAN +--------------------------------------------- + Aggregate + -> Materialize + -> Data Node Scan (Node Count [1]) +(3 rows) SELECT count(*) FROM point_tbl p WHERE p.f1 <^ '(0.0, 0.0)'; count @@ -335,10 +349,12 @@ SELECT 
count(*) FROM point_tbl p WHERE p.f1 <^ '(0.0, 0.0)'; EXPLAIN (COSTS OFF) SELECT count(*) FROM point_tbl p WHERE p.f1 >^ '(0.0, 0.0)'; - QUERY PLAN ---------------------------------- - Data Node Scan (Node Count [1]) -(1 row) + QUERY PLAN +--------------------------------------------- + Aggregate + -> Materialize + -> Data Node Scan (Node Count [1]) +(3 rows) SELECT count(*) FROM point_tbl p WHERE p.f1 >^ '(0.0, 0.0)'; count @@ -348,10 +364,12 @@ SELECT count(*) FROM point_tbl p WHERE p.f1 >^ '(0.0, 0.0)'; EXPLAIN (COSTS OFF) SELECT count(*) FROM point_tbl p WHERE p.f1 ~= '(-5, -12)'; - QUERY PLAN ---------------------------------- - Data Node Scan (Node Count [1]) -(1 row) + QUERY PLAN +--------------------------------------------- + Aggregate + -> Materialize + -> Data Node Scan (Node Count [1]) +(3 rows) SELECT count(*) FROM point_tbl p WHERE p.f1 ~= '(-5, -12)'; count ----------------------------------------------------------------------- Summary of changes: src/backend/executor/nodeAgg.c | 34 ++-- src/backend/nodes/copyfuncs.c | 38 ---- src/backend/optimizer/plan/createplan.c | 18 ++- src/backend/parser/analyze.c | 1 - src/backend/pgxc/plan/planner.c | 199 +--------------------- src/backend/pgxc/pool/execRemote.c | 239 ++------------------------ src/include/nodes/nodes.h | 1 - src/include/pgxc/execRemote.h | 7 - src/include/pgxc/planner.h | 68 -------- src/test/regress/expected/create_index_1.out | 90 ++++++---- 10 files changed, 102 insertions(+), 593 deletions(-) hooks/post-receive -- Postgres-XC |
From: Michael P. <mic...@us...> - 2011-06-16 00:08:00
|
Project "Postgres-XC". The branch, master has been updated via 88a19b42f3b599927b74e0aef2e19b9161b3a7eb (commit) from bcfa7b115f2a76e0146016dee36995df6eab89d0 (commit) - Log ----------------------------------------------------------------- commit 88a19b42f3b599927b74e0aef2e19b9161b3a7eb Author: Michael P <mic...@us...> Date: Thu Jun 16 09:05:43 2011 +0900 Support for START TRANSACTION Until now XC was just sending down to Datanodes a plain BEGIN query each time a transaction was begun on backend nodes. This commit extends support of transaction start with isolation level and read operation customizations. Example of queries: START TRANSACTION ISOLATION LEVEL READ COMMITTED; START TRANSACTION READ WRITE; Regression test transaction is now fixed. diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index ee02109..03482a0 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -53,6 +53,7 @@ static bool is_ddl = false; static bool implicit_force_autocommit = false; static PGXCNodeHandle **write_node_list = NULL; static int write_node_count = 0; +static char *begin_string = NULL; static int pgxc_node_begin(int conn_count, PGXCNodeHandle ** connections, GlobalTransactionId gxid); @@ -1536,8 +1537,16 @@ pgxc_node_begin(int conn_count, PGXCNodeHandle ** connections, if (GlobalTimestampIsValid(timestamp) && pgxc_node_send_timestamp(connections[i], timestamp)) return EOF; - if (pgxc_node_send_query(connections[i], "BEGIN")) - return EOF; + if (begin_string) + { + if (pgxc_node_send_query(connections[i], begin_string)) + return EOF; + } + else + { + if (pgxc_node_send_query(connections[i], "BEGIN")) + return EOF; + } } combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE); @@ -1573,6 +1582,22 @@ PGXCNodeBegin(void) clear_write_node_list(); } +void +PGXCNodeSetBeginQuery(char *query_string) +{ + int len; + + if (!query_string) + return; + + len = strlen(query_string); + /* + * This query string is 
sent to backend nodes, + * it contains serializable and read options + */ + begin_string = (char *)malloc(len + 1); + begin_string = memcpy(begin_string, query_string, len + 1); +} /* * Prepare transaction on Datanodes and Coordinators involved in current transaction. @@ -1624,6 +1649,11 @@ finish: if (!PersistentConnections) release_handles(); autocommit = true; + if (begin_string) + { + free(begin_string); + begin_string = NULL; + } is_ddl = false; clear_write_node_list(); @@ -1898,6 +1928,11 @@ finish: if (!PersistentConnections && res == 0) release_handles(); autocommit = true; + if (begin_string) + { + free(begin_string); + begin_string = NULL; + } is_ddl = false; clear_write_node_list(); @@ -2034,6 +2069,11 @@ finish: if (!PersistentConnections) release_handles(); autocommit = true; + if (begin_string) + { + free(begin_string); + begin_string = NULL; + } is_ddl = false; clear_write_node_list(); @@ -2158,6 +2198,11 @@ finish: if (!PersistentConnections) release_handles(); autocommit = true; + if (begin_string) + { + free(begin_string); + begin_string = NULL; + } is_ddl = false; clear_write_node_list(); @@ -2250,6 +2295,11 @@ finish: if (!PersistentConnections && bReleaseHandles) release_handles(); autocommit = true; + if (begin_string) + { + free(begin_string); + begin_string = NULL; + } is_ddl = false; clear_write_node_list(); @@ -2319,6 +2369,11 @@ finish: if (!PersistentConnections) release_handles(); autocommit = true; + if (begin_string) + { + free(begin_string); + begin_string = NULL; + } is_ddl = false; clear_write_node_list(); diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 9e2e9d4..e9e7afd 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -370,13 +370,10 @@ standard_ProcessUtility(Node *parsetree, { ListCell *lc; #ifdef PGXC - /* - * If a COMMIT PREPARED message is received from another Coordinator, - * Don't send it down to Datanodes. 
- */ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) PGXCNodeBegin(); #endif + BeginTransactionBlock(); foreach(lc, stmt->options) { @@ -391,6 +388,33 @@ standard_ProcessUtility(Node *parsetree, list_make1(item->arg), true); } + +#ifdef PGXC + /* + * Now that all the local variables have been set, + * it is time to rebuild the query. + */ + if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + { + char *begin_string = NULL; + + /* Result is palloc'd */ + foreach(lc, stmt->options) + { + DefElem *item = (DefElem *) lfirst(lc); + + if (strcmp(item->defname, "transaction_isolation") == 0) + begin_string = RewriteBeginQuery(begin_string, + "transaction_isolation", + list_make1(item->arg)); + else if (strcmp(item->defname, "transaction_read_only") == 0) + begin_string = RewriteBeginQuery(begin_string, + "transaction_read_only", + list_make1(item->arg)); + } + PGXCNodeSetBeginQuery(begin_string); + } +#endif } break; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 869a172..4d1c2a6 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -8320,4 +8320,44 @@ assign_application_name(const char *newval, bool doit, GucSource source) return newval; } +#ifdef PGXC +/* + * RewriteBeginQuery + * + * Rewrite transaction start query depending on the isolation level + * and read operation options. 
+ */ +char * +RewriteBeginQuery(char *query_string, const char *name, List *args) +{ + char *value = GetConfigOptionByName(name, NULL); + + if (!query_string) + { + query_string = (char *)palloc(18); + sprintf(query_string, "START TRANSACTION"); + } + + if (strcmp(name, "transaction_isolation") == 0) + { + query_string = (char *)repalloc(query_string, strlen(query_string) + strlen(value) + 18); + sprintf(query_string, "%s ISOLATION LEVEL %s", query_string, value); + } + else if (strcmp(name, "transaction_read_only") == 0) + { + char buffer[512]; + if (strcmp(value, "on") == 0) + sprintf(buffer, "READ ONLY"); + else + sprintf(buffer, "READ WRITE"); + + query_string = (char *)repalloc(query_string, strlen(query_string) + strlen(buffer) + 2); + sprintf(query_string, "%s %s", query_string, buffer); + } + + pfree(value); + return query_string; +} +#endif + #include "guc-file.c" diff --git a/src/include/pgxc/execRemote.h b/src/include/pgxc/execRemote.h index 405325b..fb9232f 100644 --- a/src/include/pgxc/execRemote.h +++ b/src/include/pgxc/execRemote.h @@ -123,6 +123,7 @@ typedef struct RemoteQueryState /* Multinode Executor */ extern void PGXCNodeBegin(void); +extern void PGXCNodeSetBeginQuery(char *query_string); extern void PGXCNodeCommit(bool bReleaseHandles); extern int PGXCNodeRollback(void); extern bool PGXCNodePrepare(char *gid); diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 9eb37b8..6192d37 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -277,6 +277,10 @@ extern void SetPGVariable(const char *name, List *args, bool is_local); extern void GetPGVariable(const char *name, DestReceiver *dest); extern TupleDesc GetPGVariableResultDesc(const char *name); +#ifdef PGXC +extern char *RewriteBeginQuery(char *query_string, const char *name, List *args); +#endif + extern void ExecSetVariableStmt(VariableSetStmt *stmt); extern char *ExtractSetVariableArgs(VariableSetStmt *stmt); 
----------------------------------------------------------------------- Summary of changes: src/backend/pgxc/pool/execRemote.c | 59 ++++++++++++++++++++++++++++++++++- src/backend/tcop/utility.c | 32 +++++++++++++++++-- src/backend/utils/misc/guc.c | 40 ++++++++++++++++++++++++ src/include/pgxc/execRemote.h | 1 + src/include/utils/guc.h | 4 ++ 5 files changed, 130 insertions(+), 6 deletions(-) hooks/post-receive -- Postgres-XC |
From: Ashutosh B. <ash...@us...> - 2011-06-15 04:29:19
|
Project "Postgres-XC". The branch, master has been updated via bcfa7b115f2a76e0146016dee36995df6eab89d0 (commit) from ca34131f203bfc21c57b928b0fc3fd6ccef0195c (commit) - Log ----------------------------------------------------------------- commit bcfa7b115f2a76e0146016dee36995df6eab89d0 Author: Ashutosh Bapat <ash...@en...> Date: Wed Jun 15 09:46:47 2011 +0530 Fix for bug 3286054. The aggregates like count() can return integral value 0, which is equivalent to (Datum)NULL. We should not return NULL tuple when we encounter integral value 0 as aggregation result in ExecRemoteQuery(). For aggregate count(), the initial collection value should be 0 just like the initial transition value. diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 0a2e6de..ee02109 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -3764,22 +3764,8 @@ handle_results: */ if (node->simple_aggregates) { - int i, natts; - finish_simple_aggregates(node, resultslot); - /* - * PGXCTODO :In fact exec_simple_aggregates & finish_simple_aggregates - * should not be resulting in a TupleTableSlot with NULL pointer in - * per attribute value, but for now to fix the crash this check would do - */ - natts = resultslot->tts_tupleDescriptor->natts; - for (i = 0; i < natts; ++i) - { - if (resultslot->tts_values[i] == (Datum) NULL) - return NULL; - } - if (!TupIsNull(resultslot)) have_tuple = true; } diff --git a/src/include/catalog/pg_aggregate.h b/src/include/catalog/pg_aggregate.h index 5c98863..d53a632 100644 --- a/src/include/catalog/pg_aggregate.h +++ b/src/include/catalog/pg_aggregate.h @@ -241,8 +241,8 @@ DATA(insert ( 3527 enum_smaller enum_smaller - 3518 3500 3500 _null_ _null_ ) /* count */ /* Final function is data type conversion function numeric_int8 is refernced by OID because of ambiguous defininition in pg_proc */ #ifdef PGXC -DATA(insert ( 2147 int8inc_any int8_sum_to_int8 - 0 20 20 "0" _null_ )); -DATA(insert ( 2803 
int8inc int8_sum_to_int8 - 0 20 20 "0" _null_ )); +DATA(insert ( 2147 int8inc_any int8_sum_to_int8 - 0 20 20 "0" "0" )); +DATA(insert ( 2803 int8inc int8_sum_to_int8 - 0 20 20 "0" "0" )); #endif #ifdef PGXC //DATA(insert ( 2147 int8inc_any - 0 20 "0" )); diff --git a/src/test/regress/expected/tsearch_1.out b/src/test/regress/expected/tsearch_1.out index 4d1f1b1..94238cb 100644 --- a/src/test/regress/expected/tsearch_1.out +++ b/src/test/regress/expected/tsearch_1.out @@ -1045,26 +1045,30 @@ DETAIL: The feature is not currently supported SELECT count(*) FROM test_tsvector WHERE a @@ to_tsquery('345&qwerty'); count ------- -(0 rows) + 0 +(1 row) INSERT INTO test_tsvector (t) VALUES ('345 qwerty'); SELECT count(*) FROM test_tsvector WHERE a @@ to_tsquery('345&qwerty'); count ------- -(0 rows) + 0 +(1 row) UPDATE test_tsvector SET t = null WHERE t = '345 qwerty'; ERROR: Partition column can't be updated in current version SELECT count(*) FROM test_tsvector WHERE a @@ to_tsquery('345&qwerty'); count ------- -(0 rows) + 0 +(1 row) INSERT INTO test_tsvector (t) VALUES ('345 qwerty'); SELECT count(*) FROM test_tsvector WHERE a @@ to_tsquery('345&qwerty'); count ------- -(0 rows) + 0 +(1 row) -- test finding items in GIN's pending list create table pendtest (ts tsvector); ----------------------------------------------------------------------- Summary of changes: src/backend/pgxc/pool/execRemote.c | 14 -------------- src/include/catalog/pg_aggregate.h | 4 ++-- src/test/regress/expected/tsearch_1.out | 12 ++++++++---- 3 files changed, 10 insertions(+), 20 deletions(-) hooks/post-receive -- Postgres-XC |
From: Ashutosh B. <ash...@us...> - 2011-06-14 09:44:16
|
Project "Postgres-XC". The branch, master has been updated via ca34131f203bfc21c57b928b0fc3fd6ccef0195c (commit) from 1e3498c1a95c36b4bed96287d4209c0ee049db9d (commit) - Log ----------------------------------------------------------------- commit ca34131f203bfc21c57b928b0fc3fd6ccef0195c Author: Ashutosh Bapat <ash...@en...> Date: Tue Jun 14 15:07:48 2011 +0530 Push aggregations and GROUP BY clause to the datanode when query has aggregates and sorting is used for grouping. The function create_remoteagg_plan() and create_remotegroup_plan() are combined into a single function create_remotegrouping_plan() which now takes care of all the optimizations related to grouping plans in XC. This commit is based upon the last two commits related to the GROUP BY optimizations. The optimizations work under the same restriction as the previous two commits. More tests are added to xc_groupby test, to cover all combinations of {replicated, distributed tables} X {grouping by sorting, hashing}. diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 97016f9..a666914 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -41,6 +41,7 @@ #include "catalog/pg_proc.h" #include "catalog/pg_type.h" #include "executor/executor.h" +#include "rewrite/rewriteManip.h" #endif #include "utils/lsyscache.h" @@ -89,6 +90,8 @@ static Alias *generate_remote_rte_alias(RangeTblEntry *rte, int varno, char *aliasname, int reduce_level); static void pgxc_locate_grouping_columns(PlannerInfo *root, List *tlist, AttrNumber *grpColIdx); +static List *pgxc_process_grouping_targetlist(PlannerInfo *root, + List **local_tlist); #endif static NestLoop *create_nestloop_plan(PlannerInfo *root, NestPath *best_path, Plan *outer_plan, Plan *inner_plan); @@ -5056,7 +5059,7 @@ create_remotedelete_plan(PlannerInfo *root, Plan *topplan) } /* - * create_remoteagg_plan + * create_remotegrouping_plan * Check if the grouping and aggregates 
can be pushed down to the * datanodes. * Right now we can push with following restrictions @@ -5064,6 +5067,7 @@ create_remotedelete_plan(PlannerInfo *root, Plan *topplan) * expressions in group by clauses * 2. No distinct or order by clauses * 3. No windowing clause + * 4. No having clause * * Inputs * root - planerInfo root for this query @@ -5075,306 +5079,14 @@ create_remotedelete_plan(PlannerInfo *root, Plan *topplan) * node in case there are no local clauses. */ Plan * -create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) +create_remotegrouping_plan(PlannerInfo *root, Plan *local_plan) { Query *query = root->parse; - RemoteQuery *agg_left; - Plan *temp_plan = local_plan->lefttree; - List *agg_tlist = local_plan->targetlist; - StringInfo remote_sql_stmt = makeStringInfo(); - StringInfo remote_targetlist = makeStringInfo(); - StringInfo remote_fromlist = makeStringInfo(); - StringInfo groupby_clause = makeStringInfo(); - StringInfo in_alias = makeStringInfo(); - ListCell *temp; - ListCell *temp_remote; - RemoteQuery *agg_remote; - Plan *agg_remote_plan; - RangeTblEntry *dummy_rte; - Index dummy_rtindex; - List *base_tlist; - Agg *agg_plan = NULL; - List *temp_vars; /* temporarily hold the VARs */ - List *temp_vartlist; /* temporarity hold tlist of VARs */ - Relids in_relids; /* the list of Relids referenced by - * the Agg plan - */ - - /* For now only Agg plans */ - Assert(IsA(local_plan, Agg)); - agg_plan = (Agg *)local_plan; - - /* - * We don't push aggregation and grouping to datanodes, in case there are - * windowing aggregates, distinct, having clause or sort clauses. - */ - if (query->hasWindowFuncs || - query->distinctClause || - query->sortClause || - query->havingQual) - return local_plan; - - /* - * Optimize if only the tree underneath is reduced to RemoteQuery, any other - * node there indicates that the scans can not be completely pushed to the - * remote data nodes. - * RemoteQuery is hidden underneath Material plan, take it out. 
- */ - if (IsA(temp_plan, Material)) - temp_plan = temp_plan->lefttree; - if (!IsA(temp_plan, RemoteQuery)) - return local_plan; - else - agg_left = (RemoteQuery *)temp_plan; - - /* - * Walk through the target list and find out whether we can push the - * aggregates and grouping to datanodes. We can do so if the target list - * contains plain aggregates (without any expression involving those) and - * expressions in group by clauses only (last one to make the query legit. - */ - foreach(temp, agg_tlist) - { - TargetEntry *tle = lfirst(temp); - Node *expr = (Node *)tle->expr; - - /* - * PGXCTODO: once we allow sort clauses to be pushed to data nodes, - * along with group by clause, this condition will need to be changed. - */ - if (!(IsA(expr, Aggref) || tle->ressortgroupref > 0)) - return local_plan; - } - - /* - * Cleared of all the charges, now take following steps - * 1. Create a remote query node reflecting the query to be pushed to the - * datanode - * 2. Modify the Agg node passed in so that it reflects the aggregation - * (collection) to be done at the coordinator based on the results sent by - * the datanodes. - */ - appendStringInfo(in_alias, "%s_%d", "group", root->rs_alias_index); - - /* Find all the relations referenced by targetlist of Agg node */ - temp_vars = pull_var_clause((Node *)agg_tlist, PVC_REJECT_PLACEHOLDERS); - findReferencedVars(temp_vars, (Plan *)agg_left, &temp_vartlist, &in_relids); - - /* - * Build partial RemoteQuery node to be used for creating the Select clause - * to be sent to the remote node. Rest of the node will be built later - */ - agg_remote = makeNode(RemoteQuery); - - /* - * Save information about the plan we are reducing. - * We may need this information later if more entries are added to it - * as part of the remote expression optimization. 
- */ - agg_remote->remotejoin = false; - agg_remote->inner_alias = pstrdup(in_alias->data); - agg_remote->inner_reduce_level = agg_left->reduce_level; - agg_remote->inner_relids = in_relids; - agg_remote->inner_statement = pstrdup(agg_left->sql_statement); - agg_remote->exec_nodes = agg_left->exec_nodes; - - /* Don't forget to increment the index for the next time around! */ - agg_remote->reduce_level = root->rs_alias_index++; - - /* Generate the select clause of the remote query */ - appendStringInfoString(remote_targetlist, "SELECT"); - foreach (temp, agg_tlist) - { - TargetEntry *tle = lfirst(temp); - Node *expr = (Node *)tle->expr; - - create_remote_expr(root, local_plan, remote_targetlist, expr, agg_remote); - - /* If this is not last target entry, add a comma with space */ - if (lnext(temp)) - appendStringInfoString(remote_targetlist, ","); - } - - /* Generate the from clause of the remote query */ - appendStringInfo(remote_fromlist, "FROM (%s) %s", - agg_remote->inner_statement, agg_remote->inner_alias); - - /* - * Generate group by clause for the remote query and recompute the group by - * columE.n locations - */ - if (query->groupClause) - { - int cntCols; - Assert(IsA(local_plan, Agg)); - - /* - * Recompute the column ids of the grouping columns, - * the group column indexes computed earlier point in the - * targetlists of the scan plans under this node. But now the grouping - * column indexes will be pointing in the targetlist of the new - * RemoteQuery, hence those need to be recomputed. 
- */ - pgxc_locate_grouping_columns(root, agg_tlist, agg_plan->grpColIdx); - - appendStringInfoString(groupby_clause, "GROUP BY "); - for (cntCols = 0; cntCols < agg_plan->numCols; cntCols++) - { - appendStringInfo(groupby_clause, "%d", - agg_plan->grpColIdx[cntCols]); - if (cntCols < agg_plan->numCols - 1) - appendStringInfoString(groupby_clause, ", "); - } - } - - /* Generate the remote sql statement from the pieces */ - appendStringInfo(remote_sql_stmt, "%s %s %s", remote_targetlist->data, - remote_fromlist->data, groupby_clause->data); - - /* - * Set the base_tlist for the RemoteQuery node being created, it's used to - * create the tuple descriptor for the result from RemoteQuery and rewrite - * the Aggregates targetlist accept the results of the RemoteQuery. - */ - base_tlist = add_to_flat_tlist(NIL, get_tlist_exprs(agg_tlist, true)); - - /* - * We need to change the return types of the aggregates. Datanodes send the - * aggregation results in the form of transition results. - */ - foreach (temp, base_tlist) - { - TargetEntry *tle = lfirst(temp); - Node *expr = (Node *)tle->expr; - Aggref *agg; - - if (IsA(expr, Aggref)) - { - agg = (Aggref *)expr; - agg->aggtype = agg->aggtrantype; - } - } - - /* - * Create a dummy RTE for the remote query being created. Append the dummy - * range table entry to the range table. Note that this modifies the master - * copy the caller passed us, otherwise e.g EXPLAIN VERBOSE will fail to - * find the rte the Vars built below refer to. Also create the tuple - * descriptor for the result of this query from the base_tlist (targetlist - * we used to generate the remote node query). - */ - dummy_rte = makeNode(RangeTblEntry); - dummy_rte->reltupdesc = ExecTypeFromTL(base_tlist, false); - dummy_rte->rtekind = RTE_RELATION; - - /* Use a dummy relname... 
*/ - dummy_rte->relname = "__FOREIGN_QUERY__"; - dummy_rte->eref = makeAlias("__FOREIGN_QUERY__", NIL); - - /* Rest will be zeroed out in makeNode() */ - root->parse->rtable = lappend(root->parse->rtable, dummy_rte); - dummy_rtindex = list_length(root->parse->rtable); - - /* - * Change the aggref nodes in the local Agg plan to accept the transition - * results from the remote query output. Do this after we have created base - * list, otherwise we might introduce these changes in the base list. - * Do this after the RTE for the remote query is added to the root. - */ - forboth (temp, agg_tlist, temp_remote, base_tlist) - { - TargetEntry *tle = lfirst(temp); - Node *expr = (Node *)tle->expr; - Aggref *agg; - TargetEntry *tle_remote = lfirst(temp_remote); - Node *expr_remote = (Node *)tle_remote->expr; - - if (IsA(expr, Aggref)) - { - Assert(IsA(expr_remote, Aggref)); - - /* - * Replace the args of the local Aggref with Aggref node to be - * included in RemoteQuery node, so that set_plan_refs can convert - * the args into VAR pointing to the appropriate result in the tuple - * coming from RemoteQuery node. - * PGXCTODO: should we push this change in targetlists of plans - * above? - */ - agg = (Aggref *)expr; - agg->args = list_make1(makeTargetEntry(copyObject(expr_remote), 1, NULL, false)); - } - } - - /* Build rest of the RemoteQuery node and the plan there */ - agg_remote_plan = &agg_remote->scan.plan; - - /* The join targetlist becomes this node's tlist */ - agg_remote_plan->targetlist = base_tlist; - agg_remote_plan->lefttree = NULL; - agg_remote_plan->righttree = NULL; - agg_remote->scan.scanrelid = dummy_rtindex; - agg_remote->sql_statement = remote_sql_stmt->data; - - /* set_plan_refs needs this later */ - agg_remote->base_tlist = base_tlist; - agg_remote->relname = "__FOREIGN_QUERY__"; - agg_remote->partitioned_replicated = agg_left->partitioned_replicated; - - /* - * Only quals that can be pushed to the remote side the ones in the having - * clause. 
Till we work out how to handle having quals in XC, we don't have - * any quals here. - * PGXCTODO: the RemoteQuery node that was earlier the lefttree of Agg - * node, may have local quals. In such case, we have to aggregate and group - * at coordinator and can not push the grouping clause to the datanodes. Is - * there a case in XC, where we can have local quals? - * We actually need not worry about costs since this is the final plan. - */ - agg_remote_plan->startup_cost = agg_left->scan.plan.startup_cost; - agg_remote_plan->total_cost = agg_left->scan.plan.total_cost; - agg_remote_plan->plan_rows = agg_left->scan.plan.plan_rows; - agg_remote_plan->plan_width = agg_left->scan.plan.plan_width; - - /* - * Modify the passed in Agg plan according to the remote query we built. - * Materialization is always needed for RemoteQuery in case we need to restart - * the scan. - */ - agg_plan->plan.lefttree = (Plan *) make_material(agg_remote_plan); - - /* Indicate that we should apply collection function directly */ - agg_plan->skip_trans = true; - - return (Plan *)agg_plan; -} - -/* - * create_remotegroup_plan - * Given a Group plan, try to push as much of the query to the datanodes and - * build a Group plan to combiner the results across the datanodes. The Sort - * node under the Group plan is pushed down to RemoteQuery plan, since the - * combiner knows how to merge the results across datanodes in sorted manner. - * Hence there is no separate Sort node. - * - * This optimization is applied under following conditions - * 1. The scan plans under the Group->Sort node is RemoteQuery - * 2. There is not separate Sort, distinct, having clause in the query. - * - * PGXCTODO: we should lift up as many of these restrictions as possible or give - * reasons why those restrictions are needed. 
- */ -Plan * -create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) -{ - Group *group_plan; - Query *query = root->parse; Sort *sort_plan; RemoteQuery *remote_scan; /* remote query in the passed in plan */ RemoteQuery *remote_group; /* remote query after optimization */ Plan *remote_group_plan; /* plan portion of remote_group */ Plan *temp_plan; - List *local_tlist; /* target list of the local plan */ List *temp_vars; /* temporarily hold the VARs */ List *temp_vartlist; /* temporarity hold tlist of VARs */ ListCell *temp; @@ -5384,59 +5096,110 @@ create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) StringInfo orderby_clause = makeStringInfo(); /* remote query ORDER BY */ StringInfo remote_fromlist = makeStringInfo(); /* remote query FROM */ StringInfo in_alias = makeStringInfo(); - Relids in_relids; + Relids in_relids; /* the list of Relids referenced by lefttree */ Index dummy_rtindex; List *base_tlist; RangeTblEntry *dummy_rte; - int cntCols; + int numGroupCols; + AttrNumber *grpColIdx; - if (query->havingQual || + /* + * We don't push aggregation and grouping to datanodes, in case there are + * windowing aggregates, distinct, having clause or sort clauses. + */ + if (query->hasWindowFuncs || query->distinctClause || query->sortClause || - query->hasWindowFuncs) + query->havingQual) return local_plan; - /* For now only for Group plans are treated */ - Assert(IsA(local_plan, Group)); - group_plan = (Group *)local_plan; - remote_scan = NULL; - temp_plan = local_plan->lefttree; + /* + * PGXCTODO: we don't support the parameterised queries yet. 
So, for the + * time being we don't apply the optimizations for parameterised queries + */ + if (root->glob->boundParams) + return local_plan; + /* for now only Agg/Group plans */ + if (local_plan && IsA(local_plan, Agg)) + { + numGroupCols = ((Agg *)local_plan)->numCols; + grpColIdx = ((Agg *)local_plan)->grpColIdx; + } + else if (local_plan && IsA(local_plan, Group)) + { + numGroupCols = ((Group *)local_plan)->numCols; + grpColIdx = ((Group *)local_plan)->grpColIdx; + } + else + return local_plan; /* - * We expect plan tree as Group->Sort->{Result}?->{Material}?->RemoteQuery, + * We expect plan tree as Group/Agg->Sort->Result->Material->RemoteQuery, + * Result, Material nodes are optional. Sort is compulsory for Group but not + * for Agg. * anything else is not handled right now. */ - if (IsA(temp_plan, Sort)) + temp_plan = local_plan->lefttree; + remote_scan = NULL; + sort_plan = NULL; + if (temp_plan && IsA(temp_plan, Sort)) { sort_plan = (Sort *)temp_plan; temp_plan = temp_plan->lefttree; } - if (IsA(temp_plan, Result)) + if (temp_plan && IsA(temp_plan, Result)) temp_plan = temp_plan->lefttree; - if (IsA(temp_plan, Material)) + if (temp_plan && IsA(temp_plan, Material)) temp_plan = temp_plan->lefttree; - if (IsA(temp_plan, RemoteQuery)) + if (temp_plan && IsA(temp_plan, RemoteQuery)) remote_scan = (RemoteQuery *)temp_plan; - if (!remote_scan || !sort_plan) + if (!remote_scan) + return local_plan; + /* + * for Group plan we expect Sort under the Group, which is always the case, + * the condition below is really for some possible non-existent case + */ + if (IsA(local_plan, Group) && !sort_plan) return local_plan; - Assert(IsA(remote_scan, RemoteQuery)); - Assert(IsA(sort_plan, Sort)); /* - * grouping_planner will add Sort node before Group node to sort the rows + * Grouping_planner may add Sort node to sort the rows * based on the columns in GROUP BY clause. Hence the columns in Sort and * those in Group node in should be same. 
The columns are usually in the * same order in both nodes, hence check the equality in order. If this - * condition fails, we can not handle this GROUP plan for now. + * condition fails, we can not handle this plan for now. */ - if (sort_plan->numCols != group_plan->numCols) - return local_plan; - for (cntCols = 0; cntCols < group_plan->numCols; cntCols++) + if (sort_plan) { - if (sort_plan->sortColIdx[cntCols] != group_plan->grpColIdx[cntCols]) + int cntCols; + if (sort_plan->numCols != numGroupCols) return local_plan; + for (cntCols = 0; cntCols < numGroupCols; cntCols++) + { + if (sort_plan->sortColIdx[cntCols] != grpColIdx[cntCols]) + return local_plan; + } + } + + /* find all the relations referenced by targetlist of Grouping node */ + temp_vars = pull_var_clause((Node *)local_plan->targetlist, + PVC_REJECT_PLACEHOLDERS); + findReferencedVars(temp_vars, (Plan *)remote_scan, &temp_vartlist, &in_relids); + + /* + * process the targetlist of the grouping plan, also construct the + * targetlist of the query to be shipped to the remote side + */ + base_tlist = pgxc_process_grouping_targetlist(root, &(local_plan->targetlist)); + if (!base_tlist) + { + /* + * for some reason we can not construct a targetlist shippable to the + * datanode. Resort to the plan created by grouping_planner() + */ + return local_plan; } /* @@ -5444,16 +5207,11 @@ create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) * datanode. * 1. Create a remote query node reflecting the query to be pushed to the * datanode. - * 2. Modify the Group node passed in, to accept the results sent by the - * datanodes and group them. + * 2. Modify the Grouping node passed in, to accept the results sent by the + * Datanodes, then group and aggregate them, if needed. 
*/ - local_tlist = local_plan->targetlist; appendStringInfo(in_alias, "%s_%d", "group", root->rs_alias_index); - /* Find all the relations referenced by targetlist of Group node */ - temp_vars = pull_var_clause((Node *)local_tlist, PVC_REJECT_PLACEHOLDERS); - findReferencedVars(temp_vars, (Plan *)remote_scan, &temp_vartlist, &in_relids); - /* * Build partial RemoteQuery node to be used for creating the Select clause * to be sent to the remote node. Rest of the node will be built later @@ -5471,13 +5229,12 @@ create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) remote_group->inner_relids = in_relids; remote_group->inner_statement = pstrdup(remote_scan->sql_statement); remote_group->exec_nodes = remote_scan->exec_nodes; - /* Don't forget to increment the index for the next time around! */ remote_group->reduce_level = root->rs_alias_index++; /* Generate the select clause of the remote query */ appendStringInfoString(remote_targetlist, "SELECT"); - foreach (temp, local_tlist) + foreach (temp, base_tlist) { TargetEntry *tle = lfirst(temp); Node *expr = (Node *)tle->expr; @@ -5503,32 +5260,47 @@ create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) */ if (query->groupClause) { - SimpleSort *remote_sort = makeNode(SimpleSort); - char *sep = ""; + int cntCols; + char *sep; /* - * Reuse the arrays allocated in sort_plan to create SimpleSort - * structure. sort_plan is useless henceforth. + * recompute the column ids of the grouping columns, + * the group column indexes computed earlier point in the + * targetlists of the scan plans under this node. 
But now the grouping + * column indexes will be pointing in the targetlist of the new + * RemoteQuery, hence those need to be recomputed */ - remote_sort->numCols = group_plan->numCols; - remote_sort->sortColIdx = sort_plan->sortColIdx; - remote_sort->sortOperators = sort_plan->sortOperators; - remote_sort->nullsFirst = sort_plan->nullsFirst; - - pgxc_locate_grouping_columns(root, local_tlist, group_plan->grpColIdx); + pgxc_locate_grouping_columns(root, base_tlist, grpColIdx); appendStringInfoString(groupby_clause, "GROUP BY "); - appendStringInfoString(orderby_clause, "ORDER BY "); - for (cntCols = 0; cntCols < group_plan->numCols; cntCols++) + sep = ""; + for (cntCols = 0; cntCols < numGroupCols; cntCols++) { - appendStringInfo(groupby_clause, "%s%d", sep, - group_plan->grpColIdx[cntCols]); - remote_sort->sortColIdx[cntCols] = group_plan->grpColIdx[cntCols]; - appendStringInfo(orderby_clause, "%s%d", sep, - remote_sort->sortColIdx[cntCols]); + appendStringInfo(groupby_clause, "%s%d", sep, grpColIdx[cntCols]); sep = ", "; } - remote_group->sort = remote_sort; + if (sort_plan) + { + SimpleSort *remote_sort = makeNode(SimpleSort); + /* + * reuse the arrays allocated in sort_plan to create SimpleSort + * structure. sort_plan is useless henceforth. 
+ */ + remote_sort->numCols = sort_plan->numCols; + remote_sort->sortColIdx = sort_plan->sortColIdx; + remote_sort->sortOperators = sort_plan->sortOperators; + remote_sort->nullsFirst = sort_plan->nullsFirst; + appendStringInfoString(orderby_clause, "ORDER BY "); + sep = ""; + for (cntCols = 0; cntCols < remote_sort->numCols; cntCols++) + { + remote_sort->sortColIdx[cntCols] = grpColIdx[cntCols]; + appendStringInfo(orderby_clause, "%s%d", sep, + remote_sort->sortColIdx[cntCols]); + sep = ", "; + } + remote_group->sort = remote_sort; + } } /* Generate the remote sql statement from the pieces */ @@ -5537,20 +5309,13 @@ create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) orderby_clause->data); /* - * Set the base_tlist for the RemoteQuery node being created, it's used to - * create the tuple descriptor for the result from RemoteQuery and rewrite - * the Aggregates targetlist accept the results of the RemoteQuery. - */ - base_tlist = add_to_flat_tlist(NIL, get_tlist_exprs(local_tlist, true)); - - /* * Create a dummy RTE for the remote query being created. Append the dummy * range table entry to the range table. Note that this modifies the master * copy the caller passed us, otherwise e.g EXPLAIN VERBOSE will fail to - * find the rte the Vars built below refer to. + * find the rte the Vars built below refer to. Also create the tuple + * descriptor for the result of this query from the base_tlist (targetlist + * we used to generate the remote node query). 
*/ - - /* Cook up the reltupdesc using this base_tlist */ dummy_rte = makeNode(RangeTblEntry); dummy_rte->reltupdesc = ExecTypeFromTL(base_tlist, false); dummy_rte->rtekind = RTE_RELATION; @@ -5577,6 +5342,7 @@ create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) remote_group->base_tlist = base_tlist; remote_group->relname = "__FOREIGN_QUERY__"; remote_group->partitioned_replicated = remote_scan->partitioned_replicated; + remote_group->read_only = query->commandType == CMD_SELECT; /* * Only quals that can be pushed to the remote side are the ones in the having @@ -5587,21 +5353,24 @@ create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) * node, may have local quals. In such case, we have to aggregate and group * at coordinator and can not push the grouping clause to the datanodes. Is * there a case in XC, where we can have local quals? + * we actually need not worry about costs since this is the final plan */ - - /* We actually do not need to worry about costs since this is the final plan */ remote_group_plan->startup_cost = remote_scan->scan.plan.startup_cost; remote_group_plan->total_cost = remote_scan->scan.plan.total_cost; remote_group_plan->plan_rows = remote_scan->scan.plan.plan_rows; remote_group_plan->plan_width = remote_scan->scan.plan.plan_width; /* - * Modify the passed in Group plan according to the remote query we built. - * Materialization is always need for RemoteQuery in case we need to restart + * Modify the passed in grouping plan according to the remote query we built + * Materialization is always needed for RemoteQuery in case we need to restart * the scan. 
*/ - group_plan->plan.lefttree = (Plan *) make_material(remote_group_plan); - return (Plan *)group_plan; + local_plan->lefttree = (Plan *) make_material(remote_group_plan); + /* indicate that we should apply collection function directly */ + if (IsA(local_plan, Agg)) + ((Agg *)local_plan)->skip_trans = true; + + return local_plan; } /* @@ -5638,4 +5407,127 @@ pgxc_locate_grouping_columns(PlannerInfo *root, List *tlist, groupColIdx[keyno++] = te->resno; } } + +/* + * pgxc_process_grouping_targetlist + * The function scans the targetlist to check if the we can push anything + * from the targetlist to the datanode. Following rules govern the choice + * 1. Either all of the aggregates are pushed to the datanode or none is pushed + * 2. If there are no aggregates, the targetlist is good to be shipped as is + * 3. If aggregates are involved in expressions, we push the aggregates to the + * datanodes but not the involving expressions. + * + * The function constructs the targetlist for the query to be pushed to the + * datanode. It modifies the local targetlist to point to the expressions in + * remote targetlist wherever necessary (e.g. aggregates) + * + * PGXCTODO: we should be careful while pushing the function expressions, it's + * better to push functions like strlen() which can be evaluated at the + * datanode, but we should avoid pushing functions which can only be evaluated + * at coordinator. + */ +static List * +pgxc_process_grouping_targetlist(PlannerInfo *root, List **local_tlist) +{ + bool shippable_remote_tlist = true; + List *remote_tlist = NIL; + int next_resno = 1; /* resno start from 1 */ + List *orig_local_tlist = NIL;/* Copy original local_tlist, in case it changes */ + ListCell *temp; + Query *query = root->parse; + + /* + * Walk through the target list and find out whether we can push the + * aggregates and grouping to datanodes. 
We can do so if the target list + * contains plain aggregates (without any expression involving those) and + * expressions in group by clauses only (last one to make the query legit. + */ + foreach(temp, *local_tlist) + { + TargetEntry *local_tle = lfirst(temp); + TargetEntry *remote_tle; + Node *expr = (Node *)local_tle->expr; + + if (IsA(expr, Aggref)) + { + Aggref *aggref = (Aggref *)expr; + if (aggref->aggorder || aggref->aggdistinct || aggref->agglevelsup) + { + shippable_remote_tlist = false; + break; + } + } + else if (query->hasAggs && checkExprHasAggs(expr)) + { + /* + * Targetlist expressions which have aggregates embedded inside + * are not handled right now. + * PGXCTODO: We should be able to extract those aggregates out. + * Add those to remote targetlist and modify the local + * targetlist accordingly. Thus we get those aggregates grouped + * and "transitioned" at the datanode. + */ + shippable_remote_tlist = false; + break; + } + + remote_tle = makeTargetEntry(copyObject(expr), + next_resno++, + NULL, + false); + /* Copy GROUP BY/SORT BY reference for the locating group by columns */ + remote_tle->ressortgroupref = local_tle->ressortgroupref; + remote_tlist = lappend(remote_tlist, remote_tle); + + /* + * Replace the args of the local Aggref with Aggref node to be + * included in RemoteQuery node, so that set_plan_refs can convert + * the args into VAR pointing to the appropriate result in the tuple + * coming from RemoteQuery node + * PGXCTODO: should we push this change in targetlists of plans + * above? 
+ */ + if (IsA(expr, Aggref)) + { + Aggref *local_aggref = (Aggref *)expr; + Aggref *remote_aggref = (Aggref *)remote_tle->expr; + Assert(IsA(remote_tle->expr, Aggref)); + remote_aggref->aggtype = remote_aggref->aggtrantype; + /* + * We are about to change the local_tlist, check if we have already + * copied original local_tlist, if not take a copy + */ + if (!orig_local_tlist) + orig_local_tlist = copyObject(*local_tlist); + /* Is copyObject() needed here? probably yes */ + local_aggref->args = list_make1(makeTargetEntry(copyObject(remote_tle->expr), + 1, NULL, + false)); + } + } + + if (!shippable_remote_tlist) + { + /* + * If local_tlist has changed but we didn't find anything shippable to + * datanode, we need to restore the local_tlist to original state, + */ + if (orig_local_tlist) + *local_tlist = orig_local_tlist; + if (remote_tlist) + list_free_deep(remote_tlist); + remote_tlist = NIL; + } + else if (orig_local_tlist) + { + /* + * If we have changed the targetlist passed, we need to pass back the + * changed targetlist. Free the copy that has been created. + */ + list_free_deep(orig_local_tlist); + } + + return remote_tlist; +} + #endif diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index e64e938..5250905 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1334,16 +1334,6 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) numGroups, agg_counts.numAggs, result_plan); -#ifdef PGXC - /* - * Grouping will certainly not increase the number of rows - * coordinator fetches from datanode, in fact it's expected to - * reduce the number drastically. Hence, try pushing GROUP BY - * clauses and aggregates to the datanode, thus saving bandwidth. 
- */ - if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) - result_plan = create_remoteagg_plan(root, result_plan); -#endif /* PGXC */ /* Hashed aggregation produces randomly-ordered results */ current_pathkeys = NIL; } @@ -1415,16 +1405,6 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) extract_grouping_ops(parse->groupClause), dNumGroups, result_plan); -#ifdef PGXC - /* - * Grouping will certainly not increase the number of rows - * coordinator fetches from datanode, in fact it's expected to - * reduce the number drastically. Hence, try pushing GROUP BY - * clauses and aggregates to the datanode, thus saving bandwidth. - */ - if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) - result_plan = create_remotegroup_plan(root, result_plan); -#endif /* PGXC */ } else if (root->hasHavingQual) { @@ -1445,6 +1425,17 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) parse->havingQual, NULL); } +#ifdef PGXC + /* + * Grouping will certainly not increase the number of rows + * coordinator fetches from datanode, in fact it's expected to + * reduce the number drastically. Hence, try pushing GROUP BY + * clauses and aggregates to the datanode, thus saving bandwidth. 
+ */ + if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + result_plan = create_remotegrouping_plan(root, result_plan); +#endif /* PGXC */ + } /* end of non-minmax-aggregate case */ /* diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 8005d47..e1662ea 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -137,8 +137,7 @@ extern Var *search_tlist_for_var(Var *var, List *jtlist); extern Plan *create_remoteinsert_plan(PlannerInfo *root, Plan *topplan); extern Plan *create_remoteupdate_plan(PlannerInfo *root, Plan *topplan); extern Plan *create_remotedelete_plan(PlannerInfo *root, Plan *topplan); -extern Plan *create_remotegroup_plan(PlannerInfo *root, Plan *local_plan); -extern Plan *create_remoteagg_plan(PlannerInfo *root, Plan *agg_plan); +extern Plan *create_remotegrouping_plan(PlannerInfo *root, Plan *local_plan); #endif #endif /* PLANMAIN_H */ diff --git a/src/test/regress/expected/xc_groupby.out b/src/test/regress/expected/xc_groupby.out index e403e37..31e9a8c 100644 --- a/src/test/regress/expected/xc_groupby.out +++ b/src/test/regress/expected/xc_groupby.out @@ -1,3 +1,10 @@ +-- this file contains tests for GROUP BY with combinations of following +-- 1. enable_hashagg = on/off (to force the grouping by sorting) +-- 2. distributed or replicated tables across the datanodes +-- If a testcase is added to any of the combinations, please check if it's +-- applicable in other combinations as well. 
+-- Combination 1: enable_hashagg on and distributed tables +set enable_hashagg to on; -- create required tables and fill them with data create table tab1 (val int, val2 int); create table tab2 (val int, val2 int); @@ -11,6 +18,17 @@ select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from tab1 g 3 | 11 | 3.6666666666666667 | 3.66666666666667 | 3 (3 rows) +explain verbose select count(*), sum(val), avg(val), sum(val)::float8/count(*), val2 from tab1 group by val2; + QUERY PLAN +------------------------------------------------------------------------------------------------------------- + HashAggregate (cost=1.03..1.06 rows=1 width=8) + Output: count(*), sum(val), avg(val), ((sum(val))::double precision / (count(*))::double precision), val2 + -> Materialize (cost=0.00..1.01 rows=1 width=8) + Output: val, val2 + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=8) + Output: val, val2 +(6 rows) + -- joins and group by select count(*), sum(tab1.val * tab2.val), avg(tab1.val*tab2.val), sum(tab1.val*tab2.val)::float8/count(*), tab1.val2, tab2.val2 from tab1 full outer join tab2 on tab1.val2 = tab2.val2 group by tab1.val2, tab2.val2; count | sum | avg | ?column? 
| val2 | val2 @@ -34,14 +52,14 @@ explain verbose select count(*), sum(tab1.val * tab2.val), avg(tab1.val*tab2.val Sort Key: tab1.val2 -> Materialize (cost=0.00..1.01 rows=1 width=8) Output: tab1.val, tab1.val2 - -> Data Node Scan (Node Count [2]) "SELECT val, val2 FROM public.tab1 tab1" on tab1 (cost=0.00..1.01 rows=1000 width=8) + -> Data Node Scan (Node Count [2]) on tab1 (cost=0.00..1.01 rows=1000 width=8) Output: tab1.val, tab1.val2 -> Sort (cost=1.02..1.03 rows=1 width=8) Output: tab2.val, tab2.val2 Sort Key: tab2.val2 -> Materialize (cost=0.00..1.01 rows=1 width=8) Output: tab2.val, tab2.val2 - -> Data Node Scan (Node Count [2]) "SELECT val, val2 FROM public.tab2 tab2" on tab2 (cost=0.00..1.01 rows=1000 width=8) + -> Data Node Scan (Node Count [2]) on tab2 (cost=0.00..1.01 rows=1000 width=8) Output: tab2.val, tab2.val2 (19 rows) @@ -57,17 +75,16 @@ explain verbose select sum(y) from (select sum(val) y, val2%2 x from tab1 group QUERY PLAN ---------------------------------------------------------------------------------------- HashAggregate (cost=1.05..1.06 rows=1 width=12) - Output: sum((sum(tab1.val))), ((tab1.val2 % 2)) + Output: sum((pg_catalog.sum((sum(tab1.val))))), ((tab1.val2 % 2)) -> HashAggregate (cost=1.02..1.03 rows=1 width=8) - Output: sum(tab1.val), (tab1.val2 % 2), tab1.val2 - -> Materialize (cost=0.00..1.01 rows=1 width=8) - Output: tab1.val, tab1.val2 + Output: pg_catalog.sum((sum(tab1.val))), ((tab1.val2 % 2)), tab1.val2 + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: (sum(tab1.val)), ((tab1.val2 % 2)), tab1.val2 -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=8) - Output: tab1.val, tab1.val2 + Output: sum(tab1.val), (tab1.val2 % 2), tab1.val2 (8 rows) -- group by without aggregate -set enable_hashagg to off; select val2 from tab1 group by val2; val2 ------ @@ -79,7 +96,7 @@ select val2 from tab1 group by val2; explain verbose select val2 from tab1 group by val2; QUERY PLAN 
---------------------------------------------------------------------------------- - Group (cost=1.02..1.03 rows=1 width=4) + HashAggregate (cost=1.02..1.03 rows=1 width=4) Output: tab1.val2 -> Materialize (cost=0.00..0.00 rows=0 width=0) Output: tab1.val2 @@ -90,18 +107,18 @@ explain verbose select val2 from tab1 group by val2; select val + val2 from tab1 group by val + val2; ?column? ---------- - 2 - 3 4 7 + 3 8 9 + 2 (6 rows) explain verbose select val + val2 from tab1 group by val + val2; QUERY PLAN ---------------------------------------------------------------------------------- - Group (cost=1.03..1.04 rows=1 width=8) + HashAggregate (cost=1.02..1.03 rows=1 width=8) Output: ((tab1.val + tab1.val2)) -> Materialize (cost=0.00..0.00 rows=0 width=0) Output: ((tab1.val + tab1.val2)) @@ -112,12 +129,12 @@ explain verbose select val + val2 from tab1 group by val + val2; select val + val2, val, val2 from tab1 group by val, val2; ?column? | val | val2 ----------+-----+------ - 2 | 1 | 1 + 7 | 4 | 3 + 4 | 3 | 1 4 | 1 | 3 - 3 | 2 | 1 4 | 2 | 2 - 4 | 3 | 1 - 7 | 4 | 3 + 3 | 2 | 1 + 2 | 1 | 1 8 | 6 | 2 9 | 6 | 3 (8 rows) @@ -125,7 +142,7 @@ select val + val2, val, val2 from tab1 group by val, val2; explain verbose select val + val2, val, val2 from tab1 group by val, val2; QUERY PLAN ---------------------------------------------------------------------------------- - Group (cost=1.02..1.04 rows=1 width=8) + HashAggregate (cost=1.02..1.03 rows=1 width=8) Output: ((tab1.val + tab1.val2)), tab1.val, tab1.val2 -> Materialize (cost=0.00..0.00 rows=0 width=0) Output: ((tab1.val + tab1.val2)), tab1.val, tab1.val2 @@ -136,18 +153,18 @@ explain verbose select val + val2, val, val2 from tab1 group by val, val2; select tab1.val + tab2.val2, tab1.val, tab2.val2 from tab1, tab2 where tab1.val = tab2.val group by tab1.val, tab2.val2; ?column? 
| val | val2 ----------+-----+------ - 2 | 1 | 1 - 6 | 2 | 4 5 | 3 | 2 - 7 | 3 | 4 5 | 4 | 1 6 | 4 | 2 + 6 | 2 | 4 + 2 | 1 | 1 + 7 | 3 | 4 (6 rows) explain verbose select tab1.val + tab2.val2, tab1.val, tab2.val2 from tab1, tab2 where tab1.val = tab2.val group by tab1.val, tab2.val2; QUERY PLAN ------------------------------------------------------------------------------- - Group (cost=0.01..0.02 rows=1 width=0) + HashAggregate (cost=0.00..0.01 rows=1 width=0) Output: ((tab1.val + tab2.val2)), tab1.val, tab2.val2 -> Materialize (cost=0.00..0.00 rows=0 width=0) Output: ((tab1.val + tab2.val2)), tab1.val, tab2.val2 @@ -158,16 +175,16 @@ explain verbose select tab1.val + tab2.val2, tab1.val, tab2.val2 from tab1, tab2 select tab1.val + tab2.val2 from tab1, tab2 where tab1.val = tab2.val group by tab1.val + tab2.val2; ?column? ---------- - 2 - 5 6 + 2 7 + 5 (4 rows) explain verbose select tab1.val + tab2.val2 from tab1, tab2 where tab1.val = tab2.val group by tab1.val + tab2.val2; QUERY PLAN ------------------------------------------------------------------------------- - Group (cost=0.01..0.02 rows=1 width=0) + HashAggregate (cost=0.00..0.01 rows=1 width=0) Output: ((tab1.val + tab2.val2)) -> Materialize (cost=0.00..0.00 rows=0 width=0) Output: ((tab1.val + tab2.val2)) @@ -175,7 +192,6 @@ explain verbose select tab1.val + tab2.val2 from tab1, tab2 where tab1.val = tab Output: (tab1.val + tab2.val2) (6 rows) -reset enable_hashagg; -- group by with aggregates in expression select count(*) + sum(val) + avg(val), val2 from tab1 group by val2; ?column? 
| val2 @@ -206,20 +222,366 @@ select sum(val), avg(val), 2 * val2 from tab1 group by 2 * val2; (3 rows) explain verbose select sum(val), avg(val), 2 * val2 from tab1 group by 2 * val2; - QUERY PLAN ----------------------------------------------------------------------------------------- + QUERY PLAN +----------------------------------------------------------------------------------------------- HashAggregate (cost=1.02..1.04 rows=1 width=8) - Output: sum(val), avg(val), ((2 * val2)) - -> Result (cost=0.00..1.02 rows=1 width=8) - Output: val, val2, (2 * val2) - -> Materialize (cost=0.00..1.01 rows=1 width=8) - Output: val, val2 - -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=8) - Output: val, val2 -(8 rows) + Output: pg_catalog.sum((sum(tab1.val))), pg_catalog.avg((avg(tab1.val))), ((2 * tab1.val2)) + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: (sum(tab1.val)), (avg(tab1.val)), ((2 * tab1.val2)) + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=8) + Output: sum(tab1.val), avg(tab1.val), (2 * tab1.val2) +(6 rows) drop table tab1; drop table tab2; +-- some tests involving nulls, characters, float type etc. 
+create table def(a int, b varchar(25)); +insert into def VALUES (NULL, NULL); +insert into def VALUES (1, NULL); +insert into def VALUES (NULL, 'One'); +insert into def VALUES (2, 'Two'); +insert into def VALUES (2, 'Two'); +insert into def VALUES (3, 'Three'); +insert into def VALUES (4, 'Three'); +insert into def VALUES (5, 'Three'); +insert into def VALUES (6, 'Two'); +insert into def VALUES (7, NULL); +insert into def VALUES (8, 'Two'); +insert into def VALUES (9, 'Three'); +insert into def VALUES (10, 'Three'); +select a,count(a) from def group by a order by a; + a | count +----+------- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 + 6 | 1 + 7 | 1 + 8 | 1 + 9 | 1 + 10 | 1 + | 0 +(11 rows) + +explain verbose select a,count(a) from def group by a order by a; + QUERY PLAN +---------------------------------------------------------------------------------------------- + GroupAggregate (cost=1.02..1.05 rows=1 width=4) + Output: a, count(a) + -> Sort (cost=1.02..1.03 rows=1 width=4) + Output: a + Sort Key: def.a + -> Result (cost=0.00..1.01 rows=1 width=4) + Output: a + -> Materialize (cost=0.00..1.01 rows=1 width=4) + Output: a, b + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=4) + Output: a, b +(11 rows) + +select avg(a) from def group by a; + avg +------------------------ + + 6.0000000000000000 + 8.0000000000000000 + 5.0000000000000000 + 2.0000000000000000 + 1.00000000000000000000 + 9.0000000000000000 + 3.0000000000000000 + 10.0000000000000000 + 7.0000000000000000 + 4.0000000000000000 +(11 rows) + +select avg(a) from def group by a; + avg +------------------------ + + 6.0000000000000000 + 8.0000000000000000 + 5.0000000000000000 + 1.00000000000000000000 + 9.0000000000000000 + 2.0000000000000000 + 10.0000000000000000 + 7.0000000000000000 + 3.0000000000000000 + 4.0000000000000000 +(11 rows) + +explain verbose select avg(a) from def group by a; + QUERY PLAN +---------------------------------------------------------------------------------- + 
HashAggregate (cost=1.02..1.03 rows=1 width=4) + Output: pg_catalog.avg((avg(def.a))), def.a + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: (avg(def.a)), def.a + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=4) + Output: avg(def.a), def.a +(6 rows) + +select avg(a) from def group by b; + avg +-------------------- + 4.0000000000000000 + + 4.5000000000000000 + 6.2000000000000000 +(4 rows) + +explain verbose select avg(a) from def group by b; + QUERY PLAN +----------------------------------------------------------------------------------- + HashAggregate (cost=1.02..1.03 rows=1 width=33) + Output: pg_catalog.avg((avg(def.a))), def.b + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: (avg(def.a)), def.b + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=33) + Output: avg(def.a), def.b +(6 rows) + +select sum(a) from def group by b; + sum +----- + 8 + + 18 + 31 +(4 rows) + +explain verbose select sum(a) from def group by b; + QUERY PLAN +----------------------------------------------------------------------------------- + HashAggregate (cost=1.02..1.03 rows=1 width=33) + Output: pg_catalog.sum((sum(def.a))), def.b + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: (sum(def.a)), def.b + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=33) + Output: sum(def.a), def.b +(6 rows) + +select count(*) from def group by b; + count +------- + 3 + 1 + 4 + 5 +(4 rows) + +explain verbose select count(*) from def group by b; + QUERY PLAN +----------------------------------------------------------------------------------- + HashAggregate (cost=1.02..1.03 rows=1 width=29) + Output: pg_catalog.count(*), def.b + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: (count(*)), def.b + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=29) + Output: count(*), def.b +(6 rows) + +select count(*) from def where a is not null group by a; + count +------- + 1 + 1 + 1 + 1 + 1 + 1 + 2 
+ 1 + 1 + 1 +(10 rows) + +explain verbose select count(*) from def where a is not null group by a; + QUERY PLAN +---------------------------------------------------------------------------------- + HashAggregate (cost=1.02..1.03 rows=1 width=4) + Output: pg_catalog.count(*), def.a + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: (count(*)), def.a + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=4) + Output: count(*), def.a +(6 rows) + +select b from def group by b; + b +------- + + One + Two + Three +(4 rows) + +explain verbose select b from def group by b; + QUERY PLAN +----------------------------------------------------------------------------------- + HashAggregate (cost=1.02..1.03 rows=1 width=29) + Output: def.b + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: def.b + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=29) + Output: def.b +(6 rows) + +select b,count(b) from def group by b; + b | count +-------+------- + | 0 + One | 1 + Two | 4 + Three | 5 +(4 rows) + +explain verbose select b,count(b) from def group by b; + QUERY PLAN +----------------------------------------------------------------------------------- + HashAggregate (cost=1.02..1.03 rows=1 width=29) + Output: def.b, count((count(def.b))) + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: def.b, (count(def.b)) + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=29) + Output: def.b, count(def.b) +(6 rows) + +select count(*) from def where b is null group by b; + count +------- + 3 +(1 row) + +explain verbose select count(*) from def where b is null group by b; + QUERY PLAN +----------------------------------------------------------------------------------- + HashAggregate (cost=1.02..1.03 rows=1 width=29) + Output: pg_catalog.count(*), def.b + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: (count(*)), def.b + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=29) + Output: 
count(*), def.b +(6 rows) + +create table g(a int, b float, c numeric); +insert into g values(1,2.1,3.2); +insert into g values(1,2.1,3.2); +insert into g values(2,2.3,5.2); +select sum(a) from g group by a; + sum +----- + 2 + 2 +(2 rows) + +explain verbose select sum(a) from g group by a; + QUERY PLAN +---------------------------------------------------------------------------------- + HashAggregate (cost=1.02..1.03 rows=1 width=4) + Output: pg_catalog.sum((sum(g.a))), g.a + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: (sum(g.a)), g.a + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=4) + Output: sum(g.a), g.a +(6 rows) + +select sum(b) from g group by b; + sum +----- + 2.3 + 4.2 +(2 rows) + +explain verbose select sum(b) from g group by b; + QUERY PLAN +---------------------------------------------------------------------------------- + HashAggregate (cost=1.02..1.03 rows=1 width=8) + Output: sum((sum(g.b))), g.b + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: (sum(g.b)), g.b + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=8) + Output: sum(g.b), g.b +(6 rows) + +select sum(c) from g group by b; + sum +----- + 5.2 + 6.4 +(2 rows) + +explain verbose select sum(c) from g group by b; + QUERY PLAN +----------------------------------------------------------------------------------- + HashAggregate (cost=1.02..1.03 rows=1 width=40) + Output: sum((sum(g.c))), g.b + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: (sum(g.c)), g.b + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=40) + Output: sum(g.c), g.b +(6 rows) + +select avg(a) from g group by b; + avg +------------------------ + 2.0000000000000000 + 1.00000000000000000000 +(2 rows) + +explain verbose select avg(a) from g group by b; + QUERY PLAN +----------------------------------------------------------------------------------- + HashAggregate (cost=1.02..1.03 rows=1 width=12) + Output: pg_catalog.avg((avg(g.a))), g.b + 
-> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: (avg(g.a)), g.b + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=12) + Output: avg(g.a), g.b +(6 rows) + +select avg(b) from g group by c; + avg +----- + 2.3 + 2.1 +(2 rows) + +explain verbose select avg(b) from g group by c; + QUERY PLAN +----------------------------------------------------------------------------------- + HashAggregate (cost=1.02..1.03 rows=1 width=40) + Output: pg_catalog.avg((avg(g.b))), g.c + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: (avg(g.b)), g.c + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=40) + Output: avg(g.b), g.c +(6 rows) + +select avg(c) from g group by c; + avg +-------------------- + 5.2000000000000000 + 3.2000000000000000 +(2 rows) + +explain verbose select avg(c) from g group by c; + QUERY PLAN +----------------------------------------------------------------------------------- + HashAggregate (cost=1.02..1.03 rows=1 width=32) + Output: pg_catalog.avg((avg(g.c))), g.c + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: (avg(g.c)), g.c + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=32) + Output: avg(g.c), g.c +(6 rows) + +drop table def; +drop table g; +-- Combination 2, enable_hashagg on and replicated tables. 
-- repeat the same tests for replicated tables -- create required tables and fill them with data create table tab1 (val int, val2 int) distribute by replication; @@ -268,14 +630,14 @@ explain verbose select count(*), sum(tab1.val * tab2.val), avg(tab1.val*tab2.val Sort Key: tab1.val2 -> Materialize (cost=0.00..1.01 rows=1 width=8) Output: tab1.val, tab1.val2 - -> Data Node Scan (Node Count [1]) "SELECT val, val2 FROM public.tab1 tab1" on tab1 (cost=0.00..1.01 rows=1000 width=8) + -> Data Node Scan (Node Count [1]) on tab1 (cost=0.00..1.01 rows=1000 width=8) Output: tab1.val, tab1.val2 -> Sort (cost=1.02..1.03 rows=1 width=8) Output: tab2.val, tab2.val2 Sort Key: tab2.val2 -> Materialize (cost=0.00..1.01 rows=1 width=8) Output: tab2.val, tab2.val2 - -> Data Node Scan (Node Count [1]) "SELECT val, val2 FROM public.tab2 tab2" on tab2 (cost=0.00..1.01 rows=1000 width=8) + -> Data Node Scan (Node Count [1]) on tab2 (cost=0.00..1.01 rows=1000 width=8) Output: tab2.val, tab2.val2 (19 rows) @@ -291,17 +653,16 @@ explain verbose select sum(y) from (select sum(val) y, val2%2 x from tab1 group QUERY PLAN ---------------------------------------------------------------------------------------- HashAggregate (cost=1.05..1.06 rows=1 width=12) - Output: sum((sum(tab1.val))), ((tab1.val2 % 2)) + Output: sum((pg_catalog.sum((sum(tab1.val))))), ((tab1.val2 % 2)) -> HashAggregate (cost=1.02..1.03 rows=1 width=8) - Output: sum(tab1.val), (tab1.val2 % 2), tab1.val2 - -> Materialize (cost=0.00..1.01 rows=1 width=8) - Output: tab1.val, tab1.val2 + Output: pg_catalog.sum((sum(tab1.val))), ((tab1.val2 % 2)), tab1.val2 + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: (sum(tab1.val)), ((tab1.val2 % 2)), tab1.val2 -> Data Node Scan (Node Count [1]) (cost=0.00..1.01 rows=1000 width=8) - Output: tab1.val, tab1.val2 + Output: sum(tab1.val), (tab1.val2 % 2), tab1.val2 (8 rows) -- group by without aggregate -set enable_hashagg to off; select val2 from tab1 group by val2; val2 ------ @@ 
-313,7 +674,7 @@ select val2 from tab1 group by val2; explain verbose select val2 from tab1 group by val2; QUERY PLAN ---------------------------------------------------------------------------------- - Group (cost=1.02..1.03 rows=1 width=4) + HashAggregate (cost=1.02..1.03 rows=1 width=4) Output: tab1.val2 -> Materialize (cost=0.00..0.00 rows=0 width=0) Output: tab1.val2 @@ -324,18 +685,18 @@ explain verbose select val2 from tab1 group by val2; select val + val2 from tab1 group by val + val2; ?column? ---------- - 2 - 3 4 + 3 7 8 9 + 2 (6 rows) explain verbose select val + val2 from tab1 group by val + val2; QUERY PLAN ---------------------------------------------------------------------------------- - Group (cost=1.03..1.04 rows=1 width=8) + HashAggregate (cost=1.02..1.03 rows=1 width=8) Output: ((tab1.val + tab1.val2)) -> Materialize (cost=0.00..0.00 rows=0 width=0) Output: ((tab1.val + tab1.val2)) @@ -346,12 +707,12 @@ explain verbose select val + val2 from tab1 group by val + val2; select val + val2, val, val2 from tab1 group by val, val2; ?column? | val | val2 ----------+-----+------ - 2 | 1 | 1 + 7 | 4 | 3 + 4 | 3 | 1 + 4 | 2 | 2 4 | 1 | 3 3 | 2 | 1 - 4 | 2 | 2 - 4 | 3 | 1 - 7 | 4 | 3 + 2 | 1 | 1 8 | 6 | 2 9 | 6 | 3 (8 rows) @@ -359,7 +720,7 @@ select val + val2, val, val2 from tab1 group by val, val2; explain verbose select val + val2, val, val2 from tab1 group by val, val2; QUERY PLAN ---------------------------------------------------------------------------------- - Group (cost=1.02..1.04 rows=1 width=8) + HashAggregate (cost=1.02..1.03 rows=1 width=8) Output: ((tab1.val + tab1.val2)), tab1.val, tab1.val2 -> Materialize (cost=0.00..0.00 rows=0 width=0) Output: ((tab1.val + tab1.val2)), tab1.val, tab1.val2 @@ -370,18 +731,18 @@ explain verbose select val + val2, val, val2 from tab1 group by val, val2; select tab1.val + tab2.val2, tab1.val, tab2.val2 from tab1, tab2 where tab1.val = tab2.val group by tab1.val, tab2.val2; ?column? 
| val | val2 ----------+-----+------ - 2 | 1 | 1 - 6 | 2 | 4 5 | 3 | 2 - 7 | 3 | 4 5 | 4 | 1 6 | 4 | 2 + 2 | 1 | 1 + 6 | 2 | 4 + 7 | 3 | 4 (6 rows) explain verbose select tab1.val + tab2.val2, tab1.val, tab2.val2 from tab1, tab2 where tab1.val = tab2.val group by tab1.val, tab2.val2; QUERY PLAN ------------------------------------------------------------------------------- - Group (cost=0.01..0.02 rows=1 width=0) + HashAggregate (cost=0.00..0.01 rows=1 width=0) Output: ((tab1.val + tab2.val2)), tab1.val, tab2.val2 -> Materialize (cost=0.00..0.00 rows=0 width=0) Output: ((tab1.val + tab2.val2)), tab1.val, tab2.val2 @@ -392,16 +753,16 @@ explain verbose select tab1.val + tab2.val2, tab1.val, tab2.val2 from tab1, tab2 select tab1.val + tab2.val2 from tab1, tab2 where tab1.val = tab2.val group by tab1.val + tab2.val2; ?column? ---------- - 2 - 5 6 + 2 7 + 5 (4 rows) explai... [truncated message content] |
From: Michael P. <mic...@us...> - 2011-06-09 23:24:14
|
Project "Postgres-XC". The branch, PGXC-TrialMaster has been deleted was 1e3498c1a95c36b4bed96287d4209c0ee049db9d ----------------------------------------------------------------------- 1e3498c1a95c36b4bed96287d4209c0ee049db9d Partial fix for bug 3310399: Autovacuum workers using same connections to GTM ----------------------------------------------------------------------- hooks/post-receive -- Postgres-XC |
From: Michael P. <mic...@us...> - 2011-06-09 23:22:57
|
Project "Postgres-XC". The branch, master has been created at 1e3498c1a95c36b4bed96287d4209c0ee049db9d (commit) - Log ----------------------------------------------------------------- ----------------------------------------------------------------------- hooks/post-receive -- Postgres-XC |
From: Michael P. <mic...@us...> - 2011-06-09 23:21:54
|
Project "Postgres-XC". The branch, master has been deleted was b47dc806218c3746ff5ab233de6d6728b6f0457d ----------------------------------------------------------------------- b47dc806218c3746ff5ab233de6d6728b6f0457d Partial fix for bug 3310399: Autovacuum workers using same connections to GTM ----------------------------------------------------------------------- hooks/post-receive -- Postgres-XC |
From: Michael P. <mic...@us...> - 2011-06-09 23:15:32
|
Project "Postgres-XC". The branch, POSTGRES9_0_3_BASE has been created at b47dc806218c3746ff5ab233de6d6728b6f0457d (commit) - Log ----------------------------------------------------------------- ----------------------------------------------------------------------- hooks/post-receive -- Postgres-XC |
From: Koichi S. <koi...@us...> - 2011-06-07 06:27:23
|
Project "Postgres-XC". The branch, ha_support has been updated via 4b9d49a1ed74299bf09c5c59b49bd246b4cbb214 (commit) from ffcc0f293e55381f57bee8e47ec0b332f518e1e1 (commit) - Log ----------------------------------------------------------------- commit 4b9d49a1ed74299bf09c5c59b49bd246b4cbb214 Author: Koichi Suzuki <koi...@gm...> Date: Tue Jun 7 15:26:47 2011 +0900 This commit fixes potential problem of GTM reconnect operation. The patch resets file descriptor's event flag so that the file will not be read after the reconnect. Modified file is: modified: gtm/proxy/proxy_main.c diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c index 4b9e122..1c50c60 100644 --- a/src/gtm/proxy/proxy_main.c +++ b/src/gtm/proxy/proxy_main.c @@ -1251,6 +1251,8 @@ GTMProxy_ThreadMain(void *argp) */ qtype = ReadCommand(thrinfo->thr_conn, &input_message); + thrinfo->thr_poll_fds[ii].revents = 0; + switch(qtype) { case 'C': ----------------------------------------------------------------------- Summary of changes: src/gtm/proxy/proxy_main.c | 2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) hooks/post-receive -- Postgres-XC |
From: Michael P. <mic...@us...> - 2011-06-06 07:34:14
|
Project "Postgres-XC". The branch, pgxc-barrier has been updated via 3b00d08bae2d2b48e957c61d9714a749d493a7ec (commit) from 467b6dc1d5af91bcabbb2b17f640b4cf89e112f9 (commit) - Log ----------------------------------------------------------------- commit 3b00d08bae2d2b48e957c61d9714a749d493a7ec Author: Michael P <mic...@us...> Date: Mon Jun 6 16:35:26 2011 +0900 Correction of spelling mistakes Addition of a couple of compilation flags forgotten. diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 20489ce..b7510b0 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -39,7 +39,9 @@ #include "funcapi.h" #include "libpq/pqsignal.h" #include "miscadmin.h" +#ifdef PGXC #include "pgxc/barrier.h" +#endif #include "pgstat.h" #include "postmaster/bgwriter.h" #include "replication/walreceiver.h" @@ -4371,6 +4373,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, xlogfname, recoveryStopAfter ? "after" : "before", timestamptz_to_str(recoveryStopTime)); +#ifdef PGXC else if (recoveryTarget == RECOVERY_TARGET_BARRIER) snprintf(buffer, sizeof(buffer), "%s%u\t%s\t%s %s\n", @@ -4379,6 +4382,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI, xlogfname, recoveryStopAfter ? 
"after" : "before", recoveryTargetBarrierId); +#endif else snprintf(buffer, sizeof(buffer), "%s%u\t%s\tno recovery target specified\n", @@ -5491,24 +5495,26 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis) return false; record_info = record->xl_info & ~XLR_INFO_MASK; +#ifdef PGXC if (record->xl_rmid == RM_XACT_ID) { - if (record_info == XLOG_XACT_COMMIT) - { - xl_xact_commit *recordXactCommitData; +#endif + if (record_info == XLOG_XACT_COMMIT) + { + xl_xact_commit *recordXactCommitData; - recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record); - recordXtime = recordXactCommitData->xact_time; - } - else if (record_info == XLOG_XACT_ABORT) - { - xl_xact_abort *recordXactAbortData; + recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record); + recordXtime = recordXactCommitData->xact_time; + } + else if (record_info == XLOG_XACT_ABORT) + { + xl_xact_abort *recordXactAbortData; - recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); - recordXtime = recordXactAbortData->xact_time; - } + recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record); + recordXtime = recordXactAbortData->xact_time; } #ifdef PGXC + } /* end if (record->xl_rmid == RM_XACT_ID) */ else if (record->xl_rmid == RM_BARRIER_ID) { if (record_info == XLOG_BARRIER_CREATE) @@ -5880,10 +5886,12 @@ StartupXLOG(void) ereport(LOG, (errmsg("starting point-in-time recovery to %s", timestamptz_to_str(recoveryTargetTime)))); +#ifdef PGXC else if (recoveryTarget == RECOVERY_TARGET_BARRIER) ereport(LOG, (errmsg("starting point-in-time recovery to barrier %s", (recoveryTargetBarrierId)))); +#endif else ereport(LOG, (errmsg("starting archive recovery"))); diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 5557915..815a3d3 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2361,6 +2361,9 @@ _equalValue(Value *a, Value *b) } #ifdef PGXC +/* + * stuff from barrier.h + */ static bool _equalBarrierStmt(BarrierStmt 
*a, BarrierStmt *b) diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index d412a10..de0abec 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -7025,7 +7025,6 @@ opt_barrier_id: $$ = NULL; } ; - /* PGXC_END */ /***************************************************************************** diff --git a/src/backend/pgxc/barrier/Makefile b/src/backend/pgxc/barrier/Makefile index d80bbec..9505889 100644 --- a/src/backend/pgxc/barrier/Makefile +++ b/src/backend/pgxc/barrier/Makefile @@ -1,7 +1,7 @@ #------------------------------------------------------------------------- # # Makefile-- -# Makefile for pool +# Makefile for barrier # # Portions Copyright (c) 2010-2011 Nippon Telegraph and Telephone Corporation # diff --git a/src/backend/pgxc/barrier/barrier.c b/src/backend/pgxc/barrier/barrier.c index 1b44f36..32ff484 100644 --- a/src/backend/pgxc/barrier/barrier.c +++ b/src/backend/pgxc/barrier/barrier.c @@ -44,7 +44,7 @@ extern void ProcessCreateBarrierExecute(const char *id); * while all other backend starting a 2PC will grab the lock in shared * mode. So as long as we hold the exclusive lock, no other backend start a * new 2PC and there can not be any 2PC in-progress. This technique would - * rely on assumption that an exclsuive lock requester is not starved by + * rely on assumption that an exclusive lock requester is not starved by * share lock requesters. * * Note: To ensure that the 2PC are not blocked for a long time, we should @@ -76,7 +76,7 @@ ProcessCreateBarrierPrepare(const char *id) } /* - * Mark the completetion of an on-going barrier. We must have remembered the + * Mark the completion of an on-going barrier. We must have remembered the * barrier ID when we received the CREATE BARRIER PREPARE command */ void @@ -103,7 +103,7 @@ ProcessCreateBarrierEnd(const char *id) } /* - * Execute the CREATE BARRIER comamnd. Write a BARRIER WAL record and flush the + * Execute the CREATE BARRIER command. 
Write a BARRIER WAL record and flush the * WAL buffers to disk before returning to the caller. Writing the WAL record * does not guarantee successful completion of the barrier command. */ @@ -140,15 +140,15 @@ static const char * generate_barrier_id(const char *id) { /* - * TODO If the caller can passeed a NULL value, generate an id which is + * TODO If the caller can passed a NULL value, generate an id which is * guaranteed to be unique across the cluster. We can use a combination of * the coordinator node id and a timestamp. This may not be complete if we * support changing coordinator ids without initdb or the system clocks are * modified. * * Another option would be to let the GTM issue globally unique barrier - * IDs. For the time being, we leave it to the user to come up with an - * unique identifier + * IDs (GTM-timestamp based). For the time being, we leave it to the user + * to come up with an unique identifier. */ return id ? id : pstrdup("dummy_barrier_id"); } @@ -326,7 +326,7 @@ PrepareBarrier(const char *id) */ LWLockAcquire(BarrierLock, LW_EXCLUSIVE); - elog(DEBUG2, "Disabled 2PC commits origniating at the diriving coordinator"); + elog(DEBUG2, "Disabled 2PC commits originating at the driving coordinator"); /* * TODO Start a timer to cancel the barrier request in case of a timeout @@ -375,7 +375,7 @@ ExecuteBarrier(const char *id) if (handle->state != DN_CONNECTION_STATE_IDLE) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("Failed to send CREATE BARRIER PREPARE request " + errmsg("Failed to send CREATE BARRIER EXECUTE request " "to the node"))); barrier_idlen = strlen(id) + 1; diff --git a/src/backend/pgxc/pool/execRemote.c b/src/backend/pgxc/pool/execRemote.c index 0af1288..e14c284 100644 --- a/src/backend/pgxc/pool/execRemote.c +++ b/src/backend/pgxc/pool/execRemote.c @@ -1878,7 +1878,7 @@ PGXCNodeImplicitCommitPrepared(GlobalTransactionId prepare_xid, * We should acquire the BarrierLock in SHARE mode here to ensure that * there are no 
in-progress barrier at this point. This mechanism would * work as long as LWLock mechanism does not starve a EXCLUSIVE lock - * requesster + * requester */ LWLockAcquire(BarrierLock, LW_SHARED); res = pgxc_node_implicit_commit_prepared(prepare_xid, commit_xid, diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 7c9af6d..b7af28e 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -466,7 +466,7 @@ standard_ProcessUtility(Node *parsetree, * * XXX We call FinishPreparedTransaction inside * PGXCNodeCommitPrepared if we are doing a local - * operation. This is convinient because we want to + * operation. This is convenient because we want to * hold on to the BarrierLock until local transaction * is committed too. * diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index b152c36..df9c0ab 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -184,8 +184,11 @@ typedef enum { RECOVERY_TARGET_UNSET, RECOVERY_TARGET_XID, - RECOVERY_TARGET_TIME, + RECOVERY_TARGET_TIME +#ifdef PGXC + , RECOVERY_TARGET_BARRIER +#endif } RecoveryTargetType; extern XLogRecPtr XactLastRecEnd; diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 6ebf30b..5f16be6 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2303,7 +2303,6 @@ typedef struct BarrierStmt NodeTag type; const char *id; /* User supplied barrier id, if any */ } BarrierStmt; - #endif /* ---------------------- ----------------------------------------------------------------------- Summary of changes: src/backend/access/transam/xlog.c | 32 ++++++++++++++++++++------------ src/backend/nodes/equalfuncs.c | 3 +++ src/backend/parser/gram.y | 1 - src/backend/pgxc/barrier/Makefile | 2 +- src/backend/pgxc/barrier/barrier.c | 16 ++++++++-------- src/backend/pgxc/pool/execRemote.c | 2 +- src/backend/tcop/utility.c | 2 +- src/include/access/xlog.h | 5 ++++- src/include/nodes/parsenodes.h | 1 - 9 files 
changed, 38 insertions(+), 26 deletions(-) hooks/post-receive -- Postgres-XC |
From: Michael P. <mic...@us...> - 2011-06-06 02:55:24
|
Project "Postgres-XC". The branch, master has been updated via b47dc806218c3746ff5ab233de6d6728b6f0457d (commit) from 20361f0df1f93d2d80dce14c6a7258af0a684bd8 (commit) - Log ----------------------------------------------------------------- commit b47dc806218c3746ff5ab233de6d6728b6f0457d Author: Michael P <mic...@us...> Date: Mon Jun 6 11:53:36 2011 +0900 Partial fix for bug 3310399: Autovacuum workers using same connections to GTM This fixes a problem with autovacuum worker/launchers that tended to use the connection allocated for postmaster to connect to GTM. In the case of multiple vacuums running at the same time, this tended to mess the way autovacuum was receiving GXID and snapshots from GTM. This commit also adds some begus messages to look at the connection activity to GTM and more strict connection control of autovacuum backends to GTM. diff --git a/src/backend/access/transam/gtm.c b/src/backend/access/transam/gtm.c index e9f0229..f05e38d 100644 --- a/src/backend/access/transam/gtm.c +++ b/src/backend/access/transam/gtm.c @@ -28,17 +28,28 @@ extern bool FirstSnapshotSet; static GTM_Conn *conn; -#define CheckConnection() \ - if (GTMPQstatus(conn) != CONNECTION_OK) InitGTM() - bool IsGTMConnected() { return conn != NULL; } +static void +CheckConnection(void) +{ + /* Be sure that a backend does not use a postmaster connection */ + if (IsUnderPostmaster && GTMPQispostmaster(conn) == 1) + { + InitGTM(); + return; + } + + if (GTMPQstatus(conn) != CONNECTION_OK) + InitGTM(); +} + void -InitGTM() +InitGTM(void) { /* 256 bytes should be enough */ char conn_str[256]; @@ -55,10 +66,23 @@ InitGTM() sprintf(conn_str, "host=%s port=%d pgxc_node_id=%d remote_type=%d postmaster=1", GtmHost, GtmPort, PGXCNodeId, remote_type); + + /* Log activity of GTM connections */ + elog(DEBUG1, "Postmaster: connection established to GTM with string %s", conn_str); } else + { sprintf(conn_str, "host=%s port=%d pgxc_node_id=%d", GtmHost, GtmPort, PGXCNodeId); + /* Log activity of GTM 
connections */ + if (IsAutoVacuumWorkerProcess()) + elog(DEBUG1, "Autovacuum worker: connection established to GTM with string %s", conn_str); + else if (IsAutoVacuumLauncherProcess()) + elog(DEBUG1, "Autovacuum launcher: connection established to GTM with string %s", conn_str); + else + elog(DEBUG1, "Postmaster child: connection established to GTM with string %s", conn_str); + } + conn = PQconnectGTM(conn_str); if (GTMPQstatus(conn) != CONNECTION_OK) { @@ -79,6 +103,16 @@ CloseGTM(void) { GTMPQfinish(conn); conn = NULL; + + /* Log activity of GTM connections */ + if (!IsUnderPostmaster) + elog(DEBUG1, "Postmaster: connection to GTM closed"); + else if (IsAutoVacuumWorkerProcess()) + elog(DEBUG1, "Autovacuum worker: connection to GTM closed"); + else if (IsAutoVacuumLauncherProcess()) + elog(DEBUG1, "Autovacuum launcher: connection to GTM closed"); + else + elog(DEBUG1, "Postmaster child: connection to GTM closed"); } GlobalTransactionId @@ -114,7 +148,8 @@ BeginTranAutovacuumGTM(void) if (conn) xid = begin_transaction_autovacuum(conn, GTM_ISOLATION_RC); - /* If something went wrong (timeout), try and reset GTM connection and retry. + /* + * If something went wrong (timeout), try and reset GTM connection and retry. * This is safe at the beginning of a transaction. 
*/ if (!TransactionIdIsValid(xid)) @@ -147,6 +182,11 @@ CommitTranGTM(GlobalTransactionId gxid) CloseGTM(); InitGTM(); } + + /* Close connection in case commit is done by autovacuum worker or launcher */ + if (IsAutoVacuumWorkerProcess() || IsAutoVacuumLauncherProcess()) + CloseGTM(); + return ret; } diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index c6f56b7..e939bc0 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -232,8 +232,7 @@ static void DataNodeShutdown (int code, Datum arg) { /* Close connection with GTM, if active */ - if (IsAutoVacuumWorkerProcess()) - CloseGTM(); + CloseGTM(); } #endif diff --git a/src/gtm/client/fe-connect.c b/src/gtm/client/fe-connect.c index 52ce93c..95954b9 100644 --- a/src/gtm/client/fe-connect.c +++ b/src/gtm/client/fe-connect.c @@ -1254,6 +1254,14 @@ GTMPQstatus(const GTM_Conn *conn) return conn->status; } +int +GTMPQispostmaster(const GTM_Conn *conn) +{ + if (!conn) + return 0; + return conn->is_postmaster; +} + char * GTMPQerrorMessage(const GTM_Conn *conn) { diff --git a/src/include/gtm/libpq-fe.h b/src/include/gtm/libpq-fe.h index b3ef655..0603987 100644 --- a/src/include/gtm/libpq-fe.h +++ b/src/include/gtm/libpq-fe.h @@ -119,6 +119,7 @@ extern void GTMPQconninfoFree(GTMPQconninfoOption *connOptions); extern char *GTMPQhost(const GTM_Conn *conn); extern char *GTMPQport(const GTM_Conn *conn); extern ConnStatusType GTMPQstatus(const GTM_Conn *conn); +extern int GTMPQispostmaster(const GTM_Conn *conn); extern char *GTMPQerrorMessage(const GTM_Conn *conn); extern int GTMPQsocket(const GTM_Conn *conn); ----------------------------------------------------------------------- Summary of changes: src/backend/access/transam/gtm.c | 50 ++++++++++++++++++++++++++++++++++---- src/backend/tcop/postgres.c | 3 +- src/gtm/client/fe-connect.c | 8 ++++++ src/include/gtm/libpq-fe.h | 1 + 4 files changed, 55 insertions(+), 7 deletions(-) hooks/post-receive -- Postgres-XC |
From: Koichi S. <koi...@us...> - 2011-06-02 10:37:25
|
Project "Postgres-XC". The branch, ha_support has been updated via ffcc0f293e55381f57bee8e47ec0b332f518e1e1 (commit) from 72b6a15e6defb2ca42d8258df4507eb8a596dba6 (commit) - Log ----------------------------------------------------------------- commit ffcc0f293e55381f57bee8e47ec0b332f518e1e1 Author: Koichi Suzuki <koi...@gm...> Date: Thu Jun 2 19:33:08 2011 +0900 This commit confirmed that each GTM-Proxy worker threads reconnect to promoted GTM. Only "reconnect" was tested and it is not harmful in usual use. Modified file is: modified: proxy/proxy_main.c Next, GTM-Proxy command backup will be tested. In parallel, correction of transaction backup of by GTM-Standby will be implemented. diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c index ef79885..4b9e122 100644 --- a/src/gtm/proxy/proxy_main.c +++ b/src/gtm/proxy/proxy_main.c @@ -142,6 +142,7 @@ static void RegisterProxy(bool reconnect_opt); static void UnregisterProxy(void); static GTM_Conn *ConnectGTM(void); static void ReleaseCmdBackup(GTMProxy_CommandInfo *cmdinfo); +static void workerThreadReconnectToGTMstandby(void); /* * One-time initialization. It's called immediately after the main process @@ -937,6 +938,7 @@ GTMProxy_ThreadMain(void *argp) int32 saved_seqno = -1; int ii, nrfds; char gtm_connect_string[1024]; + int first_turn = TRUE; /* Used only to set longjmp target at the first turn of thread loop */ elog(DEBUG3, "Starting the connection helper thread"); @@ -1085,86 +1087,95 @@ GTMProxy_ThreadMain(void *argp) MemoryContextResetAndDeleteChildren(MessageContext); /* - * Just reset the input buffer to avoid repeated palloc/pfrees - * - * XXX We should consider resetting the MessageContext periodically to - * handle any memory leaks + * The following block should be skipped at the first turn. */ - resetStringInfo(&input_message); - - /* - * Check if there are any changes to the connection array assigned to - * this thread. If so, we need to rebuild the fd array. 
- */ - GTM_MutexLockAcquire(&thrinfo->thr_lock); - if (saved_seqno != thrinfo->thr_seqno) + if (!first_turn) { - saved_seqno = thrinfo->thr_seqno; - - while (thrinfo->thr_conn_count <= 0) - { - /* - * No connections assigned to the thread. Wait for at least one - * connection to be assgined to us - */ - GTM_CVWait(&thrinfo->thr_cv, &thrinfo->thr_lock); - } - - memset(thrinfo->thr_poll_fds, 0, sizeof (thrinfo->thr_poll_fds)); - /* - * Now grab all the open connections. We are holding the lock so no - * new connections can be added. + * Check if there are any changes to the connection array assigned to + * this thread. If so, we need to rebuild the fd array. */ - for (ii = 0; ii < thrinfo->thr_conn_count; ii++) + GTM_MutexLockAcquire(&thrinfo->thr_lock); + if (saved_seqno != thrinfo->thr_seqno) { - GTMProxy_ConnectionInfo *conninfo = thrinfo->thr_all_conns[ii]; + saved_seqno = thrinfo->thr_seqno; - /* We detect if the connection has been dropped to avoid - * a segmentation fault. - */ - if (conninfo->con_port == NULL) + while (thrinfo->thr_conn_count <= 0) { - conninfo->con_disconnected = true; - continue; - } + /* + * No connections assigned to the thread. Wait for at least one + * connection to be assgined to us + */ + if (sigsetjmp(GetMyThreadInfo->longjmp_env, 1) == 0) + { + Enable_Longjmp(); + GTM_CVWait(&thrinfo->thr_cv, &thrinfo->thr_lock); + Disable_Longjmp(); + } + else + { + /* SIGUSR2 here */ + workerThreadReconnectToGTMstandby(); + } + } + + memset(thrinfo->thr_poll_fds, 0, sizeof (thrinfo->thr_poll_fds)); /* - * If this is a newly added connection, complete the handshake + * Now grab all the open connections. We are holding the lock so no + * new connections can be added. 
*/ - if (!conninfo->con_authenticated) - GTMProxy_HandshakeConnection(conninfo); + for (ii = 0; ii < thrinfo->thr_conn_count; ii++) + { + GTMProxy_ConnectionInfo *conninfo = thrinfo->thr_all_conns[ii]; - thrinfo->thr_poll_fds[ii].fd = conninfo->con_port->sock; - thrinfo->thr_poll_fds[ii].events = POLLIN; - thrinfo->thr_poll_fds[ii].revents = 0; - } - } - GTM_MutexLockRelease(&thrinfo->thr_lock); + /* We detect if the connection has been dropped to avoid + * a segmentation fault. + */ + if (conninfo->con_port == NULL) + { + conninfo->con_disconnected = true; + continue; + } - while (true) - { - nrfds = poll(thrinfo->thr_poll_fds, thrinfo->thr_conn_count, 1000); + /* + * If this is a newly added connection, complete the handshake + */ + if (!conninfo->con_authenticated) + GTMProxy_HandshakeConnection(conninfo); - if (nrfds < 0) - { - if (errno == EINTR) - continue; - elog(FATAL, "poll returned with error %d", nrfds); + thrinfo->thr_poll_fds[ii].fd = conninfo->con_port->sock; + thrinfo->thr_poll_fds[ii].events = POLLIN; + thrinfo->thr_poll_fds[ii].revents = 0; + } } - else - break; - } + GTM_MutexLockRelease(&thrinfo->thr_lock); - if (nrfds == 0) - continue; + while (true) + { + Enable_Longjmp(); + nrfds = poll(thrinfo->thr_poll_fds, thrinfo->thr_conn_count, 1000); + Disable_Longjmp(); - /* - * Initialize the lists - */ - thrinfo->thr_processed_commands = gtm_NIL; - memset(thrinfo->thr_pending_commands, 0, sizeof (thrinfo->thr_pending_commands)); + if (nrfds < 0) + { + if (errno == EINTR) + continue; + elog(FATAL, "poll returned with error %d", nrfds); + } + else + break; + } + if (nrfds == 0) + continue; + + /* + * Initialize the lists + */ + thrinfo->thr_processed_commands = gtm_NIL; + memset(thrinfo->thr_pending_commands, 0, sizeof (thrinfo->thr_pending_commands)); + } /* * Each SIGUSR2 should return here and please note that from the the beginning @@ -1173,7 +1184,7 @@ GTMProxy_ThreadMain(void *argp) * sure to be in MemoryContext where siglongjmp() is issued. 
*/ setjmp_again: - if (sigsetjmp(GetMyThreadInfo->longjmp_env, 1) == 0) + if (sigsetjmp(thrinfo->longjmp_env, 1) == 0) { Disable_Longjmp(); } @@ -1182,33 +1193,14 @@ GTMProxy_ThreadMain(void *argp) /* * SIGUSR2 is detected and jumped here */ - /* - * First of all, we should acquire reconnect control lock in READ mode. - */ - GTM_RWLockAcquire(&ReconnectControlLock, GTM_LOCKMODE_READ); - PG_SETMASK(&UnBlockSig); - /* - * Disconnect the current connection and re-connect to the new GTM + /* + * Reconnect */ - GTMPQfinish(thrinfo->thr_gtm_conn); - sprintf(gtm_connect_string, "host=%s port=%d pgxc_node_id=%d remote_type=%d", - NewGTMServerHost, NewGTMServerPortNumber, GTMProxyID, PGXC_NODE_GTM_PROXY); - thrinfo->thr_gtm_conn = PQconnectGTM(gtm_connect_string); - - if (thrinfo->thr_gtm_conn == NULL) - elog(FATAL, "GTM connection failed."); + workerThreadReconnectToGTMstandby(); /* - * Set GTM communication error handling option + * Correction of pending works. */ - thrinfo->thr_gtm_conn->gtmErrorWaitOpt = GTMErrorWaitOpt; - thrinfo->thr_gtm_conn->gtmErrorWaitSecs = GTMErrorWaitSecs; - thrinfo->thr_gtm_conn->gtmErrorWaitCount = GTMErrorWaitCount; - - /* - * Initialize the command processing - */ - thrinfo->reconnect_issued = FALSE; thrinfo->thr_processed_commands = gtm_NIL; for (ii = 0; ii < MSG_TYPE_COUNT; ii++) { @@ -1216,12 +1208,21 @@ GTMProxy_ThreadMain(void *argp) } gtm_list_free_deep(thrinfo->thr_processed_commands); thrinfo->thr_processed_commands = gtm_NIL; - /* - * Release the reconnect control lock - */ - GTM_RWLockRelease(&ReconnectControlLock); goto setjmp_again; /* Get ready for another SIGUSR2 */ } + if (first_turn) + { + first_turn = FALSE; + continue; + } + + /* + * Just reset the input buffer to avoid repeated palloc/pfrees + * + * XXX We should consider resetting the MessageContext periodically to + * handle any memory leaks + */ + resetStringInfo(&input_message); /* * Now, read command from each of the connections that has some data to @@ -2960,6 
+2961,42 @@ static void ReleaseCmdBackup(GTMProxy_CommandInfo *cmdinfo) } #endif +static void +workerThreadReconnectToGTMstandby(void) +{ + char gtm_connect_string[1024]; + + /* + * First of all, we should acquire reconnect control lock in READ mode + * to wait for the main thread to finish reconnect. + */ + GTM_RWLockAcquire(&ReconnectControlLock, GTM_LOCKMODE_READ); + GTM_RWLockRelease(&ReconnectControlLock); /* The lock not needed any longer */ + PG_SETMASK(&UnBlockSig); + /* + * Disconnect the current connection and re-connect to the new GTM + */ + GTMPQfinish(GetMyThreadInfo->thr_gtm_conn); + sprintf(gtm_connect_string, "host=%s port=%d pgxc_node_id=%d remote_type=%d", + NewGTMServerHost, NewGTMServerPortNumber, GTMProxyID, PGXC_NODE_GTM_PROXY); + GetMyThreadInfo->thr_gtm_conn = PQconnectGTM(gtm_connect_string); + + if (GetMyThreadInfo->thr_gtm_conn == NULL) + elog(FATAL, "GTM connection failed."); + + /* + * Set GTM communication error handling option + */ + GetMyThreadInfo->thr_gtm_conn->gtmErrorWaitOpt = GTMErrorWaitOpt; + GetMyThreadInfo->thr_gtm_conn->gtmErrorWaitSecs = GTMErrorWaitSecs; + GetMyThreadInfo->thr_gtm_conn->gtmErrorWaitCount = GTMErrorWaitCount; + + /* + * Initialize the command processing + */ + GetMyThreadInfo->reconnect_issued = FALSE; +} + /* * dummy function to avoid compile error. */ ----------------------------------------------------------------------- Summary of changes: src/gtm/proxy/proxy_main.c | 219 ++++++++++++++++++++++++++------------------ 1 files changed, 128 insertions(+), 91 deletions(-) hooks/post-receive -- Postgres-XC |
From: Koichi S. <koi...@us...> - 2011-06-02 06:37:43
|
Project "Postgres-XC". The branch, ha_support has been updated via 72b6a15e6defb2ca42d8258df4507eb8a596dba6 (commit) from 612265336a7d8bc0666983f8b4d6c165aa15efb5 (commit) - Log ----------------------------------------------------------------- commit 72b6a15e6defb2ca42d8258df4507eb8a596dba6 Author: Koichi Suzuki <koi...@gm...> Date: Thu Jun 2 15:33:14 2011 +0900 This commit fixes one bug and adds one feature 1. Fixed backup node registration from GTM-Main to GTM-Standby. Node name and working directory serialization was incorrect. 2. RegisterProxy() was extended to accept a bool argument to specify if it is called at reconnect or not. If reconnect, it explicity changes memory context to topMemroyContext and clears existing connection to old gtm and updates GTM host and port to standby before connection. Afterwords, the connection is done in the same way and finally, MemoryContext is restored to the original status. Affected files are: modified: common/gtm_serialize.c modified: proxy/proxy_main.c diff --git a/src/gtm/common/gtm_serialize.c b/src/gtm/common/gtm_serialize.c index 85c7233..d4daf7d 100644 --- a/src/gtm/common/gtm_serialize.c +++ b/src/gtm/common/gtm_serialize.c @@ -769,7 +769,7 @@ gtm_serialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, char *buf, size_t buflen) len += sizeof(uint32); if (len_wk > 0) { - memcpy(buf+len, &(data->ipaddress), len_wk); + memcpy(buf+len, data->ipaddress, len_wk); len += len_wk; } @@ -782,7 +782,7 @@ gtm_serialize_pgxcnodeinfo(GTM_PGXCNodeInfo *data, char *buf, size_t buflen) len += sizeof(uint32); if (len_wk > 0) { - memcpy(buf+len, &(data->datafolder), len_wk); + memcpy(buf+len, data->datafolder, len_wk); len += len_wk; } diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c index e0b5ca9..ef79885 100644 --- a/src/gtm/proxy/proxy_main.c +++ b/src/gtm/proxy/proxy_main.c @@ -138,7 +138,7 @@ static void CreateLockFile(const char *filename, const char *refName); static void ChangeToDataDir(void); static void 
checkDataDir(void); static void DeleteLockFile(const char *filename); -static void RegisterProxy(void); +static void RegisterProxy(bool reconnect_opt); static void UnregisterProxy(void); static GTM_Conn *ConnectGTM(void); static void ReleaseCmdBackup(GTMProxy_CommandInfo *cmdinfo); @@ -215,7 +215,7 @@ BaseInit() Recovery_SaveRegisterFileName(GTMProxyDataDir); /* Register Proxy on GTM */ - RegisterProxy(); + RegisterProxy(FALSE); DebugFileOpen(); @@ -406,12 +406,16 @@ GTMProxy_SigleHandler(int signal) /* * Send SIGUSR2 to all worker threads. */ + /* + * Check if all the worker thread can accept SIGUSR2 + */ for (ii = 0; ii < GTMProxyWorkerThreads; ii++) { if ((Proxy_ThreadInfo[ii] == NULL) || (Proxy_ThreadInfo[ii]->can_accept_SIGUSR2 == FALSE)) { - elog(LOG, "Some thread is not ready to accept SIGUSR2. SIGUSR1 ignored."); + elog(NOTICE, "Some worker thread is not ready to handle this. Retry reconnect later.\n"); PG_SETMASK(&UnBlockSig); + return; } } /* @@ -454,7 +458,9 @@ GTMProxy_SigleHandler(int signal) /* * This should not be reached. Just in case. */ +#ifdef GTM_SBY_DEBUG elog(LOG, "SIGUSR2 received by the main thread. Ignoring."); +#endif PG_SETMASK(&UnBlockSig); return; } @@ -798,10 +804,9 @@ ServerLoop(void) * Because we leave the old socket as is, there could be some waste of * the resource but this may not happen so many times. */ - free(GTMServerHost); - GTMServerHost = NewGTMServerHost; - GTMServerPortNumber = NewGTMServerPortNumber; - RegisterProxy(); + + RegisterProxy(TRUE); + /* * If it is done, then release the lock for worker threads. */ @@ -2825,9 +2830,13 @@ failed: /* * Register Proxy on GTM + * + * If reconnect is specified, then existing connection is closed + * and the target GTM is taken from NewGTMServerHost and + * NewGTMServerPortNumber. 
*/ static void -RegisterProxy(void) +RegisterProxy(bool reconnect_opt) { GTM_PGXCNodeType type = PGXC_NODE_GTM_PROXY; GTM_PGXCNodePort port = (GTM_PGXCNodePort) GTMProxyPortNumber; @@ -2835,6 +2844,26 @@ RegisterProxy(void) GTM_PGXCNodeId proxynum = 0; time_t finish_time; + MemoryContext old_mcxt; + + if (reconnect_opt) + { + elog(NOTICE, + "Reconnect to new GTM, hostname=%s, port=%d", + NewGTMServerHost, NewGTMServerPortNumber); + /* + * Now reconnect. Close the exising connection + * and update the target host and port. + */ + /* First, change the memory context to TopMemoryContext */ + old_mcxt = MemoryContextSwitchTo(TopMemoryContext); + + /* Change the target to new GTM */ + GTMPQfinish(master_conn); + GTMServerHost = NewGTMServerHost; + GTMServerPortNumber = NewGTMServerPortNumber; + } + master_conn = ConnectGTM(); if (!master_conn) goto failed; @@ -2878,6 +2907,9 @@ RegisterProxy(void) Assert(res->gr_resdata.grd_node.nodenum == GTMProxyID); } + /* If reconnect, restore the old memory context */ + if (reconnect_opt) + MemoryContextSwitchTo(old_mcxt); return; failed: ----------------------------------------------------------------------- Summary of changes: src/gtm/common/gtm_serialize.c | 4 +- src/gtm/proxy/proxy_main.c | 48 +++++++++++++++++++++++++++++++++------ 2 files changed, 42 insertions(+), 10 deletions(-) hooks/post-receive -- Postgres-XC |
From: Michael P. <mic...@us...> - 2011-06-02 00:32:38
|
Project "Postgres-XC". The branch, master has been updated via 20361f0df1f93d2d80dce14c6a7258af0a684bd8 (commit) from 4a81fdc503a2a7cccc7610cc54db94d4e2d1b857 (commit) - Log ----------------------------------------------------------------- commit 20361f0df1f93d2d80dce14c6a7258af0a684bd8 Author: Michael P <mic...@us...> Date: Thu Jun 2 09:33:34 2011 +0900 Correct a couple of comments in optimizer Comments are reformated to be more Postgres-like. diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index b73c08e..f1f9192 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -4779,7 +4779,7 @@ findReferencedVars(List *parent_vars, Plan *plan, List **out_tlist, Relids *out_ relids = bms_add_member(relids, var->varno); } - /* now consider the local quals */ + /* Now consider the local quals */ vars = pull_var_clause((Node *)plan->qual, PVC_REJECT_PLACEHOLDERS); foreach(l, vars) @@ -4896,7 +4896,7 @@ create_remotedelete_plan(PlannerInfo *root, Plan *topplan) else { /* - * multi-table case + * Multi table case * Assuming the RemoteQuery is created in create_remotejoin_plan(). * If the final RemoteQuery is for correlated delete outer_statement * is just a SELECT FROM target_table, outer_statement is correlated @@ -4919,10 +4919,12 @@ create_remotedelete_plan(PlannerInfo *root, Plan *topplan) /* Append WHERE clause */ appendStringInfoString(buf, fstep->join_condition); } - /* replace step query */ + + /* Replace step query */ pfree(fstep->sql_statement); fstep->sql_statement = pstrdup(buf->data); - /* set combine_type, it is COMBINE_TYPE_NONE for SELECT */ + + /* Set combine_type, it is COMBINE_TYPE_NONE for SELECT */ fstep->combine_type = rel_loc_info->locatorType == LOCATOR_TYPE_REPLICATED ? 
COMBINE_TYPE_SAME : COMBINE_TYPE_SUM; fstep->read_only = false; @@ -4973,7 +4975,7 @@ create_remotedelete_plan(PlannerInfo *root, Plan *topplan) { Form_pg_attribute att_tup = (Form_pg_attribute) GETSTRUCT(tp); - /* add comma before all except first attributes */ + /* Add comma before all except first attributes */ if (att > 1) { appendStringInfoString(xbuf, ", "); @@ -4994,12 +4996,12 @@ create_remotedelete_plan(PlannerInfo *root, Plan *topplan) att, ttab->relid); } - /* complete SELECT command */ + /* Complete SELECT command */ appendStringInfo(xbuf, " FROM %s.%s WHERE ctid = $1", quote_identifier(nspname), quote_identifier(ttab->relname)); - /* build up the extra select step */ + /* Build up the extra select step */ xstep = make_remotequery(xtlist, ttab, NIL, ttab->relid); innerPlan(xstep) = topplan; xstep->sql_statement = pstrdup(xbuf->data); @@ -5012,14 +5014,14 @@ create_remotedelete_plan(PlannerInfo *root, Plan *topplan) xstep->exec_nodes->relid = ttab->relid; xstep->exec_nodes->accesstype = RELATION_ACCESS_READ; - /* first and only target entry of topplan is ctid, reference it */ + /* First and only target entry of topplan is ctid, reference it */ ctid = makeVar(INNER, 1, TIDOID, -1, 0); xstep->exec_nodes->expr = (Expr *) ctid; pfree(xbuf->data); pfree(xbuf); - /* build up the final delete step */ + /* Build up the final delete step */ innerPlan(fstep) = (Plan *) xstep; fstep->sql_statement = pstrdup(buf->data); fstep->combine_type = COMBINE_TYPE_SAME; @@ -5028,7 +5030,7 @@ create_remotedelete_plan(PlannerInfo *root, Plan *topplan) } else { - /* build up the final delete step */ + /* Build up the final delete step */ innerPlan(fstep) = topplan; appendStringInfoString(buf, " WHERE ctid = $1"); fstep->sql_statement = pstrdup(buf->data); @@ -5042,7 +5044,7 @@ create_remotedelete_plan(PlannerInfo *root, Plan *topplan) fstep->exec_nodes->relid = ttab->relid; fstep->exec_nodes->accesstype = RELATION_ACCESS_UPDATE; - /* first and only target entry of topplan is 
ctid, reference it */ + /* First and only target entry of topplan is ctid, reference it */ ctid = makeVar(INNER, 1, TIDOID, -1, 0); fstep->exec_nodes->expr = (Expr *) ctid; } @@ -5055,7 +5057,7 @@ create_remotedelete_plan(PlannerInfo *root, Plan *topplan) /* * create_remoteagg_plan - * tries to see if the grouping and aggregates can be pushed down to the + * Check if the grouping and aggregates can be pushed down to the * datanodes. * Right now we can push with following restrictions * 1. there are plain aggregates (no expressions involving aggregates) and/or @@ -5098,13 +5100,13 @@ create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) * the Agg plan */ - /* for now only Agg plans */ + /* For now only Agg plans */ Assert(IsA(local_plan, Agg)); agg_plan = (Agg *)local_plan; /* - * we don't push aggregation and grouping to datanodes, in case there are - * windowing aggregates, distinct, having clause or sort clauses + * We don't push aggregation and grouping to datanodes, in case there are + * windowing aggregates, distinct, having clause or sort clauses. */ if (query->hasWindowFuncs || query->distinctClause || @@ -5113,10 +5115,10 @@ create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) return local_plan; /* - * optimize if only the tree underneath is reduced to RemoteQuery, any other + * Optimize if only the tree underneath is reduced to RemoteQuery, any other * node there indicates that the scans can not be completely pushed to the - * remote data nodes - * RemoteQuery is hidden underneath Material plan, take it out + * remote data nodes. + * RemoteQuery is hidden underneath Material plan, take it out. 
*/ if (IsA(temp_plan, Material)) temp_plan = temp_plan->lefttree; @@ -5125,9 +5127,8 @@ create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) else agg_left = (RemoteQuery *)temp_plan; - /* - * walk through the target list and find out whether we can push the + * Walk through the target list and find out whether we can push the * aggregates and grouping to datanodes. We can do so if the target list * contains plain aggregates (without any expression involving those) and * expressions in group by clauses only (last one to make the query legit. @@ -5146,25 +5147,25 @@ create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) } /* - * cleared of all the charges, now take following steps - * 1. create a remote query node reflecting the query to be pushed to the + * Cleared of all the charges, now take following steps + * 1. Create a remote query node reflecting the query to be pushed to the * datanode * 2. Modify the Agg node passed in so that it reflects the aggregation * (collection) to be done at the coordinator based on the results sent by * the datanodes. */ - appendStringInfo(in_alias, "%s_%d", "group", root->rs_alias_index); - /* find all the relations referenced by targetlist of Agg node */ + /* Find all the relations referenced by targetlist of Agg node */ temp_vars = pull_var_clause((Node *)agg_tlist, PVC_REJECT_PLACEHOLDERS); findReferencedVars(temp_vars, (Plan *)agg_left, &temp_vartlist, &in_relids); /* - * build partial RemoteQuery node to be used for creating the Select clause + * Build partial RemoteQuery node to be used for creating the Select clause * to be sent to the remote node. Rest of the node will be built later */ agg_remote = makeNode(RemoteQuery); + /* * Save information about the plan we are reducing. 
* We may need this information later if more entries are added to it @@ -5176,10 +5177,11 @@ create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) agg_remote->inner_relids = in_relids; agg_remote->inner_statement = pstrdup(agg_left->sql_statement); agg_remote->exec_nodes = agg_left->exec_nodes; - /* don't forget to increment the index for the next time around! */ + + /* Don't forget to increment the index for the next time around! */ agg_remote->reduce_level = root->rs_alias_index++; - /* generate the select clause of the remote query */ + /* Generate the select clause of the remote query */ appendStringInfoString(remote_targetlist, "SELECT"); foreach (temp, agg_tlist) { @@ -5187,18 +5189,19 @@ create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) Node *expr = (Node *)tle->expr; create_remote_expr(root, local_plan, remote_targetlist, expr, agg_remote); - /* if this is not last target entry, add a comma with space */ + + /* If this is not last target entry, add a comma with space */ if (lnext(temp)) appendStringInfoString(remote_targetlist, ","); } - /* generate the from clause of the remote query */ + /* Generate the from clause of the remote query */ appendStringInfo(remote_fromlist, "FROM (%s) %s", agg_remote->inner_statement, agg_remote->inner_alias); /* - * generate group by clause for the remote query and recompute the group by - * column locations + * Generate group by clause for the remote query and recompute the group by + * column locations */ if (query->groupClause) { @@ -5206,11 +5209,11 @@ create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) Assert(IsA(local_plan, Agg)); /* - * recompute the column ids of the grouping columns, + * Recompute the column ids of the grouping columns, * the group column indexes computed earlier point in the * targetlists of the scan plans under this node. 
But now the grouping * column indexes will be pointing in the targetlist of the new - * RemoteQuery, hence those need to be recomputed + * RemoteQuery, hence those need to be recomputed. */ pgxc_locate_grouping_columns(root, agg_tlist, agg_plan->grpColIdx); @@ -5224,18 +5227,20 @@ create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) } } - /* generate the remote sql statement from the pieces */ + /* Generate the remote sql statement from the pieces */ appendStringInfo(remote_sql_stmt, "%s %s %s", remote_targetlist->data, remote_fromlist->data, groupby_clause->data); + /* - * set the base_tlist for the RemoteQuery node being created, it's used to + * Set the base_tlist for the RemoteQuery node being created, it's used to * create the tuple descriptor for the result from RemoteQuery and rewrite * the Aggregates targetlist accept the results of the RemoteQuery. */ base_tlist = add_to_flat_tlist(NIL, get_tlist_exprs(agg_tlist, true)); + /* - * we need to change the return types of the aggregates. Datanodes send the - * aggregation results in the form of transition results + * We need to change the return types of the aggregates. Datanodes send the + * aggregation results in the form of transition results. */ foreach (temp, base_tlist) { @@ -5251,7 +5256,7 @@ create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) } /* - * create a dummy RTE for the remote query being created. Append the dummy + * Create a dummy RTE for the remote query being created. Append the dummy * range table entry to the range table. Note that this modifies the master * copy the caller passed us, otherwise e.g EXPLAIN VERBOSE will fail to * find the rte the Vars built below refer to. Also create the tuple @@ -5261,17 +5266,19 @@ create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) dummy_rte = makeNode(RangeTblEntry); dummy_rte->reltupdesc = ExecTypeFromTL(base_tlist, false); dummy_rte->rtekind = RTE_RELATION; - /* use a dummy relname... */ + + /* Use a dummy relname... 
*/ dummy_rte->relname = "__FOREIGN_QUERY__"; dummy_rte->eref = makeAlias("__FOREIGN_QUERY__", NIL); - /* rest will be zeroed out in makeNode() */ + + /* Rest will be zeroed out in makeNode() */ root->parse->rtable = lappend(root->parse->rtable, dummy_rte); dummy_rtindex = list_length(root->parse->rtable); /* - * change the aggref nodes in the local Agg plan to accept the transition + * Change the aggref nodes in the local Agg plan to accept the transition * results from the remote query output. Do this after we have created base - * list, otherwise we might introduce these changes in the base list + * list, otherwise we might introduce these changes in the base list. * Do this after the RTE for the remote query is added to the root. */ forboth (temp, agg_tlist, temp_remote, base_tlist) @@ -5287,10 +5294,10 @@ create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) Assert(IsA(expr_remote, Aggref)); /* - * replace the args of the local Aggref with Aggref node to be + * Replace the args of the local Aggref with Aggref node to be * included in RemoteQuery node, so that set_plan_refs can convert * the args into VAR pointing to the appropriate result in the tuple - * coming from RemoteQuery node + * coming from RemoteQuery node. * PGXCTODO: should we push this change in targetlists of plans * above? 
*/ @@ -5299,18 +5306,21 @@ create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) } } - /* build rest of the RemoteQuery node and the plan there */ + /* Build rest of the RemoteQuery node and the plan there */ agg_remote_plan = &agg_remote->scan.plan; - /* the join targetlist becomes this node's tlist */ + + /* The join targetlist becomes this node's tlist */ agg_remote_plan->targetlist = base_tlist; agg_remote_plan->lefttree = NULL; agg_remote_plan->righttree = NULL; agg_remote->scan.scanrelid = dummy_rtindex; agg_remote->sql_statement = remote_sql_stmt->data; + /* set_plan_refs needs this later */ agg_remote->base_tlist = base_tlist; agg_remote->relname = "__FOREIGN_QUERY__"; agg_remote->partitioned_replicated = agg_left->partitioned_replicated; + /* * Only quals that can be pushed to the remote side the ones in the having * clause. Till we work out how to handle having quals in XC, we don't have @@ -5319,20 +5329,21 @@ create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) * node, may have local quals. In such case, we have to aggregate and group * at coordinator and can not push the grouping clause to the datanodes. Is * there a case in XC, where we can have local quals? - * we actually need not worry about costs since this is the final plan + * We actually need not worry about costs since this is the final plan. */ agg_remote_plan->startup_cost = agg_left->scan.plan.startup_cost; agg_remote_plan->total_cost = agg_left->scan.plan.total_cost; agg_remote_plan->plan_rows = agg_left->scan.plan.plan_rows; agg_remote_plan->plan_width = agg_left->scan.plan.plan_width; - /* modify the passed in Agg plan according to the remote query we built */ /* - * Materialization is always need for RemoteQuery in case we need to restart - * the scan + * Modify the passed in Agg plan according to the remote query we built. + * Materialization is always needed for RemoteQuery in case we need to restart + * the scan. 
*/ agg_plan->plan.lefttree = (Plan *) make_material(agg_remote_plan); - /* indicate that we should apply collection function directly */ + + /* Indicate that we should apply collection function directly */ agg_plan->skip_trans = true; return (Plan *)agg_plan; @@ -5340,7 +5351,7 @@ create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) /* * create_remotegroup_plan - * given a Group plan, try to push as much of the query to the datanodes and + * Given a Group plan, try to push as much of the query to the datanodes and * build a Group plan to combiner the results across the datanodes. The Sort * node under the Group plan is pushed down to RemoteQuery plan, since the * combiner knows how to merge the results across datanodes in sorted manner. @@ -5385,13 +5396,14 @@ create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) query->hasWindowFuncs) return local_plan; - /* for now only for Group plans */ + /* For now only for Group plans are treated */ Assert(IsA(local_plan, Group)); group_plan = (Group *)local_plan; remote_scan = NULL; temp_plan = local_plan->lefttree; + /* - * we expect plan tree as Group->Sort->{Result}?->{Material}?->RemoteQuery, + * We expect plan tree as Group->Sort->{Result}?->{Material}?->RemoteQuery, * anything else is not handled right now. */ if (IsA(temp_plan, Sort)) @@ -5411,6 +5423,7 @@ create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) Assert(IsA(remote_scan, RemoteQuery)); Assert(IsA(sort_plan, Sort)); + /* * grouping_planner will add Sort node before Group node to sort the rows * based on the columns in GROUP BY clause. Hence the columns in Sort and @@ -5425,26 +5438,28 @@ create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) if (sort_plan->sortColIdx[cntCols] != group_plan->grpColIdx[cntCols]) return local_plan; } + /* * We are now ready to create the RemoteQuery node to push the query to * datanode. - * 1. create a remote query node reflecting the query to be pushed to the - * datanode + * 1. 
Create a remote query node reflecting the query to be pushed to the + * datanode. * 2. Modify the Group node passed in, to accept the results sent by the - * datanodes and group them + * datanodes and group them. */ - local_tlist = local_plan->targetlist; appendStringInfo(in_alias, "%s_%d", "group", root->rs_alias_index); - /* find all the relations referenced by targetlist of Group node */ + /* Find all the relations referenced by targetlist of Group node */ temp_vars = pull_var_clause((Node *)local_tlist, PVC_REJECT_PLACEHOLDERS); findReferencedVars(temp_vars, (Plan *)remote_scan, &temp_vartlist, &in_relids); + /* - * build partial RemoteQuery node to be used for creating the Select clause + * Build partial RemoteQuery node to be used for creating the Select clause * to be sent to the remote node. Rest of the node will be built later */ remote_group = makeNode(RemoteQuery); + /* * Save information about the plan we are reducing. * We may need this information later if more entries are added to it @@ -5456,10 +5471,11 @@ create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) remote_group->inner_relids = in_relids; remote_group->inner_statement = pstrdup(remote_scan->sql_statement); remote_group->exec_nodes = remote_scan->exec_nodes; - /* don't forget to increment the index for the next time around! */ + + /* Don't forget to increment the index for the next time around! 
*/ remote_group->reduce_level = root->rs_alias_index++; - /* generate the select clause of the remote query */ + /* Generate the select clause of the remote query */ appendStringInfoString(remote_targetlist, "SELECT"); foreach (temp, local_tlist) { @@ -5467,17 +5483,18 @@ create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) Node *expr = (Node *)tle->expr; create_remote_expr(root, local_plan, remote_targetlist, expr, remote_group); - /* if this is not last target entry, add a comma with space */ + + /* If this is not last target entry, add a comma with space */ if (lnext(temp)) appendStringInfoString(remote_targetlist, ","); } - /* generate the from clause of the remote query */ + /* Generate the from clause of the remote query */ appendStringInfo(remote_fromlist, "FROM (%s) %s", remote_group->inner_statement, remote_group->inner_alias); /* - * generate group by clause for the remote query and recompute the group by + * Generate group by clause for the remote query and recompute the group by * column locations. We want the tuples from remote node to be ordered by * the grouping columns so that ExecGroup can work without any modification, * hence create a SimpleSort structure to be added to RemoteQuery (which @@ -5490,7 +5507,7 @@ create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) char *sep = ""; /* - * reuse the arrays allocated in sort_plan to create SimpleSort + * Reuse the arrays allocated in sort_plan to create SimpleSort * structure. sort_plan is useless henceforth. 
*/ remote_sort->numCols = group_plan->numCols; @@ -5514,75 +5531,86 @@ create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) remote_group->sort = remote_sort; } - /* generate the remote sql statement from the pieces */ + /* Generate the remote sql statement from the pieces */ appendStringInfo(remote_sql_stmt, "%s %s %s %s", remote_targetlist->data, remote_fromlist->data, groupby_clause->data, orderby_clause->data); + /* - * set the base_tlist for the RemoteQuery node being created, it's used to + * Set the base_tlist for the RemoteQuery node being created, it's used to * create the tuple descriptor for the result from RemoteQuery and rewrite * the Aggregates targetlist accept the results of the RemoteQuery. */ base_tlist = add_to_flat_tlist(NIL, get_tlist_exprs(local_tlist, true)); + /* - * create a dummy RTE for the remote query being created. Append the dummy + * Create a dummy RTE for the remote query being created. Append the dummy * range table entry to the range table. Note that this modifies the master * copy the caller passed us, otherwise e.g EXPLAIN VERBOSE will fail to * find the rte the Vars built below refer to. */ - /* cook up the reltupdesc using this base_tlist */ + + /* Cook up the reltupdesc using this base_tlist */ dummy_rte = makeNode(RangeTblEntry); dummy_rte->reltupdesc = ExecTypeFromTL(base_tlist, false); dummy_rte->rtekind = RTE_RELATION; - /* use a dummy relname... */ + + /* Use a dummy relname... 
*/ dummy_rte->relname = "__FOREIGN_QUERY__"; dummy_rte->eref = makeAlias("__FOREIGN_QUERY__", NIL); - /* rest will be zeroed out in makeNode() */ + + /* Rest will be zeroed out in makeNode() */ root->parse->rtable = lappend(root->parse->rtable, dummy_rte); dummy_rtindex = list_length(root->parse->rtable); - /* build rest of the RemoteQuery node and the plan there */ + /* Build rest of the RemoteQuery node and the plan there */ remote_group_plan = &remote_group->scan.plan; - /* the join targetlist becomes this node's tlist */ + + /* The join targetlist becomes this node's tlist */ remote_group_plan->targetlist = base_tlist; remote_group_plan->lefttree = NULL; remote_group_plan->righttree = NULL; remote_group->scan.scanrelid = dummy_rtindex; remote_group->sql_statement = remote_sql_stmt->data; + /* set_plan_refs needs this later */ remote_group->base_tlist = base_tlist; remote_group->relname = "__FOREIGN_QUERY__"; remote_group->partitioned_replicated = remote_scan->partitioned_replicated; + /* * Only quals that can be pushed to the remote side are the ones in the having * clause. Till we work out how to handle having quals in XC, we don't have * any quals here. + * * PGXCTODO: the RemoteQuery node that was earlier the lefttree of Agg * node, may have local quals. In such case, we have to aggregate and group * at coordinator and can not push the grouping clause to the datanodes. Is * there a case in XC, where we can have local quals? 
*/ - /* we actually need not worry about costs since this is the final plan */ + + /* We actually do not need to worry about costs since this is the final plan */ remote_group_plan->startup_cost = remote_scan->scan.plan.startup_cost; remote_group_plan->total_cost = remote_scan->scan.plan.total_cost; remote_group_plan->plan_rows = remote_scan->scan.plan.plan_rows; remote_group_plan->plan_width = remote_scan->scan.plan.plan_width; - /* modify the passed in Group plan according to the remote query we built */ /* + * Modify the passed in Group plan according to the remote query we built. * Materialization is always need for RemoteQuery in case we need to restart - * the scan + * the scan. */ group_plan->plan.lefttree = (Plan *) make_material(remote_group_plan); return (Plan *)group_plan; } /* - * locates the grouping clauses in the given target list. This is very similar + * pgxc_locate_grouping_columns + * Locates the grouping clauses in the given target list. This is very similar * to locate_grouping_columns except that there is only one target list to - * search into - * PGXCTODO: can we reuse locate_grouping_columns() instead of writing this - * function. But this function is optimized to search in the same target list. + * search into. + * PGXCTODO: Can we reuse locate_grouping_columns() instead of writing this + * function? But this function is optimized to search in the same target list. 
*/ static void pgxc_locate_grouping_columns(PlannerInfo *root, List *tlist, diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index a068d47..07a49c0 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -82,8 +82,6 @@ extern ModifyTable *make_modifytable(CmdType operation, List *resultRelations, List *subplans, List *returningLists, List *rowMarks, int epqParam); extern bool is_projection_capable_plan(Plan *plan); -extern Plan *create_remotegroup_plan(PlannerInfo *root, Plan *local_plan); -extern Plan *create_remoteagg_plan(PlannerInfo *root, Plan *agg_plan); /* * prototypes for plan/initsplan.c @@ -139,5 +137,8 @@ extern Var *search_tlist_for_var(Var *var, List *jtlist); extern Plan *create_remoteinsert_plan(PlannerInfo *root, Plan *topplan); extern Plan *create_remoteupdate_plan(PlannerInfo *root, Plan *topplan); extern Plan *create_remotedelete_plan(PlannerInfo *root, Plan *topplan); +extern Plan *create_remotegroup_plan(PlannerInfo *root, Plan *local_plan); +extern Plan *create_remoteagg_plan(PlannerInfo *root, Plan *agg_plan); #endif + #endif /* PLANMAIN_H */ ----------------------------------------------------------------------- Summary of changes: src/backend/optimizer/plan/createplan.c | 190 ++++++++++++++++++------------- src/include/optimizer/planmain.h | 5 +- 2 files changed, 112 insertions(+), 83 deletions(-) hooks/post-receive -- Postgres-XC |
From: Ashutosh B. <ash...@us...> - 2011-06-01 09:06:31
|
Project "Postgres-XC". The branch, master has been updated via 4a81fdc503a2a7cccc7610cc54db94d4e2d1b857 (commit) from a39d99c7ac6888edc2b1c711839708e1eb7718cf (commit) - Log ----------------------------------------------------------------- commit 4a81fdc503a2a7cccc7610cc54db94d4e2d1b857 Author: Ashutosh Bapat <ash...@en...> Date: Wed Jun 1 14:24:51 2011 +0530 Push GROUP BY clause to the datanode for grouping without aggregation (i.e. when we choose Group plan for grouping. Group plan requires that the input to this node be sorted on columns/expression involved in GROUP BY clause. Hence construct a ORDER BY clause corresponding to GROUP BY clause. Also add the same information to the RemoteQuery node, so that the sorted data from different datanodes can be merged at the coordinator so as to feed the sorted output to the Group node. The number of rows resulting after grouping is lesser or equal to the qualifying rows from the relations/joins. Hence by pushing GROUP BY to the datanodes, we reduce (in rare cases keep same) the number of rows fetched from datanodes, thus saving bandwidth. The optimization works under following restrictions 1. Group plan has Sort->Result->Material->RemoteQuery nodes under it. Result and Material nodes are optional. 2. There is no having clause, ORDER BY clause, windowing clause, Distinct clause in the query. 3. Such grouping uses Sorting for Grouping. (direct implication of first restriction) The patch also adds more test to file xc_groupby. diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 4acef77..b73c08e 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -5054,7 +5054,7 @@ create_remotedelete_plan(PlannerInfo *root, Plan *topplan) } /* - * create_remotegrouping_plan + * create_remoteagg_plan * tries to see if the grouping and aggregates can be pushed down to the * datanodes. 
* Right now we can push with following restrictions @@ -5073,7 +5073,7 @@ create_remotedelete_plan(PlannerInfo *root, Plan *topplan) * node in case there are no local clauses. */ Plan * -create_remotegrouping_plan(PlannerInfo *root, Plan *local_plan) +create_remoteagg_plan(PlannerInfo *root, Plan *local_plan) { Query *query = root->parse; RemoteQuery *agg_left; @@ -5339,6 +5339,245 @@ create_remotegrouping_plan(PlannerInfo *root, Plan *local_plan) } /* + * create_remotegroup_plan + * given a Group plan, try to push as much of the query to the datanodes and + * build a Group plan to combiner the results across the datanodes. The Sort + * node under the Group plan is pushed down to RemoteQuery plan, since the + * combiner knows how to merge the results across datanodes in sorted manner. + * Hence there is no separate Sort node. + * + * This optimization is applied under following conditions + * 1. The scan plans under the Group->Sort node is RemoteQuery + * 2. There is not separate Sort, distinct, having clause in the query. + * + * PGXCTODO: we should lift up as many of these restrictions as possible or give + * reasons why those restrictions are needed. 
+ */ +Plan * +create_remotegroup_plan(PlannerInfo *root, Plan *local_plan) +{ + Group *group_plan; + Query *query = root->parse; + Sort *sort_plan; + RemoteQuery *remote_scan; /* remote query in the passed in plan */ + RemoteQuery *remote_group; /* remote query after optimization */ + Plan *remote_group_plan; /* plan portion of remote_group */ + Plan *temp_plan; + List *local_tlist; /* target list of the local plan */ + List *temp_vars; /* temporarily hold the VARs */ + List *temp_vartlist; /* temporarity hold tlist of VARs */ + ListCell *temp; + StringInfo remote_targetlist = makeStringInfo();/* SELECT clause of remote query */ + StringInfo remote_sql_stmt = makeStringInfo(); + StringInfo groupby_clause = makeStringInfo(); /* remote query GROUP BY */ + StringInfo orderby_clause = makeStringInfo(); /* remote query ORDER BY */ + StringInfo remote_fromlist = makeStringInfo(); /* remote query FROM */ + StringInfo in_alias = makeStringInfo(); + Relids in_relids; + Index dummy_rtindex; + List *base_tlist; + RangeTblEntry *dummy_rte; + int cntCols; + + if (query->havingQual || + query->distinctClause || + query->sortClause || + query->hasWindowFuncs) + return local_plan; + + /* for now only for Group plans */ + Assert(IsA(local_plan, Group)); + group_plan = (Group *)local_plan; + remote_scan = NULL; + temp_plan = local_plan->lefttree; + /* + * we expect plan tree as Group->Sort->{Result}?->{Material}?->RemoteQuery, + * anything else is not handled right now. 
+ */ + if (IsA(temp_plan, Sort)) + { + sort_plan = (Sort *)temp_plan; + temp_plan = temp_plan->lefttree; + } + if (IsA(temp_plan, Result)) + temp_plan = temp_plan->lefttree; + if (IsA(temp_plan, Material)) + temp_plan = temp_plan->lefttree; + if (IsA(temp_plan, RemoteQuery)) + remote_scan = (RemoteQuery *)temp_plan; + + if (!remote_scan || !sort_plan) + return local_plan; + + Assert(IsA(remote_scan, RemoteQuery)); + Assert(IsA(sort_plan, Sort)); + /* + * grouping_planner will add Sort node before Group node to sort the rows + * based on the columns in GROUP BY clause. Hence the columns in Sort and + * those in Group node in should be same. The columns are usually in the + * same order in both nodes, hence check the equality in order. If this + * condition fails, we can not handle this GROUP plan for now. + */ + if (sort_plan->numCols != group_plan->numCols) + return local_plan; + for (cntCols = 0; cntCols < group_plan->numCols; cntCols++) + { + if (sort_plan->sortColIdx[cntCols] != group_plan->grpColIdx[cntCols]) + return local_plan; + } + /* + * We are now ready to create the RemoteQuery node to push the query to + * datanode. + * 1. create a remote query node reflecting the query to be pushed to the + * datanode + * 2. Modify the Group node passed in, to accept the results sent by the + * datanodes and group them + */ + + local_tlist = local_plan->targetlist; + appendStringInfo(in_alias, "%s_%d", "group", root->rs_alias_index); + + /* find all the relations referenced by targetlist of Group node */ + temp_vars = pull_var_clause((Node *)local_tlist, PVC_REJECT_PLACEHOLDERS); + findReferencedVars(temp_vars, (Plan *)remote_scan, &temp_vartlist, &in_relids); + /* + * build partial RemoteQuery node to be used for creating the Select clause + * to be sent to the remote node. Rest of the node will be built later + */ + remote_group = makeNode(RemoteQuery); + /* + * Save information about the plan we are reducing. 
+ * We may need this information later if more entries are added to it + * as part of the remote expression optimization. + */ + remote_group->remotejoin = false; + remote_group->inner_alias = pstrdup(in_alias->data); + remote_group->inner_reduce_level = remote_scan->reduce_level; + remote_group->inner_relids = in_relids; + remote_group->inner_statement = pstrdup(remote_scan->sql_statement); + remote_group->exec_nodes = remote_scan->exec_nodes; + /* don't forget to increment the index for the next time around! */ + remote_group->reduce_level = root->rs_alias_index++; + + /* generate the select clause of the remote query */ + appendStringInfoString(remote_targetlist, "SELECT"); + foreach (temp, local_tlist) + { + TargetEntry *tle = lfirst(temp); + Node *expr = (Node *)tle->expr; + + create_remote_expr(root, local_plan, remote_targetlist, expr, remote_group); + /* if this is not last target entry, add a comma with space */ + if (lnext(temp)) + appendStringInfoString(remote_targetlist, ","); + } + + /* generate the from clause of the remote query */ + appendStringInfo(remote_fromlist, "FROM (%s) %s", + remote_group->inner_statement, remote_group->inner_alias); + + /* + * generate group by clause for the remote query and recompute the group by + * column locations. We want the tuples from remote node to be ordered by + * the grouping columns so that ExecGroup can work without any modification, + * hence create a SimpleSort structure to be added to RemoteQuery (which + * will merge the sorted results and present to Group node in sorted + * manner). + */ + if (query->groupClause) + { + SimpleSort *remote_sort = makeNode(SimpleSort); + char *sep = ""; + + /* + * reuse the arrays allocated in sort_plan to create SimpleSort + * structure. sort_plan is useless henceforth. 
+ */ + remote_sort->numCols = group_plan->numCols; + remote_sort->sortColIdx = sort_plan->sortColIdx; + remote_sort->sortOperators = sort_plan->sortOperators; + remote_sort->nullsFirst = sort_plan->nullsFirst; + + pgxc_locate_grouping_columns(root, local_tlist, group_plan->grpColIdx); + + appendStringInfoString(groupby_clause, "GROUP BY "); + appendStringInfoString(orderby_clause, "ORDER BY "); + for (cntCols = 0; cntCols < group_plan->numCols; cntCols++) + { + appendStringInfo(groupby_clause, "%s%d", sep, + group_plan->grpColIdx[cntCols]); + remote_sort->sortColIdx[cntCols] = group_plan->grpColIdx[cntCols]; + appendStringInfo(orderby_clause, "%s%d", sep, + remote_sort->sortColIdx[cntCols]); + sep = ", "; + } + remote_group->sort = remote_sort; + } + + /* generate the remote sql statement from the pieces */ + appendStringInfo(remote_sql_stmt, "%s %s %s %s", remote_targetlist->data, + remote_fromlist->data, groupby_clause->data, + orderby_clause->data); + /* + * set the base_tlist for the RemoteQuery node being created, it's used to + * create the tuple descriptor for the result from RemoteQuery and rewrite + * the Aggregates targetlist accept the results of the RemoteQuery. + */ + base_tlist = add_to_flat_tlist(NIL, get_tlist_exprs(local_tlist, true)); + /* + * create a dummy RTE for the remote query being created. Append the dummy + * range table entry to the range table. Note that this modifies the master + * copy the caller passed us, otherwise e.g EXPLAIN VERBOSE will fail to + * find the rte the Vars built below refer to. + */ + /* cook up the reltupdesc using this base_tlist */ + dummy_rte = makeNode(RangeTblEntry); + dummy_rte->reltupdesc = ExecTypeFromTL(base_tlist, false); + dummy_rte->rtekind = RTE_RELATION; + /* use a dummy relname... 
*/ + dummy_rte->relname = "__FOREIGN_QUERY__"; + dummy_rte->eref = makeAlias("__FOREIGN_QUERY__", NIL); + /* rest will be zeroed out in makeNode() */ + root->parse->rtable = lappend(root->parse->rtable, dummy_rte); + dummy_rtindex = list_length(root->parse->rtable); + + /* build rest of the RemoteQuery node and the plan there */ + remote_group_plan = &remote_group->scan.plan; + /* the join targetlist becomes this node's tlist */ + remote_group_plan->targetlist = base_tlist; + remote_group_plan->lefttree = NULL; + remote_group_plan->righttree = NULL; + remote_group->scan.scanrelid = dummy_rtindex; + remote_group->sql_statement = remote_sql_stmt->data; + /* set_plan_refs needs this later */ + remote_group->base_tlist = base_tlist; + remote_group->relname = "__FOREIGN_QUERY__"; + remote_group->partitioned_replicated = remote_scan->partitioned_replicated; + /* + * Only quals that can be pushed to the remote side are the ones in the having + * clause. Till we work out how to handle having quals in XC, we don't have + * any quals here. + * PGXCTODO: the RemoteQuery node that was earlier the lefttree of Agg + * node, may have local quals. In such case, we have to aggregate and group + * at coordinator and can not push the grouping clause to the datanodes. Is + * there a case in XC, where we can have local quals? 
+ */ + /* we actually need not worry about costs since this is the final plan */ + remote_group_plan->startup_cost = remote_scan->scan.plan.startup_cost; + remote_group_plan->total_cost = remote_scan->scan.plan.total_cost; + remote_group_plan->plan_rows = remote_scan->scan.plan.plan_rows; + remote_group_plan->plan_width = remote_scan->scan.plan.plan_width; + + /* modify the passed in Group plan according to the remote query we built */ + /* + * Materialization is always need for RemoteQuery in case we need to restart + * the scan + */ + group_plan->plan.lefttree = (Plan *) make_material(remote_group_plan); + return (Plan *)group_plan; +} + +/* * locates the grouping clauses in the given target list. This is very similar * to locate_grouping_columns except that there is only one target list to * search into diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index aa629f6..182a18f 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -1336,10 +1336,13 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) result_plan); #ifdef PGXC /* - * can we push any clauses to the remote node? try doing that + * Grouping will certainly not increase the number of rows + * coordinator fetches from datanode, in fact it's expected to + * reduce the number drastically. Hence, try pushing GROUP BY + * clauses and aggregates to the datanode, thus saving bandwidth. 
*/ if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) - result_plan = create_remotegrouping_plan(root, result_plan); + result_plan = create_remoteagg_plan(root, result_plan); #endif /* PGXC */ /* Hashed aggregation produces randomly-ordered results */ current_pathkeys = NIL; @@ -1412,7 +1415,16 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) extract_grouping_ops(parse->groupClause), dNumGroups, result_plan); - /* The Group node won't change sort ordering */ +#ifdef PGXC + /* + * Grouping will certainly not increase the number of rows + * coordinator fetches from datanode, in fact it's expected to + * reduce the number drastically. Hence, try pushing GROUP BY + * clauses and aggregates to the datanode, thus saving bandwidth. + */ + if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) + result_plan = create_remotegroup_plan(root, result_plan); +#endif /* PGXC */ } else if (root->hasHavingQual) { diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index 71aba2b..a068d47 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -82,7 +82,8 @@ extern ModifyTable *make_modifytable(CmdType operation, List *resultRelations, List *subplans, List *returningLists, List *rowMarks, int epqParam); extern bool is_projection_capable_plan(Plan *plan); -extern Plan *create_remotegrouping_plan(PlannerInfo *root, Plan *agg_plan); +extern Plan *create_remotegroup_plan(PlannerInfo *root, Plan *local_plan); +extern Plan *create_remoteagg_plan(PlannerInfo *root, Plan *agg_plan); /* * prototypes for plan/initsplan.c diff --git a/src/test/regress/expected/xc_groupby.out b/src/test/regress/expected/xc_groupby.out index 672084a..e403e37 100644 --- a/src/test/regress/expected/xc_groupby.out +++ b/src/test/regress/expected/xc_groupby.out @@ -66,7 +66,8 @@ explain verbose select sum(y) from (select sum(val) y, val2%2 x from tab1 group Output: tab1.val, tab1.val2 (8 rows) --- group by without aggregate, just like distinct? 
+-- group by without aggregate +set enable_hashagg to off; select val2 from tab1 group by val2; val2 ------ @@ -78,7 +79,7 @@ select val2 from tab1 group by val2; explain verbose select val2 from tab1 group by val2; QUERY PLAN ---------------------------------------------------------------------------------- - HashAggregate (cost=1.02..1.03 rows=1 width=4) + Group (cost=1.02..1.03 rows=1 width=4) Output: tab1.val2 -> Materialize (cost=0.00..0.00 rows=0 width=0) Output: tab1.val2 @@ -86,6 +87,95 @@ explain verbose select val2 from tab1 group by val2; Output: tab1.val2 (6 rows) +select val + val2 from tab1 group by val + val2; + ?column? +---------- + 2 + 3 + 4 + 7 + 8 + 9 +(6 rows) + +explain verbose select val + val2 from tab1 group by val + val2; + QUERY PLAN +---------------------------------------------------------------------------------- + Group (cost=1.03..1.04 rows=1 width=8) + Output: ((tab1.val + tab1.val2)) + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: ((tab1.val + tab1.val2)) + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=8) + Output: (tab1.val + tab1.val2) +(6 rows) + +select val + val2, val, val2 from tab1 group by val, val2; + ?column? | val | val2 +----------+-----+------ + 2 | 1 | 1 + 4 | 1 | 3 + 3 | 2 | 1 + 4 | 2 | 2 + 4 | 3 | 1 + 7 | 4 | 3 + 8 | 6 | 2 + 9 | 6 | 3 +(8 rows) + +explain verbose select val + val2, val, val2 from tab1 group by val, val2; + QUERY PLAN +---------------------------------------------------------------------------------- + Group (cost=1.02..1.04 rows=1 width=8) + Output: ((tab1.val + tab1.val2)), tab1.val, tab1.val2 + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: ((tab1.val + tab1.val2)), tab1.val, tab1.val2 + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1000 width=8) + Output: (tab1.val + tab1.val2), tab1.val, tab1.val2 +(6 rows) + +select tab1.val + tab2.val2, tab1.val, tab2.val2 from tab1, tab2 where tab1.val = tab2.val group by tab1.val, tab2.val2; + ?column? 
| val | val2 +----------+-----+------ + 2 | 1 | 1 + 6 | 2 | 4 + 5 | 3 | 2 + 7 | 3 | 4 + 5 | 4 | 1 + 6 | 4 | 2 +(6 rows) + +explain verbose select tab1.val + tab2.val2, tab1.val, tab2.val2 from tab1, tab2 where tab1.val = tab2.val group by tab1.val, tab2.val2; + QUERY PLAN +------------------------------------------------------------------------------- + Group (cost=0.01..0.02 rows=1 width=0) + Output: ((tab1.val + tab2.val2)), tab1.val, tab2.val2 + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: ((tab1.val + tab2.val2)), tab1.val, tab2.val2 + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1 width=4) + Output: (tab1.val + tab2.val2), tab1.val, tab2.val2 +(6 rows) + +select tab1.val + tab2.val2 from tab1, tab2 where tab1.val = tab2.val group by tab1.val + tab2.val2; + ?column? +---------- + 2 + 5 + 6 + 7 +(4 rows) + +explain verbose select tab1.val + tab2.val2 from tab1, tab2 where tab1.val = tab2.val group by tab1.val + tab2.val2; + QUERY PLAN +------------------------------------------------------------------------------- + Group (cost=0.01..0.02 rows=1 width=0) + Output: ((tab1.val + tab2.val2)) + -> Materialize (cost=0.00..0.00 rows=0 width=0) + Output: ((tab1.val + tab2.val2)) + -> Data Node Scan (Node Count [2]) (cost=0.00..1.01 rows=1 width=4) + Output: (tab1.val + tab2.val2) +(6 rows) + +reset enable_hashagg; -- group by with aggregates in expression select count(*) + sum(val) + avg(val), val2 from tab1 group by val2; ?column? | val2 @@ -210,7 +300,8 @@ explain verbose select sum(y) from (select sum(val) y, val2%2 x from tab1 group Output: tab1.val, tab1.val2 (8 rows) --- group by without aggregate, just like distinct? 
+-- group by without aggregate +set enable_hashagg to off; select val2 from tab1 group by val2; val2 ------ @@ -222,7 +313,7 @@ select val2 from tab1 group by val2; explain verbose select val2 from tab1 group by val2; QUERY PLAN ---------------------------------------------------------------------------------- - HashAggregate (cost=1.02..1.03 rows=1 width=4) + Group (cost=1.02..1.03 rows=1 width=4) Output: tab1.val2 -> Materialize (cost=0.00..0.00 rows=0 width=0) Output: tab1.val2 @@ -230,6 +321,95 @@ explain verbose select val2 from tab1 group by val2; Output: tab1.val2 (6 rows) +select val + val2 from tab1 group by val + val2; + ?column? +---------- + 2 + 3 + 4 + 7 + |
From: Koichi S. <koi...@us...> - 2011-06-01 08:49:22
|
Project "Postgres-XC". The branch, ha_support has been updated via 612265336a7d8bc0666983f8b4d6c165aa15efb5 (commit) from 029f58652a406ed58ee392b317b358dc7841353f (commit) - Log ----------------------------------------------------------------- commit 612265336a7d8bc0666983f8b4d6c165aa15efb5 Author: Koichi Suzuki <koi...@gm...> Date: Wed Jun 1 17:41:42 2011 +0900 This commit corrects some problem related to node registration from gtm-proxy and gtm-standby. Problems scatters in various files and the causes are: 1) Some character strings obtained by pq_getmsgbytes() were left unterminated. Cleaned up most of them but there may be some more related issues not directly related to pq_getmsgbytes(). So far, as much as tested in five server environement, it works correctly. 2) Corrected node_get_local_addr() to obtain "local" address information. Was obtaining "remote" address instead. Modifed fies are: modified: src/gtm/client/gtm_client.c modified: src/gtm/main/gtm_txn.c modified: src/gtm/main/main.c modified: src/gtm/proxy/proxy_main.c modified: src/gtm/recovery/register.c diff --git a/src/gtm/client/gtm_client.c b/src/gtm/client/gtm_client.c index d4b98f3..c9dea5b 100644 --- a/src/gtm/client/gtm_client.c +++ b/src/gtm/client/gtm_client.c @@ -1182,31 +1182,31 @@ send_failed: char * node_get_local_addr(GTM_Conn *conn, char *buf, size_t buflen, int *rc) { - char remote_host[NI_MAXHOST]; - char remote_port[NI_MAXSERV]; + char local_host[NI_MAXHOST]; + char local_port[NI_MAXSERV]; *rc = 0; - memset(remote_host, 0, sizeof(remote_host)); - memset(remote_port, 0, sizeof(remote_port)); + memset(local_host, 0, sizeof(local_host)); + memset(local_port, 0, sizeof(local_port)); memset(buf, 0, buflen); if (conn->remote_type != PGXC_NODE_GTM_PROXY) { - if (gtm_getnameinfo_all(&conn->raddr.addr, conn->raddr.salen, - remote_host, sizeof(remote_host), - remote_port, sizeof(remote_port), + if (gtm_getnameinfo_all(&conn->laddr.addr, conn->laddr.salen, + local_host, sizeof(local_host), + 
local_port, sizeof(local_port), NI_NUMERICSERV)) { - *rc = gtm_getnameinfo_all(&conn->raddr.addr, conn->raddr.salen, - remote_host, sizeof(remote_host), - remote_port, sizeof(remote_port), + *rc = gtm_getnameinfo_all(&conn->laddr.addr, conn->laddr.salen, + local_host, sizeof(local_host), + local_port, sizeof(local_port), NI_NUMERICHOST | NI_NUMERICSERV); } } - if ( remote_host[0]!='\0' ) - strncpy(buf, remote_host, buflen); + if ( local_host[0]!='\0' ) + strncpy(buf, local_host, buflen); return buf; } diff --git a/src/gtm/main/gtm_txn.c b/src/gtm/main/gtm_txn.c index 0c6c238..809cc06 100644 --- a/src/gtm/main/gtm_txn.c +++ b/src/gtm/main/gtm_txn.c @@ -1514,7 +1514,7 @@ void ProcessGetGIDDataTransactionCommand(Port *myport, StringInfo message) { StringInfoData buf; - char *gid; + char gid[1024]; int gidlen; GTM_IsolationLevel txn_isolation_level; bool txn_read_only; @@ -1531,7 +1531,8 @@ ProcessGetGIDDataTransactionCommand(Port *myport, StringInfo message) /* receive GID */ gidlen = pq_getmsgint(message, sizeof (GTM_StrLen)); - gid = (char *)pq_getmsgbytes(message, gidlen); + memcpy(gid, (char *)pq_getmsgbytes(message, gidlen), gidlen); + gid[gidlen] = '\0'; pq_getmsgend(message); @@ -1971,7 +1972,7 @@ ProcessStartPreparedTransactionCommand(Port *myport, StringInfo message) PGXC_NodeId *coordinators = NULL; PGXC_NodeId *datanodes = NULL; MemoryContext oldContext; - char *gid; + char gid[1024]; isgxid = pq_getmsgbyte(message); @@ -1997,7 +1998,8 @@ ProcessStartPreparedTransactionCommand(Port *myport, StringInfo message) /* get GID */ gidlen = pq_getmsgint(message, sizeof (GTM_StrLen)); - gid = (char *)pq_getmsgbytes(message, gidlen); + memcpy(gid, (char *)pq_getmsgbytes(message, gidlen), gidlen); + gid[gidlen] = '\0'; /* Get Datanode Count Data */ datanodecnt = pq_getmsgint(message, 4); diff --git a/src/gtm/main/main.c b/src/gtm/main/main.c index 66096f5..96c90ed 100644 --- a/src/gtm/main/main.c +++ b/src/gtm/main/main.c @@ -421,7 +421,7 @@ main(int argc, char 
*argv[]) if ( gtm_is_standby() ) { - if ( !gtm_standby_register_self(node_num, GTMPortNumber, GTMDataDir) ) + if ( !gtm_standby_register_self(node_num, GTMPortNumber, GTMDataDir) ) { elog(ERROR, "Failed to register myself on the active-GTM as a GTM node."); exit(1); diff --git a/src/gtm/proxy/proxy_main.c b/src/gtm/proxy/proxy_main.c index 183a877..e0b5ca9 100644 --- a/src/gtm/proxy/proxy_main.c +++ b/src/gtm/proxy/proxy_main.c @@ -2847,10 +2847,13 @@ RegisterProxy(void) gtmpqPutInt(MSG_NODE_REGISTER, sizeof (GTM_MessageType), master_conn) || gtmpqPutnchar((char *)&type, sizeof(GTM_PGXCNodeType), master_conn) || gtmpqPutnchar((char *)>MProxyID, sizeof(GTM_PGXCNodeId), master_conn) || /* nodenum */ + gtmpqPutInt((int)strlen(ListenAddresses), sizeof(int), master_conn) || + gtmpqPutnchar(ListenAddresses, (int)strlen(ListenAddresses), master_conn) || gtmpqPutnchar((char *)&port, sizeof(GTM_PGXCNodePort), master_conn) || gtmpqPutnchar((char *)&proxynum, sizeof(GTM_PGXCNodeId), master_conn) || - gtmpqPutInt(strlen(GTMProxyDataDir), 4, master_conn) || - gtmpqPutnchar(GTMProxyDataDir, strlen(GTMProxyDataDir), master_conn)) + gtmpqPutInt((int)strlen(GTMProxyDataDir), 4, master_conn) || + gtmpqPutnchar(GTMProxyDataDir, strlen(GTMProxyDataDir), master_conn)|| + gtmpqPutInt(NODE_CONNECTED, sizeof(GTM_PGXCNodeStatus), master_conn)) goto failed; /* Finish the message. */ diff --git a/src/gtm/recovery/register.c b/src/gtm/recovery/register.c index 6fa11ff..f1a61fd 100644 --- a/src/gtm/recovery/register.c +++ b/src/gtm/recovery/register.c @@ -309,12 +309,13 @@ pgxcnode_copy_char(const char *str) * contextes. 
*/ retstr = (char *) MemoryContextAlloc(TopMostMemoryContext, - strlen(str)); + strlen(str) + 1); if (retstr == NULL) ereport(ERROR, (ENOMEM, errmsg("Out of memory"))); memcpy(retstr, str, strlen(str)); + retstr[strlen(str)] = '\0'; return retstr; } @@ -385,8 +386,10 @@ Recovery_PGXCNodeRegister(GTM_PGXCNodeType type, nodeinfo->status = status; nodeinfo->socket = socket; - elog(LOG, "type=%d, nodenum=%d, port=%d, datafolder=%s, ipaddress=%s, status=%d", + elog(LOG, "Recovery_PGXCNodeRegister Request info: type=%d, nodenum=%d, port=%d, datafolder=%s, ipaddress=%s, status=%d", type, nodenum, port, datafolder, ipaddress, status); + elog(LOG, "Recovery_PGXCNodeRegister Node info: type=%d, nodenum=%d, port=%d, datafolder=%s, ipaddress=%s, status=%d", + nodeinfo->type, nodeinfo->nodenum, nodeinfo->port, nodeinfo->datafolder, nodeinfo->ipaddress, nodeinfo->status); /* Add PGXC Node Info to the global hash table */ errcode = pgxcnode_add_info(nodeinfo); @@ -413,7 +416,7 @@ ProcessPGXCNodeRegister(Port *myport, StringInfo message) GTM_PGXCNodePort port; char remote_host[NI_MAXHOST]; char remote_port[NI_MAXSERV]; - char *datafolder; + char datafolder[NI_MAXHOST]; char *ipaddress; MemoryContext oldContext; int strlen; @@ -425,7 +428,7 @@ ProcessPGXCNodeRegister(Port *myport, StringInfo message) remote_port[0] = '\0'; memset(remote_host, 0, sizeof(remote_host)); - +#if 0 if (myport->remote_type != PGXC_NODE_GTM_PROXY) { if (gtm_getnameinfo_all(&myport->raddr.addr, myport->raddr.salen, @@ -443,7 +446,7 @@ ProcessPGXCNodeRegister(Port *myport, StringInfo message) (errmsg_internal("gtm_getnameinfo_all() failed"))); } } - +#endif /* Read Node Type */ memcpy(&type, pq_getmsgbytes(message, sizeof (GTM_PGXCNodeType)), sizeof (GTM_PGXCNodeType)); @@ -474,7 +477,7 @@ ProcessPGXCNodeRegister(Port *myport, StringInfo message) else ipaddress = remote_host; #else - strlen = pq_getmsgint(message, sizeof (GTM_StrLen)); + strlen = pq_getmsgint(message, sizeof (int)); memcpy(remote_host, (char 
*)pq_getmsgbytes(message, strlen), strlen); remote_host[strlen] = '\0'; ipaddress = remote_host; @@ -488,14 +491,19 @@ ProcessPGXCNodeRegister(Port *myport, StringInfo message) /* Read Proxy ID number (0 if no proxy used) */ memcpy(&proxynum, pq_getmsgbytes(message, sizeof (GTM_PGXCNodeId)), sizeof (GTM_PGXCNodeId)); - elog(LOG, "ProcessPGXCNodeRegister: ipaddress = %s", ipaddress); /* * Finish by reading Data Folder (length and then string) */ strlen = pq_getmsgint(message, sizeof (GTM_StrLen)); + /* The next code was incorrect. Fixed. */ +#if 1 + memcpy(datafolder, (char *)pq_getmsgbytes(message, strlen), strlen); + datafolder[strlen] = '\0'; +#else datafolder = (char *)pq_getmsgbytes(message, strlen); +#endif status = pq_getmsgint(message, sizeof (GTM_PGXCNodeStatus)); ----------------------------------------------------------------------- Summary of changes: src/gtm/client/gtm_client.c | 24 ++++++++++++------------ src/gtm/main/gtm_txn.c | 10 ++++++---- src/gtm/main/main.c | 2 +- src/gtm/proxy/proxy_main.c | 7 +++++-- src/gtm/recovery/register.c | 22 +++++++++++++++------- 5 files changed, 39 insertions(+), 26 deletions(-) hooks/post-receive -- Postgres-XC |