From: Alexander Korotkov Date: Thu, 6 Jun 2024 10:44:34 +0000 (+0300) Subject: Restore preprocess_groupclause() X-Git-Tag: REL_17_BETA2~88 X-Git-Url: http://git.postgresql.org/gitweb/?a=commitdiff_plain;h=505c008ca37c4f6f2fffcde370b5d8354c4d4dc3;p=postgresql.git Restore preprocess_groupclause() 0452b461bc made optimizer explore alternative orderings of group-by pathkeys. It eliminated preprocess_groupclause(), which was intended to match items between GROUP BY and ORDER BY. Instead, get_useful_group_keys_orderings() function generates orderings of GROUP BY elements at the time of grouping paths generation. The get_useful_group_keys_orderings() function takes into account 3 orderings of GROUP BY pathkeys and clauses: original order as written in GROUP BY, matching ORDER BY clauses as much as possible, and matching the input path as much as possible. Given that even before 0452b461b, preprocess_groupclause() could change the original order of GROUP BY clauses we don't need to consider it apart from ordering matching ORDER BY clauses. This commit restores preprocess_groupclause() to provide an ordering of GROUP BY elements matching ORDER BY before generation of paths. The new version of preprocess_groupclause() takes into account an incremental sort. The get_useful_group_keys_orderings() function now takes into 2 orderings of GROUP BY elements: the order generated preprocess_groupclause() and the order matching the input path as much as possible. Discussion: https://postgr.es/m/CAPpHfdvyWLMGwvxaf%3D7KAp-z-4mxbSH8ti2f6mNOQv5metZFzg%40mail.gmail.com Author: Alexander Korotkov Reviewed-by: Andrei Lepikhov, Pavel Borisov --- diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c index 0a25b6384bb..416fc4e240b 100644 --- a/src/backend/optimizer/path/pathkeys.c +++ b/src/backend/optimizer/path/pathkeys.c @@ -447,26 +447,6 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys, return n; } -/* - * pathkeys_are_duplicate - * Check if give pathkeys are already contained the list of - * GroupByOrdering's. - */ -static bool -pathkeys_are_duplicate(List *infos, List *pathkeys) -{ - ListCell *lc; - - foreach(lc, infos) - { - GroupByOrdering *info = lfirst_node(GroupByOrdering, lc); - - if (compare_pathkeys(pathkeys, info->pathkeys) == PATHKEYS_EQUAL) - return true; - } - return false; -} - /* * get_useful_group_keys_orderings * Determine which orderings of GROUP BY keys are potentially interesting. @@ -475,11 +455,11 @@ pathkeys_are_duplicate(List *infos, List *pathkeys) * ordering of GROUP BY keys. Each item stores pathkeys and clauses in the * matching order. * - * The function considers (and keeps) multiple GROUP BY orderings: + * The function considers (and keeps) following GROUP BY orderings: * - * - the original ordering, as specified by the GROUP BY clause, - * - GROUP BY keys reordered to match 'path' ordering (as much as possible), - * - GROUP BY keys to match target ORDER BY clause (as much as possible). + * - GROUP BY keys as ordered by preprocess_groupclause() to match target + * ORDER BY clause (as much as possible), + * - GROUP BY keys reordered to match 'path' ordering (as much as possible). */ List * get_useful_group_keys_orderings(PlannerInfo *root, Path *path) @@ -526,32 +506,7 @@ get_useful_group_keys_orderings(PlannerInfo *root, Path *path) if (n > 0 && (enable_incremental_sort || n == root->num_groupby_pathkeys) && - !pathkeys_are_duplicate(infos, pathkeys)) - { - info = makeNode(GroupByOrdering); - info->pathkeys = pathkeys; - info->clauses = clauses; - - infos = lappend(infos, info); - } - } - - /* - * Try reordering pathkeys to minimize the sort cost (this time consider - * the ORDER BY clause). - */ - if (root->sort_pathkeys && - !pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys)) - { - int n; - - n = group_keys_reorder_by_pathkeys(root->sort_pathkeys, &pathkeys, - &clauses, - root->num_groupby_pathkeys); - - if (n > 0 && - (enable_incremental_sort || n == list_length(root->sort_pathkeys)) && - !pathkeys_are_duplicate(infos, pathkeys)) + compare_pathkeys(pathkeys, root->group_pathkeys) != PATHKEYS_EQUAL) { info = makeNode(GroupByOrdering); info->pathkeys = pathkeys; diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 084c6796d03..4711f912390 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -137,7 +137,7 @@ static double preprocess_limit(PlannerInfo *root, double tuple_fraction, int64 *offset_est, int64 *count_est); static void remove_useless_groupby_columns(PlannerInfo *root); -static List *groupclause_apply_groupingset(PlannerInfo *root, List *force); +static List *preprocess_groupclause(PlannerInfo *root, List *force); static List *extract_rollup_sets(List *groupingSets); static List *reorder_grouping_sets(List *groupingSets, List *sortclause); static void standard_qp_callback(PlannerInfo *root, void *extra); @@ -1422,7 +1422,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction, else if (parse->groupClause) { /* Preprocess regular GROUP BY clause, if any */ - root->processed_groupClause = list_copy(parse->groupClause); + root->processed_groupClause = preprocess_groupclause(root, NIL); /* Remove any redundant GROUP BY columns */ remove_useless_groupby_columns(root); } @@ -2169,7 +2169,7 @@ preprocess_grouping_sets(PlannerInfo *root) * The groupClauses for hashed grouping sets are built later on.) */ if (gs->set) - rollup->groupClause = groupclause_apply_groupingset(root, gs->set); + rollup->groupClause = preprocess_groupclause(root, gs->set); else rollup->groupClause = NIL; @@ -2821,24 +2821,106 @@ remove_useless_groupby_columns(PlannerInfo *root) } /* - * groupclause_apply_groupingset - * Apply the order of GROUP BY clauses defined by grouping sets. Items - * not in the grouping set are skipped. + * preprocess_groupclause - do preparatory work on GROUP BY clause + * + * The idea here is to adjust the ordering of the GROUP BY elements + * (which in itself is semantically insignificant) to match ORDER BY, + * thereby allowing a single sort operation to both implement the ORDER BY + * requirement and set up for a Unique step that implements GROUP BY. + * We also consider partial match between GROUP BY and ORDER BY elements, + * which could allow to implement ORDER BY using the incremental sort. + * + * We also consider other orderings of the GROUP BY elements, which could + * match the sort ordering of other possible plans (eg an indexscan) and + * thereby reduce cost. This is implemented during the generation of grouping + * paths. See get_useful_group_keys_orderings() for details. + * + * Note: we need no comparable processing of the distinctClause because + * the parser already enforced that that matches ORDER BY. + * + * Note: we return a fresh List, but its elements are the same + * SortGroupClauses appearing in parse->groupClause. This is important + * because later processing may modify the processed_groupClause list. + * + * For grouping sets, the order of items is instead forced to agree with that + * of the grouping set (and items not in the grouping set are skipped). The + * work of sorting the order of grouping set elements to match the ORDER BY if + * possible is done elsewhere. */ static List * -groupclause_apply_groupingset(PlannerInfo *root, List *gset) +preprocess_groupclause(PlannerInfo *root, List *force) { Query *parse = root->parse; List *new_groupclause = NIL; ListCell *sl; + ListCell *gl; - foreach(sl, gset) + /* For grouping sets, we need to force the ordering */ + if (force) { - Index ref = lfirst_int(sl); - SortGroupClause *cl = get_sortgroupref_clause(ref, parse->groupClause); + foreach(sl, force) + { + Index ref = lfirst_int(sl); + SortGroupClause *cl = get_sortgroupref_clause(ref, parse->groupClause); + + new_groupclause = lappend(new_groupclause, cl); + } - new_groupclause = lappend(new_groupclause, cl); + return new_groupclause; } + + /* If no ORDER BY, nothing useful to do here */ + if (parse->sortClause == NIL) + return list_copy(parse->groupClause); + + /* + * Scan the ORDER BY clause and construct a list of matching GROUP BY + * items, but only as far as we can make a matching prefix. + * + * This code assumes that the sortClause contains no duplicate items. + */ + foreach(sl, parse->sortClause) + { + SortGroupClause *sc = lfirst_node(SortGroupClause, sl); + + foreach(gl, parse->groupClause) + { + SortGroupClause *gc = lfirst_node(SortGroupClause, gl); + + if (equal(gc, sc)) + { + new_groupclause = lappend(new_groupclause, gc); + break; + } + } + if (gl == NULL) + break; /* no match, so stop scanning */ + } + + + /* If no match at all, no point in reordering GROUP BY */ + if (new_groupclause == NIL) + return list_copy(parse->groupClause); + + /* + * Add any remaining GROUP BY items to the new list. We don't require a + * complete match, because even partial match allows ORDER BY to be + * implemented using incremental sort. Also, give up if there are any + * non-sortable GROUP BY items, since then there's no hope anyway. + */ + foreach(gl, parse->groupClause) + { + SortGroupClause *gc = lfirst_node(SortGroupClause, gl); + + if (list_member_ptr(new_groupclause, gc)) + continue; /* it matched an ORDER BY item */ + if (!OidIsValid(gc->sortop)) /* give up, GROUP BY can't be sorted */ + return list_copy(parse->groupClause); + new_groupclause = lappend(new_groupclause, gc); + } + + /* Success --- install the rearranged GROUP BY list */ + Assert(list_length(parse->groupClause) == list_length(new_groupclause)); return new_groupclause; } @@ -4170,7 +4252,7 @@ consider_groupingsets_paths(PlannerInfo *root, { rollup = makeNode(RollupData); - rollup->groupClause = groupclause_apply_groupingset(root, gset); + rollup->groupClause = preprocess_groupclause(root, gset); rollup->gsets_data = list_make1(gs); rollup->gsets = remap_to_groupclause_idx(rollup->groupClause, rollup->gsets_data, @@ -4359,7 +4441,7 @@ consider_groupingsets_paths(PlannerInfo *root, Assert(gs->set != NIL); - rollup->groupClause = groupclause_apply_groupingset(root, gs->set); + rollup->groupClause = preprocess_groupclause(root, gs->set); rollup->gsets_data = list_make1(gs); rollup->gsets = remap_to_groupclause_idx(rollup->groupClause, rollup->gsets_data, diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 78489398294..2ba297c1172 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -426,7 +426,11 @@ struct PlannerInfo * items to be proven redundant, implying that there is only one group * containing all the query's rows. Hence, if you want to check whether * GROUP BY was specified, test for nonempty parse->groupClause, not for - * nonempty processed_groupClause. + * nonempty processed_groupClause. Optimizer chooses specific order of + * group-by clauses during the upper paths generation process, attempting + * to use different strategies to minimize number of sorts or engage + * incremental sort. See preprocess_groupclause() and + * get_useful_group_keys_orderings() for details. * * Currently, when grouping sets are specified we do not attempt to * optimize the groupClause, so that processed_groupClause will be diff --git a/src/test/regress/expected/partition_aggregate.out b/src/test/regress/expected/partition_aggregate.out index 1b900fddf8e..5f2c0cf5786 100644 --- a/src/test/regress/expected/partition_aggregate.out +++ b/src/test/regress/expected/partition_aggregate.out @@ -901,15 +901,15 @@ SELECT a, c, sum(b), avg(c), count(*) FROM pagg_tab_m GROUP BY (a+b)/2, 2, 1 HAV Sort Key: pagg_tab_m.a, pagg_tab_m.c, (sum(pagg_tab_m.b)) -> Append -> HashAggregate - Group Key: ((pagg_tab_m.a + pagg_tab_m.b) / 2), pagg_tab_m.c, pagg_tab_m.a + Group Key: pagg_tab_m.a, pagg_tab_m.c, ((pagg_tab_m.a + pagg_tab_m.b) / 2) Filter: ((sum(pagg_tab_m.b) = 50) AND (avg(pagg_tab_m.c) > '25'::numeric)) -> Seq Scan on pagg_tab_m_p1 pagg_tab_m -> HashAggregate - Group Key: ((pagg_tab_m_1.a + pagg_tab_m_1.b) / 2), pagg_tab_m_1.c, pagg_tab_m_1.a + Group Key: pagg_tab_m_1.a, pagg_tab_m_1.c, ((pagg_tab_m_1.a + pagg_tab_m_1.b) / 2) Filter: ((sum(pagg_tab_m_1.b) = 50) AND (avg(pagg_tab_m_1.c) > '25'::numeric)) -> Seq Scan on pagg_tab_m_p2 pagg_tab_m_1 -> HashAggregate - Group Key: ((pagg_tab_m_2.a + pagg_tab_m_2.b) / 2), pagg_tab_m_2.c, pagg_tab_m_2.a + Group Key: pagg_tab_m_2.a, pagg_tab_m_2.c, ((pagg_tab_m_2.a + pagg_tab_m_2.b) / 2) Filter: ((sum(pagg_tab_m_2.b) = 50) AND (avg(pagg_tab_m_2.c) > '25'::numeric)) -> Seq Scan on pagg_tab_m_p3 pagg_tab_m_2 (15 rows)