Restore preprocess_groupclause()

author Alexander Korotkov <[email protected]>

Thu, 6 Jun 2024 10:44:34 +0000 (13:44 +0300)

committer Alexander Korotkov <[email protected]>

Thu, 6 Jun 2024 10:44:34 +0000 (13:44 +0300)
author Alexander Korotkov <[email protected]>
Thu, 6 Jun 2024 10:44:34 +0000 (13:44 +0300)
committer Alexander Korotkov <[email protected]>
Thu, 6 Jun 2024 10:44:34 +0000 (13:44 +0300)
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c

index 0a25b6384bbb0a4d4ba33023a70c2e0d6008fe4f..416fc4e240bb7195d38b3edf51e917370a8ab3a3 100644 (file)
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -447,26 +447,6 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys,
     return n;
  }
  
-/*
- * pathkeys_are_duplicate
- *     Check if give pathkeys are already contained the list of
- *     GroupByOrdering's.
- */
-static bool
-pathkeys_are_duplicate(List *infos, List *pathkeys)
-{
-   ListCell   *lc;
-
-   foreach(lc, infos)
-   {
-       GroupByOrdering *info = lfirst_node(GroupByOrdering, lc);
-
-       if (compare_pathkeys(pathkeys, info->pathkeys) == PATHKEYS_EQUAL)
-           return true;
-   }
-   return false;
-}
-
  /*
   * get_useful_group_keys_orderings
   *     Determine which orderings of GROUP BY keys are potentially interesting.
@@ -475,11 +455,11 @@ pathkeys_are_duplicate(List *infos, List *pathkeys)
   * ordering of GROUP BY keys.  Each item stores pathkeys and clauses in the
   * matching order.
   *
- * The function considers (and keeps) multiple GROUP BY orderings:
+ * The function considers (and keeps) the following GROUP BY orderings:
   *
- * - the original ordering, as specified by the GROUP BY clause,
- * - GROUP BY keys reordered to match 'path' ordering (as much as possible),
- * - GROUP BY keys to match target ORDER BY clause (as much as possible).
+ * - GROUP BY keys as ordered by preprocess_groupclause() to match target
+ *   ORDER BY clause (as much as possible),
+ * - GROUP BY keys reordered to match 'path' ordering (as much as possible).
   */
  List *
  get_useful_group_keys_orderings(PlannerInfo *root, Path *path)
@@ -526,32 +506,7 @@ get_useful_group_keys_orderings(PlannerInfo *root, Path *path)
  
         if (n > 0 &&
             (enable_incremental_sort || n == root->num_groupby_pathkeys) &&
-           !pathkeys_are_duplicate(infos, pathkeys))
-       {
-           info = makeNode(GroupByOrdering);
-           info->pathkeys = pathkeys;
-           info->clauses = clauses;
-
-           infos = lappend(infos, info);
-       }
-   }
-
-   /*
-    * Try reordering pathkeys to minimize the sort cost (this time consider
-    * the ORDER BY clause).
-    */
-   if (root->sort_pathkeys &&
-       !pathkeys_contained_in(root->sort_pathkeys, root->group_pathkeys))
-   {
-       int         n;
-
-       n = group_keys_reorder_by_pathkeys(root->sort_pathkeys, &pathkeys,
-                                          &clauses,
-                                          root->num_groupby_pathkeys);
-
-       if (n > 0 &&
-           (enable_incremental_sort || n == list_length(root->sort_pathkeys)) &&
-           !pathkeys_are_duplicate(infos, pathkeys))
+           compare_pathkeys(pathkeys, root->group_pathkeys) != PATHKEYS_EQUAL)
         {
             info = makeNode(GroupByOrdering);
             info->pathkeys = pathkeys;
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c

index 084c6796d0395c90b94ff144c6977169fa3f2c06..4711f912390f6bb40b6ba6159e1f06502e301020 100644 (file)
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -137,7 +137,7 @@ static double preprocess_limit(PlannerInfo *root,
                                double tuple_fraction,
                                int64 *offset_est, int64 *count_est);
  static void remove_useless_groupby_columns(PlannerInfo *root);
-static List *groupclause_apply_groupingset(PlannerInfo *root, List *force);
+static List *preprocess_groupclause(PlannerInfo *root, List *force);
  static List *extract_rollup_sets(List *groupingSets);
  static List *reorder_grouping_sets(List *groupingSets, List *sortclause);
  static void standard_qp_callback(PlannerInfo *root, void *extra);
@@ -1422,7 +1422,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
         else if (parse->groupClause)
         {
             /* Preprocess regular GROUP BY clause, if any */
-           root->processed_groupClause = list_copy(parse->groupClause);
+           root->processed_groupClause = preprocess_groupclause(root, NIL);
             /* Remove any redundant GROUP BY columns */
             remove_useless_groupby_columns(root);
         }
@@ -2169,7 +2169,7 @@ preprocess_grouping_sets(PlannerInfo *root)
          * The groupClauses for hashed grouping sets are built later on.)
          */
         if (gs->set)
-           rollup->groupClause = groupclause_apply_groupingset(root, gs->set);
+           rollup->groupClause = preprocess_groupclause(root, gs->set);
         else
             rollup->groupClause = NIL;
  
@@ -2821,24 +2821,106 @@ remove_useless_groupby_columns(PlannerInfo *root)
  }
  
  /*
- * groupclause_apply_groupingset
- *     Apply the order of GROUP BY clauses defined by grouping sets.  Items
- *     not in the grouping set are skipped.
+ * preprocess_groupclause - do preparatory work on GROUP BY clause
+ *
+ * The idea here is to adjust the ordering of the GROUP BY elements
+ * (which in itself is semantically insignificant) to match ORDER BY,
+ * thereby allowing a single sort operation to both implement the ORDER BY
+ * requirement and set up for a Unique step that implements GROUP BY.
+ * We also consider a partial match between GROUP BY and ORDER BY elements,
+ * which could allow ORDER BY to be implemented using incremental sort.
+ *
+ * We also consider other orderings of the GROUP BY elements, which could
+ * match the sort ordering of other possible plans (eg an indexscan) and
+ * thereby reduce cost.  This is implemented during the generation of grouping
+ * paths.  See get_useful_group_keys_orderings() for details.
+ *
+ * Note: we need no comparable processing of the distinctClause because
+ * the parser already enforced that that matches ORDER BY.
+ *
+ * Note: we return a fresh List, but its elements are the same
+ * SortGroupClauses appearing in parse->groupClause.  This is important
+ * because later processing may modify the processed_groupClause list.
+ *
+ * For grouping sets, the order of items is instead forced to agree with that
+ * of the grouping set (and items not in the grouping set are skipped). The
+ * work of sorting the order of grouping set elements to match the ORDER BY if
+ * possible is done elsewhere.
   */
  static List *
-groupclause_apply_groupingset(PlannerInfo *root, List *gset)
+preprocess_groupclause(PlannerInfo *root, List *force)
  {
     Query      *parse = root->parse;
     List       *new_groupclause = NIL;
     ListCell   *sl;
+   ListCell   *gl;
  
-   foreach(sl, gset)
+   /* For grouping sets, we need to force the ordering */
+   if (force)
     {
-       Index       ref = lfirst_int(sl);
-       SortGroupClause *cl = get_sortgroupref_clause(ref, parse->groupClause);
+       foreach(sl, force)
+       {
+           Index       ref = lfirst_int(sl);
+           SortGroupClause *cl = get_sortgroupref_clause(ref, parse->groupClause);
+
+           new_groupclause = lappend(new_groupclause, cl);
+       }
  
-       new_groupclause = lappend(new_groupclause, cl);
+       return new_groupclause;
     }
+
+   /* If no ORDER BY, nothing useful to do here */
+   if (parse->sortClause == NIL)
+       return list_copy(parse->groupClause);
+
+   /*
+    * Scan the ORDER BY clause and construct a list of matching GROUP BY
+    * items, but only as far as we can make a matching prefix.
+    *
+    * This code assumes that the sortClause contains no duplicate items.
+    */
+   foreach(sl, parse->sortClause)
+   {
+       SortGroupClause *sc = lfirst_node(SortGroupClause, sl);
+
+       foreach(gl, parse->groupClause)
+       {
+           SortGroupClause *gc = lfirst_node(SortGroupClause, gl);
+
+           if (equal(gc, sc))
+           {
+               new_groupclause = lappend(new_groupclause, gc);
+               break;
+           }
+       }
+       if (gl == NULL)
+           break;              /* no match, so stop scanning */
+   }
+
+
+   /* If no match at all, no point in reordering GROUP BY */
+   if (new_groupclause == NIL)
+       return list_copy(parse->groupClause);
+
+   /*
+    * Add any remaining GROUP BY items to the new list.  We don't require a
+    * complete match, because even a partial match allows ORDER BY to be
+    * implemented using incremental sort.  Also, give up if there are any
+    * non-sortable GROUP BY items, since then there's no hope anyway.
+    */
+   foreach(gl, parse->groupClause)
+   {
+       SortGroupClause *gc = lfirst_node(SortGroupClause, gl);
+
+       if (list_member_ptr(new_groupclause, gc))
+           continue;           /* it matched an ORDER BY item */
+       if (!OidIsValid(gc->sortop))    /* give up, GROUP BY can't be sorted */
+           return list_copy(parse->groupClause);
+       new_groupclause = lappend(new_groupclause, gc);
+   }
+
+   /* Success --- install the rearranged GROUP BY list */
+   Assert(list_length(parse->groupClause) == list_length(new_groupclause));
     return new_groupclause;
  }
  
@@ -4170,7 +4252,7 @@ consider_groupingsets_paths(PlannerInfo *root,
             {
                 rollup = makeNode(RollupData);
  
-               rollup->groupClause = groupclause_apply_groupingset(root, gset);
+               rollup->groupClause = preprocess_groupclause(root, gset);
                 rollup->gsets_data = list_make1(gs);
                 rollup->gsets = remap_to_groupclause_idx(rollup->groupClause,
                                                          rollup->gsets_data,
@@ -4359,7 +4441,7 @@ consider_groupingsets_paths(PlannerInfo *root,
  
             Assert(gs->set != NIL);
  
-           rollup->groupClause = groupclause_apply_groupingset(root, gs->set);
+           rollup->groupClause = preprocess_groupclause(root, gs->set);
             rollup->gsets_data = list_make1(gs);
             rollup->gsets = remap_to_groupclause_idx(rollup->groupClause,
                                                      rollup->gsets_data,
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h

index 78489398294d84dd92c9f22c098bf321d0f122da..2ba297c117290d13660e3e1d0763dec050d98e63 100644 (file)
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -426,7 +426,11 @@ struct PlannerInfo
      * items to be proven redundant, implying that there is only one group
      * containing all the query's rows.  Hence, if you want to check whether
      * GROUP BY was specified, test for nonempty parse->groupClause, not for
-    * nonempty processed_groupClause.
+    * nonempty processed_groupClause.  The optimizer chooses a specific order
+    * of group-by clauses during upper path generation, trying different
+    * strategies to minimize the number of sorts or to make use of
+    * incremental sort.  See preprocess_groupclause() and
+    * get_useful_group_keys_orderings() for details.
      *
      * Currently, when grouping sets are specified we do not attempt to
      * optimize the groupClause, so that processed_groupClause will be
diff --git a/src/test/regress/expected/partition_aggregate.out b/src/test/regress/expected/partition_aggregate.out

index 1b900fddf8ea552631434032c17fd5947940583a..5f2c0cf5786e3eb817924cfd40d4a35324eef297 100644 (file)
--- a/src/test/regress/expected/partition_aggregate.out
+++ b/src/test/regress/expected/partition_aggregate.out
@@ -901,15 +901,15 @@ SELECT a, c, sum(b), avg(c), count(*) FROM pagg_tab_m GROUP BY (a+b)/2, 2, 1 HAV
     Sort Key: pagg_tab_m.a, pagg_tab_m.c, (sum(pagg_tab_m.b))
     ->  Append
           ->  HashAggregate
-               Group Key: ((pagg_tab_m.a + pagg_tab_m.b) / 2), pagg_tab_m.c, pagg_tab_m.a
+               Group Key: pagg_tab_m.a, pagg_tab_m.c, ((pagg_tab_m.a + pagg_tab_m.b) / 2)
                 Filter: ((sum(pagg_tab_m.b) = 50) AND (avg(pagg_tab_m.c) > '25'::numeric))
                 ->  Seq Scan on pagg_tab_m_p1 pagg_tab_m
           ->  HashAggregate
-               Group Key: ((pagg_tab_m_1.a + pagg_tab_m_1.b) / 2), pagg_tab_m_1.c, pagg_tab_m_1.a
+               Group Key: pagg_tab_m_1.a, pagg_tab_m_1.c, ((pagg_tab_m_1.a + pagg_tab_m_1.b) / 2)
                 Filter: ((sum(pagg_tab_m_1.b) = 50) AND (avg(pagg_tab_m_1.c) > '25'::numeric))
                 ->  Seq Scan on pagg_tab_m_p2 pagg_tab_m_1
           ->  HashAggregate
-               Group Key: ((pagg_tab_m_2.a + pagg_tab_m_2.b) / 2), pagg_tab_m_2.c, pagg_tab_m_2.a
+               Group Key: pagg_tab_m_2.a, pagg_tab_m_2.c, ((pagg_tab_m_2.a + pagg_tab_m_2.b) / 2)
                 Filter: ((sum(pagg_tab_m_2.b) = 50) AND (avg(pagg_tab_m_2.c) > '25'::numeric))
                 ->  Seq Scan on pagg_tab_m_p3 pagg_tab_m_2
  (15 rows)
author	Alexander Korotkov <[email protected]>
	Thu, 6 Jun 2024 10:44:34 +0000 (13:44 +0300)
committer	Alexander Korotkov <[email protected]>
	Thu, 6 Jun 2024 10:44:34 +0000 (13:44 +0300)
src/backend/optimizer/path/pathkeys.c		patch \| blob \| blame \| history
src/backend/optimizer/plan/planner.c		patch \| blob \| blame \| history
src/include/nodes/pathnodes.h		patch \| blob \| blame \| history
src/test/regress/expected/partition_aggregate.out		patch \| blob \| blame \| history