Skip to content

Commit 3c6fc58

Browse files
committed
Have the planner consider Incremental Sort for DISTINCT
Prior to this, we only considered a full sort on the cheapest input path and uniquifying any path which was already sorted in the required sort order. Here we adjust create_final_distinct_paths() so that it also adds an Incremental Sort path on any path which has presorted keys. Additionally, this adjusts the parallel distinct code so that we now consider sorting the cheapest partial path and incrementally sorting any partial paths with presorted keys. Previously we didn't consider any sorting for parallel distinct and only added a unique path atop any path which had the required pathkeys already. Author: David Rowley Reviewed-by: Richard Guo Discussion: https://postgr.es/m/CAApHDvo8Lz2H=42urBbfP65LTcEUOh288MT7DsG2_EWtW1AXHQ@mail.gmail.com
1 parent e5b8a4c commit 3c6fc58

File tree

5 files changed

+173
-111
lines changed

5 files changed

+173
-111
lines changed

src/backend/optimizer/plan/planner.c

+129-93
Original file line numberDiff line numberDiff line change
@@ -4654,22 +4654,63 @@ create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
46544654
cheapest_partial_path->rows,
46554655
NULL, NULL);
46564656

4657-
/* first try adding unique paths atop of sorted paths */
4657+
/*
4658+
* Try sorting the cheapest path and incrementally sorting any paths with
4659+
* presorted keys and put a unique path atop each of those.
4660+
*/
46584661
if (grouping_is_sortable(parse->distinctClause))
46594662
{
46604663
foreach(lc, input_rel->partial_pathlist)
46614664
{
4662-
Path *path = (Path *) lfirst(lc);
4665+
Path *input_path = (Path *) lfirst(lc);
4666+
Path *sorted_path;
4667+
bool is_sorted;
4668+
int presorted_keys;
46634669

4664-
if (pathkeys_contained_in(root->distinct_pathkeys, path->pathkeys))
4670+
is_sorted = pathkeys_count_contained_in(root->distinct_pathkeys,
4671+
input_path->pathkeys,
4672+
&presorted_keys);
4673+
4674+
if (is_sorted)
4675+
sorted_path = input_path;
4676+
else
46654677
{
4666-
add_partial_path(partial_distinct_rel, (Path *)
4667-
create_upper_unique_path(root,
4668-
partial_distinct_rel,
4669-
path,
4670-
list_length(root->distinct_pathkeys),
4671-
numDistinctRows));
4678+
/*
4679+
* Try at least sorting the cheapest path and also try
4680+
* incrementally sorting any path which is partially sorted
4681+
* already (no need to deal with paths which have presorted
4682+
* keys when incremental sort is disabled unless it's the
4683+
* cheapest partial path).
4684+
*/
4685+
if (input_path != cheapest_partial_path &&
4686+
(presorted_keys == 0 || !enable_incremental_sort))
4687+
continue;
4688+
4689+
/*
4690+
* We've no need to consider both a sort and incremental sort.
4691+
* We'll just do a sort if there are no presorted keys and an
4692+
* incremental sort when there are presorted keys.
4693+
*/
4694+
if (presorted_keys == 0 || !enable_incremental_sort)
4695+
sorted_path = (Path *) create_sort_path(root,
4696+
partial_distinct_rel,
4697+
input_path,
4698+
root->distinct_pathkeys,
4699+
-1.0);
4700+
else
4701+
sorted_path = (Path *) create_incremental_sort_path(root,
4702+
partial_distinct_rel,
4703+
input_path,
4704+
root->distinct_pathkeys,
4705+
presorted_keys,
4706+
-1.0);
46724707
}
4708+
4709+
add_partial_path(partial_distinct_rel, (Path *)
4710+
create_upper_unique_path(root, partial_distinct_rel,
4711+
sorted_path,
4712+
list_length(root->distinct_pathkeys),
4713+
numDistinctRows));
46734714
}
46744715
}
46754716

@@ -4773,9 +4814,11 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
47734814
if (grouping_is_sortable(parse->distinctClause))
47744815
{
47754816
/*
4776-
* First, if we have any adequately-presorted paths, just stick a
4777-
* Unique node on those. Then consider doing an explicit sort of the
4778-
* cheapest input path and Unique'ing that.
4817+
* Firstly, if we have any adequately-presorted paths, just stick a
4818+
* Unique node on those. We also consider doing an explicit sort of
4819+
* the cheapest input path and Unique'ing that. If any paths have
4820+
* presorted keys then we'll create an incremental sort atop of those
4821+
* before adding a unique node on the top.
47794822
*
47804823
* When we have DISTINCT ON, we must sort by the more rigorous of
47814824
* DISTINCT and ORDER BY, else it won't have the desired behavior.
@@ -4785,8 +4828,8 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
47854828
* the other.)
47864829
*/
47874830
List *needed_pathkeys;
4788-
Path *path;
47894831
ListCell *lc;
4832+
double limittuples = root->distinct_pathkeys == NIL ? 1.0 : -1.0;
47904833

47914834
if (parse->hasDistinctOn &&
47924835
list_length(root->distinct_pathkeys) <
@@ -4797,96 +4840,89 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
47974840

47984841
foreach(lc, input_rel->pathlist)
47994842
{
4800-
path = (Path *) lfirst(lc);
4843+
Path *input_path = (Path *) lfirst(lc);
4844+
Path *sorted_path;
4845+
bool is_sorted;
4846+
int presorted_keys;
48014847

4802-
if (pathkeys_contained_in(needed_pathkeys, path->pathkeys))
4848+
is_sorted = pathkeys_count_contained_in(needed_pathkeys,
4849+
input_path->pathkeys,
4850+
&presorted_keys);
4851+
4852+
if (is_sorted)
4853+
sorted_path = input_path;
4854+
else
48034855
{
48044856
/*
4805-
* distinct_pathkeys may have become empty if all of the
4806-
* pathkeys were determined to be redundant. If all of the
4807-
* pathkeys are redundant then each DISTINCT target must only
4808-
* allow a single value, therefore all resulting tuples must
4809-
* be identical (or at least indistinguishable by an equality
4810-
* check). We can uniquify these tuples simply by just taking
4811-
* the first tuple. All we do here is add a path to do "LIMIT
4812-
* 1" atop of 'path'. When doing a DISTINCT ON we may still
4813-
* have a non-NIL sort_pathkeys list, so we must still only do
4814-
* this with paths which are correctly sorted by
4815-
* sort_pathkeys.
4857+
* Try at least sorting the cheapest path and also try
4858+
* incrementally sorting any path which is partially sorted
4859+
* already (no need to deal with paths which have presorted
4860+
* keys when incremental sort is disabled unless it's the
4861+
* cheapest input path).
48164862
*/
4817-
if (root->distinct_pathkeys == NIL)
4818-
{
4819-
Node *limitCount;
4820-
4821-
limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
4822-
sizeof(int64),
4823-
Int64GetDatum(1), false,
4824-
FLOAT8PASSBYVAL);
4863+
if (input_path != cheapest_input_path &&
4864+
(presorted_keys == 0 || !enable_incremental_sort))
4865+
continue;
48254866

4826-
/*
4827-
* If the query already has a LIMIT clause, then we could
4828-
* end up with a duplicate LimitPath in the final plan.
4829-
* That does not seem worth troubling over too much.
4830-
*/
4831-
add_path(distinct_rel, (Path *)
4832-
create_limit_path(root, distinct_rel, path, NULL,
4833-
limitCount, LIMIT_OPTION_COUNT,
4834-
0, 1));
4835-
}
4867+
/*
4868+
* We've no need to consider both a sort and incremental sort.
4869+
* We'll just do a sort if there are no presorted keys and an
4870+
* incremental sort when there are presorted keys.
4871+
*/
4872+
if (presorted_keys == 0 || !enable_incremental_sort)
4873+
sorted_path = (Path *) create_sort_path(root,
4874+
distinct_rel,
4875+
input_path,
4876+
needed_pathkeys,
4877+
limittuples);
48364878
else
4837-
{
4838-
add_path(distinct_rel, (Path *)
4839-
create_upper_unique_path(root, distinct_rel,
4840-
path,
4841-
list_length(root->distinct_pathkeys),
4842-
numDistinctRows));
4843-
}
4879+
sorted_path = (Path *) create_incremental_sort_path(root,
4880+
distinct_rel,
4881+
input_path,
4882+
needed_pathkeys,
4883+
presorted_keys,
4884+
limittuples);
48444885
}
4845-
}
48464886

4847-
/* For explicit-sort case, always use the more rigorous clause */
4848-
if (list_length(root->distinct_pathkeys) <
4849-
list_length(root->sort_pathkeys))
4850-
{
4851-
needed_pathkeys = root->sort_pathkeys;
4852-
/* Assert checks that parser didn't mess up... */
4853-
Assert(pathkeys_contained_in(root->distinct_pathkeys,
4854-
needed_pathkeys));
4855-
}
4856-
else
4857-
needed_pathkeys = root->distinct_pathkeys;
4887+
/*
4888+
* distinct_pathkeys may have become empty if all of the pathkeys
4889+
* were determined to be redundant. If all of the pathkeys are
4890+
* redundant then each DISTINCT target must only allow a single
4891+
* value, therefore all resulting tuples must be identical (or at
4892+
* least indistinguishable by an equality check). We can uniquify
4893+
* these tuples simply by just taking the first tuple. All we do
4894+
* here is add a path to do "LIMIT 1" atop of 'sorted_path'. When
4895+
* doing a DISTINCT ON we may still have a non-NIL sort_pathkeys
4896+
* list, so we must still only do this with paths which are
4897+
* correctly sorted by sort_pathkeys.
4898+
*/
4899+
if (root->distinct_pathkeys == NIL)
4900+
{
4901+
Node *limitCount;
48584902

4859-
path = cheapest_input_path;
4860-
if (!pathkeys_contained_in(needed_pathkeys, path->pathkeys))
4861-
path = (Path *) create_sort_path(root, distinct_rel,
4862-
path,
4863-
needed_pathkeys,
4864-
root->distinct_pathkeys == NIL ?
4865-
1.0 : -1.0);
4903+
limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
4904+
sizeof(int64),
4905+
Int64GetDatum(1), false,
4906+
FLOAT8PASSBYVAL);
48664907

4867-
/*
4868-
* As above, use a LimitPath instead of a UniquePath when all of the
4869-
* distinct_pathkeys are redundant and we're only going to get a
4870-
* series of tuples all with the same values anyway.
4871-
*/
4872-
if (root->distinct_pathkeys == NIL)
4873-
{
4874-
Node *limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
4875-
sizeof(int64),
4876-
Int64GetDatum(1), false,
4877-
FLOAT8PASSBYVAL);
4878-
4879-
add_path(distinct_rel, (Path *)
4880-
create_limit_path(root, distinct_rel, path, NULL,
4881-
limitCount, LIMIT_OPTION_COUNT, 0, 1));
4882-
}
4883-
else
4884-
{
4885-
add_path(distinct_rel, (Path *)
4886-
create_upper_unique_path(root, distinct_rel,
4887-
path,
4888-
list_length(root->distinct_pathkeys),
4889-
numDistinctRows));
4908+
/*
4909+
* If the query already has a LIMIT clause, then we could end
4910+
* up with a duplicate LimitPath in the final plan. That does
4911+
* not seem worth troubling over too much.
4912+
*/
4913+
add_path(distinct_rel, (Path *)
4914+
create_limit_path(root, distinct_rel, sorted_path,
4915+
NULL, limitCount,
4916+
LIMIT_OPTION_COUNT, 0, 1));
4917+
}
4918+
else
4919+
{
4920+
add_path(distinct_rel, (Path *)
4921+
create_upper_unique_path(root, distinct_rel,
4922+
sorted_path,
4923+
list_length(root->distinct_pathkeys),
4924+
numDistinctRows));
4925+
}
48904926
}
48914927
}
48924928

src/test/regress/expected/incremental_sort.out

+7-6
Original file line numberDiff line numberDiff line change
@@ -1484,15 +1484,16 @@ explain (costs off) select * from t union select * from t order by 1,3;
14841484
-- Full sort, not just incremental sort can be pushed below a gather merge path
14851485
-- by generate_useful_gather_paths.
14861486
explain (costs off) select distinct a,b from t;
1487-
QUERY PLAN
1488-
------------------------------------------
1487+
QUERY PLAN
1488+
------------------------------------------------
14891489
Unique
14901490
-> Gather Merge
14911491
Workers Planned: 2
1492-
-> Sort
1493-
Sort Key: a, b
1494-
-> Parallel Seq Scan on t
1495-
(6 rows)
1492+
-> Unique
1493+
-> Sort
1494+
Sort Key: a, b
1495+
-> Parallel Seq Scan on t
1496+
(7 rows)
14961497

14971498
drop table t;
14981499
-- Sort pushdown can't go below where expressions are part of the rel target.

src/test/regress/expected/select_distinct.out

+23-8
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,20 @@ SELECT DISTINCT g%1000 FROM generate_series(0,9999) g;
171171
SET jit_above_cost TO DEFAULT;
172172
CREATE TABLE distinct_group_2 AS
173173
SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g;
174+
SET enable_seqscan = 0;
175+
-- Check to see we get an incremental sort plan
176+
EXPLAIN (costs off)
177+
SELECT DISTINCT hundred, two FROM tenk1;
178+
QUERY PLAN
179+
-----------------------------------------------------
180+
Unique
181+
-> Incremental Sort
182+
Sort Key: hundred, two
183+
Presorted Key: hundred
184+
-> Index Scan using tenk1_hundred on tenk1
185+
(5 rows)
186+
187+
RESET enable_seqscan;
174188
SET enable_hashagg=TRUE;
175189
-- Produce results with hash aggregation.
176190
SET enable_sort=FALSE;
@@ -265,15 +279,16 @@ $$ LANGUAGE plpgsql PARALLEL SAFE;
265279
-- Ensure we do parallel distinct now that the function is parallel safe
266280
EXPLAIN (COSTS OFF)
267281
SELECT DISTINCT distinct_func(1) FROM tenk1;
268-
QUERY PLAN
269-
----------------------------------------------
282+
QUERY PLAN
283+
----------------------------------------------------
270284
Unique
271-
-> Sort
272-
Sort Key: (distinct_func(1))
273-
-> Gather
274-
Workers Planned: 2
275-
-> Parallel Seq Scan on tenk1
276-
(6 rows)
285+
-> Gather Merge
286+
Workers Planned: 2
287+
-> Unique
288+
-> Sort
289+
Sort Key: (distinct_func(1))
290+
-> Parallel Seq Scan on tenk1
291+
(7 rows)
277292

278293
RESET max_parallel_workers_per_gather;
279294
RESET min_parallel_table_scan_size;

src/test/regress/expected/window.out

+6-4
Original file line numberDiff line numberDiff line change
@@ -3944,8 +3944,9 @@ ORDER BY depname, enroll_date;
39443944
QUERY PLAN
39453945
-----------------------------------------------------------------------------------------------
39463946
Unique
3947-
-> Sort
3947+
-> Incremental Sort
39483948
Sort Key: depname, enroll_date, empno, (sum(salary) OVER (?)), (min(salary) OVER (?))
3949+
Presorted Key: depname, enroll_date
39493950
-> WindowAgg
39503951
-> Incremental Sort
39513952
Sort Key: depname, enroll_date
@@ -3954,7 +3955,7 @@ ORDER BY depname, enroll_date;
39543955
-> Sort
39553956
Sort Key: depname, empno
39563957
-> Seq Scan on empsalary
3957-
(11 rows)
3958+
(12 rows)
39583959

39593960
-- As above but adjust the ORDER BY clause to help ensure the plan with the
39603961
-- minimum amount of sorting wasn't a fluke.
@@ -3970,8 +3971,9 @@ ORDER BY depname, empno;
39703971
QUERY PLAN
39713972
-----------------------------------------------------------------------------------------------
39723973
Unique
3973-
-> Sort
3974+
-> Incremental Sort
39743975
Sort Key: depname, empno, enroll_date, (sum(salary) OVER (?)), (min(salary) OVER (?))
3976+
Presorted Key: depname, empno
39753977
-> WindowAgg
39763978
-> Incremental Sort
39773979
Sort Key: depname, empno
@@ -3980,7 +3982,7 @@ ORDER BY depname, empno;
39803982
-> Sort
39813983
Sort Key: depname, enroll_date
39823984
-> Seq Scan on empsalary
3983-
(11 rows)
3985+
(12 rows)
39843986

39853987
RESET enable_hashagg;
39863988
-- Test Sort node reordering

src/test/regress/sql/select_distinct.sql

+8
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,14 @@ SET jit_above_cost TO DEFAULT;
6969
CREATE TABLE distinct_group_2 AS
7070
SELECT DISTINCT (g%1000)::text FROM generate_series(0,9999) g;
7171

72+
SET enable_seqscan = 0;
73+
74+
-- Check to see we get an incremental sort plan
75+
EXPLAIN (costs off)
76+
SELECT DISTINCT hundred, two FROM tenk1;
77+
78+
RESET enable_seqscan;
79+
7280
SET enable_hashagg=TRUE;
7381

7482
-- Produce results with hash aggregation.

0 commit comments

Comments
 (0)