
Commit 22c4e88

Allow parallel DISTINCT
We've supported parallel aggregation since e06a389. At the time, we didn't quite get around to also adding parallel DISTINCT. So, let's do that now.

This is implemented by introducing a two-phase DISTINCT. Phase 1 is performed in the parallel workers, where rows are made distinct either by hashing or by sort/unique. The results from the parallel workers are then combined, and a final distinct phase is performed serially to get rid of any duplicate rows that remain because each worker only de-duplicated its own share of the rows.

Author: David Rowley
Reviewed-by: Zhihong Yu
Discussion: https://postgr.es/m/CAApHDvrjRxVKwQN0he79xS+9wyotFXL=RmoWqGGO2N45Farpgw@mail.gmail.com
1 parent 26ae660 commit 22c4e88
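To make the two phases concrete, here is the plan from the regression test added in this commit (src/test/regress/expected/select_distinct.out), annotated by hand. The test first zeroes parallel_tuple_cost, parallel_setup_cost and min_parallel_table_scan_size so that a parallel plan is chosen on the small test table; whether phase 1 uses a HashAggregate or a Sort/Unique depends on costing and on enable_hashagg.

EXPLAIN (COSTS OFF) SELECT DISTINCT four FROM tenk1;

 Unique                                    <- phase 2: final DISTINCT, run serially
   ->  Sort
         Sort Key: four
         ->  Gather                        <- combine rows from the workers
               Workers Planned: 2
               ->  HashAggregate           <- phase 1: partial DISTINCT in each worker
                     Group Key: four
                     ->  Parallel Seq Scan on tenk1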

File tree

5 files changed: +292 -33 lines changed

  src/backend/optimizer/README
  src/backend/optimizer/plan/planner.c
  src/include/nodes/pathnodes.h
  src/test/regress/expected/select_distinct.out
  src/test/regress/sql/select_distinct.sql

src/backend/optimizer/README  (+1)

@@ -1015,6 +1015,7 @@ UPPERREL_SETOP	result of UNION/INTERSECT/EXCEPT, if any
 UPPERREL_PARTIAL_GROUP_AGG	result of partial grouping/aggregation, if any
 UPPERREL_GROUP_AGG	result of grouping/aggregation, if any
 UPPERREL_WINDOW	result of window functions, if any
+UPPERREL_PARTIAL_DISTINCT	result of partial "SELECT DISTINCT", if any
 UPPERREL_DISTINCT	result of "SELECT DISTINCT", if any
 UPPERREL_ORDERED	result of ORDER BY, if any
 UPPERREL_FINAL	result of any remaining top-level actions

src/backend/optimizer/plan/planner.c  (+186 -33)
@@ -189,6 +189,12 @@ static void create_one_window_path(PlannerInfo *root,
                                    List *activeWindows);
 static RelOptInfo *create_distinct_paths(PlannerInfo *root,
                                          RelOptInfo *input_rel);
+static void create_partial_distinct_paths(PlannerInfo *root,
+                                          RelOptInfo *input_rel,
+                                          RelOptInfo *final_distinct_rel);
+static RelOptInfo *create_final_distinct_paths(PlannerInfo *root,
+                                               RelOptInfo *input_rel,
+                                               RelOptInfo *distinct_rel);
 static RelOptInfo *create_ordered_paths(PlannerInfo *root,
                                         RelOptInfo *input_rel,
                                         PathTarget *target,
@@ -1570,6 +1576,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
         */
        root->upper_targets[UPPERREL_FINAL] = final_target;
        root->upper_targets[UPPERREL_ORDERED] = final_target;
+       root->upper_targets[UPPERREL_PARTIAL_DISTINCT] = sort_input_target;
        root->upper_targets[UPPERREL_DISTINCT] = sort_input_target;
        root->upper_targets[UPPERREL_WINDOW] = sort_input_target;
        root->upper_targets[UPPERREL_GROUP_AGG] = grouping_target;
@@ -4227,16 +4234,9 @@ create_one_window_path(PlannerInfo *root,
  * Sort/Unique won't project anything.
  */
 static RelOptInfo *
-create_distinct_paths(PlannerInfo *root,
-                      RelOptInfo *input_rel)
+create_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel)
 {
-    Query      *parse = root->parse;
-    Path       *cheapest_input_path = input_rel->cheapest_total_path;
     RelOptInfo *distinct_rel;
-    double      numDistinctRows;
-    bool        allow_hash;
-    Path       *path;
-    ListCell   *lc;
 
     /* For now, do all work in the (DISTINCT, NULL) upperrel */
     distinct_rel = fetch_upper_rel(root, UPPERREL_DISTINCT, NULL);
@@ -4258,6 +4258,184 @@ create_distinct_paths(PlannerInfo *root,
     distinct_rel->useridiscurrent = input_rel->useridiscurrent;
     distinct_rel->fdwroutine = input_rel->fdwroutine;
 
+    /* build distinct paths based on input_rel's pathlist */
+    create_final_distinct_paths(root, input_rel, distinct_rel);
+
+    /* now build distinct paths based on input_rel's partial_pathlist */
+    create_partial_distinct_paths(root, input_rel, distinct_rel);
+
+    /* Give a helpful error if we failed to create any paths */
+    if (distinct_rel->pathlist == NIL)
+        ereport(ERROR,
+                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+                 errmsg("could not implement DISTINCT"),
+                 errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
+
+    /*
+     * If there is an FDW that's responsible for all baserels of the query,
+     * let it consider adding ForeignPaths.
+     */
+    if (distinct_rel->fdwroutine &&
+        distinct_rel->fdwroutine->GetForeignUpperPaths)
+        distinct_rel->fdwroutine->GetForeignUpperPaths(root,
+                                                       UPPERREL_DISTINCT,
+                                                       input_rel,
+                                                       distinct_rel,
+                                                       NULL);
+
+    /* Let extensions possibly add some more paths */
+    if (create_upper_paths_hook)
+        (*create_upper_paths_hook) (root, UPPERREL_DISTINCT, input_rel,
+                                    distinct_rel, NULL);
+
+    /* Now choose the best path(s) */
+    set_cheapest(distinct_rel);
+
+    return distinct_rel;
+}
+
+/*
+ * create_partial_distinct_paths
+ *
+ * Process 'input_rel' partial paths and add unique/aggregate paths to the
+ * UPPERREL_PARTIAL_DISTINCT rel.  For paths created, add Gather/GatherMerge
+ * paths on top and add a final unique/aggregate path to remove any duplicate
+ * produced from combining rows from parallel workers.
+ */
+static void
+create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
+                              RelOptInfo *final_distinct_rel)
+{
+    RelOptInfo *partial_distinct_rel;
+    Query      *parse;
+    List       *distinctExprs;
+    double      numDistinctRows;
+    Path       *cheapest_partial_path;
+    ListCell   *lc;
+
+    /* nothing to do when there are no partial paths in the input rel */
+    if (!input_rel->consider_parallel || input_rel->partial_pathlist == NIL)
+        return;
+
+    parse = root->parse;
+
+    /* can't do parallel DISTINCT ON */
+    if (parse->hasDistinctOn)
+        return;
+
+    partial_distinct_rel = fetch_upper_rel(root, UPPERREL_PARTIAL_DISTINCT,
+                                           NULL);
+    partial_distinct_rel->reltarget = root->upper_targets[UPPERREL_PARTIAL_DISTINCT];
+    partial_distinct_rel->consider_parallel = input_rel->consider_parallel;
+
+    /*
+     * If input_rel belongs to a single FDW, so does the partial_distinct_rel.
+     */
+    partial_distinct_rel->serverid = input_rel->serverid;
+    partial_distinct_rel->userid = input_rel->userid;
+    partial_distinct_rel->useridiscurrent = input_rel->useridiscurrent;
+    partial_distinct_rel->fdwroutine = input_rel->fdwroutine;
+
+    cheapest_partial_path = linitial(input_rel->partial_pathlist);
+
+    distinctExprs = get_sortgrouplist_exprs(parse->distinctClause,
+                                            parse->targetList);
+
+    /* estimate how many distinct rows we'll get from each worker */
+    numDistinctRows = estimate_num_groups(root, distinctExprs,
+                                          cheapest_partial_path->rows,
+                                          NULL, NULL);
+
+    /* first try adding unique paths atop of sorted paths */
+    if (grouping_is_sortable(parse->distinctClause))
+    {
+        foreach(lc, input_rel->partial_pathlist)
+        {
+            Path       *path = (Path *) lfirst(lc);
+
+            if (pathkeys_contained_in(root->distinct_pathkeys, path->pathkeys))
+            {
+                add_partial_path(partial_distinct_rel, (Path *)
+                                 create_upper_unique_path(root,
+                                                          partial_distinct_rel,
+                                                          path,
+                                                          list_length(root->distinct_pathkeys),
+                                                          numDistinctRows));
+            }
+        }
+    }
+
+    /*
+     * Now try hash aggregate paths, if enabled and hashing is possible. Since
+     * we're not on the hook to ensure we do our best to create at least one
+     * path here, we treat enable_hashagg as a hard off-switch rather than the
+     * slightly softer variant in create_final_distinct_paths.
+     */
+    if (enable_hashagg && grouping_is_hashable(parse->distinctClause))
+    {
+        add_partial_path(partial_distinct_rel, (Path *)
+                         create_agg_path(root,
+                                         partial_distinct_rel,
+                                         cheapest_partial_path,
+                                         cheapest_partial_path->pathtarget,
+                                         AGG_HASHED,
+                                         AGGSPLIT_SIMPLE,
+                                         parse->distinctClause,
+                                         NIL,
+                                         NULL,
+                                         numDistinctRows));
+    }
+
+    /*
+     * If there is an FDW that's responsible for all baserels of the query,
+     * let it consider adding ForeignPaths.
+     */
+    if (partial_distinct_rel->fdwroutine &&
+        partial_distinct_rel->fdwroutine->GetForeignUpperPaths)
+        partial_distinct_rel->fdwroutine->GetForeignUpperPaths(root,
+                                                               UPPERREL_PARTIAL_DISTINCT,
+                                                               input_rel,
+                                                               partial_distinct_rel,
+                                                               NULL);
+
+    /* Let extensions possibly add some more partial paths */
+    if (create_upper_paths_hook)
+        (*create_upper_paths_hook) (root, UPPERREL_PARTIAL_DISTINCT,
+                                    input_rel, partial_distinct_rel, NULL);
+
+    if (partial_distinct_rel->partial_pathlist != NIL)
+    {
+        generate_gather_paths(root, partial_distinct_rel, true);
+        set_cheapest(partial_distinct_rel);
+
+        /*
+         * Finally, create paths to distinctify the final result.  This step
+         * is needed to remove any duplicates due to combining rows from
+         * parallel workers.
+         */
+        create_final_distinct_paths(root, partial_distinct_rel,
+                                    final_distinct_rel);
+    }
+}
+
+/*
+ * create_final_distinct_paths
+ *      Create distinct paths in 'distinct_rel' based on 'input_rel' pathlist
+ *
+ * input_rel: contains the source-data paths
+ * distinct_rel: destination relation for storing created paths
+ */
+static RelOptInfo *
+create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
+                            RelOptInfo *distinct_rel)
+{
+    Query      *parse = root->parse;
+    Path       *cheapest_input_path = input_rel->cheapest_total_path;
+    double      numDistinctRows;
+    bool        allow_hash;
+    Path       *path;
+    ListCell   *lc;
+
     /* Estimate number of distinct rows there will be */
     if (parse->groupClause || parse->groupingSets || parse->hasAggs ||
         root->hasHavingQual)
@@ -4384,31 +4562,6 @@ create_distinct_paths(PlannerInfo *root,
                                         numDistinctRows));
     }
 
-    /* Give a helpful error if we failed to find any implementation */
-    if (distinct_rel->pathlist == NIL)
-        ereport(ERROR,
-                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-                 errmsg("could not implement DISTINCT"),
-                 errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
-
-    /*
-     * If there is an FDW that's responsible for all baserels of the query,
-     * let it consider adding ForeignPaths.
-     */
-    if (distinct_rel->fdwroutine &&
-        distinct_rel->fdwroutine->GetForeignUpperPaths)
-        distinct_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_DISTINCT,
-                                                       input_rel, distinct_rel,
-                                                       NULL);
-
-    /* Let extensions possibly add some more paths */
-    if (create_upper_paths_hook)
-        (*create_upper_paths_hook) (root, UPPERREL_DISTINCT,
-                                    input_rel, distinct_rel, NULL);
-
-    /* Now choose the best path(s) */
-    set_cheapest(distinct_rel);
-
     return distinct_rel;
 }
 
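Two details of create_partial_distinct_paths above are worth noting. It returns early when parse->hasDistinctOn is set, so DISTINCT ON never gets a partial phase; and it treats enable_hashagg as a hard off-switch, so with hashing disabled a partial distinct path is only added when some partial path is already sorted on the DISTINCT pathkeys (the function never adds an explicit Sort of its own). A quick, hypothetical way to observe the DISTINCT ON restriction (not part of this commit's regression tests):

-- hypothetical check: DISTINCT ON skips the partial-distinct phase, so no
-- de-duplication step should appear below a Gather in the resulting plan
EXPLAIN (COSTS OFF)
SELECT DISTINCT ON (four) four, ten FROM tenk1 ORDER BY four, ten;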

src/include/nodes/pathnodes.h  (+1)

@@ -71,6 +71,7 @@ typedef enum UpperRelationKind
                                  * any */
     UPPERREL_GROUP_AGG,         /* result of grouping/aggregation, if any */
     UPPERREL_WINDOW,            /* result of window functions, if any */
+    UPPERREL_PARTIAL_DISTINCT,  /* result of partial "SELECT DISTINCT", if any */
     UPPERREL_DISTINCT,          /* result of "SELECT DISTINCT", if any */
     UPPERREL_ORDERED,           /* result of ORDER BY, if any */
     UPPERREL_FINAL              /* result of any remaining top-level actions */

src/test/regress/expected/select_distinct.out  (+67)

@@ -210,6 +210,73 @@ DROP TABLE distinct_hash_1;
 DROP TABLE distinct_hash_2;
 DROP TABLE distinct_group_1;
 DROP TABLE distinct_group_2;
+-- Test parallel DISTINCT
+SET parallel_tuple_cost=0;
+SET parallel_setup_cost=0;
+SET min_parallel_table_scan_size=0;
+-- Ensure we get a parallel plan
+EXPLAIN (costs off)
+SELECT DISTINCT four FROM tenk1;
+                     QUERY PLAN
+----------------------------------------------------
+ Unique
+   ->  Sort
+         Sort Key: four
+         ->  Gather
+               Workers Planned: 2
+               ->  HashAggregate
+                     Group Key: four
+                     ->  Parallel Seq Scan on tenk1
+(8 rows)
+
+-- Ensure the parallel plan produces the correct results
+SELECT DISTINCT four FROM tenk1;
+ four
+------
+    0
+    1
+    2
+    3
+(4 rows)
+
+CREATE OR REPLACE FUNCTION distinct_func(a INT) RETURNS INT AS $$
+BEGIN
+  RETURN a;
+END;
+$$ LANGUAGE plpgsql PARALLEL UNSAFE;
+-- Ensure we don't do parallel distinct with a parallel unsafe function
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT distinct_func(1) FROM tenk1;
+                        QUERY PLAN
+----------------------------------------------------------
+ Unique
+   ->  Sort
+         Sort Key: (distinct_func(1))
+         ->  Index Only Scan using tenk1_hundred on tenk1
+(4 rows)
+
+-- make the function parallel safe
+CREATE OR REPLACE FUNCTION distinct_func(a INT) RETURNS INT AS $$
+BEGIN
+  RETURN a;
+END;
+$$ LANGUAGE plpgsql PARALLEL SAFE;
+-- Ensure we do parallel distinct now that the function is parallel safe
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT distinct_func(1) FROM tenk1;
+                   QUERY PLAN
+----------------------------------------------
+ Unique
+   ->  Sort
+         Sort Key: (distinct_func(1))
+         ->  Gather
+               Workers Planned: 2
+               ->  Parallel Seq Scan on tenk1
+(6 rows)
+
+RESET min_parallel_table_scan_size;
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
 --
 -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
 -- very own regression file.
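One extra sanity check that is easy to run by hand (hypothetical, not part of the committed tests) is to disable parallelism and confirm the serial plan returns the same result set as the parallel plan above:

-- hypothetical follow-up: force a serial plan and compare with the output above
SET max_parallel_workers_per_gather = 0;
SELECT DISTINCT four FROM tenk1;   -- should return the same 4 rows (0..3)
RESET max_parallel_workers_per_gather;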

src/test/regress/sql/select_distinct.sql  (+37)

@@ -107,6 +107,43 @@ DROP TABLE distinct_hash_2;
 DROP TABLE distinct_group_1;
 DROP TABLE distinct_group_2;
 
+-- Test parallel DISTINCT
+SET parallel_tuple_cost=0;
+SET parallel_setup_cost=0;
+SET min_parallel_table_scan_size=0;
+
+-- Ensure we get a parallel plan
+EXPLAIN (costs off)
+SELECT DISTINCT four FROM tenk1;
+
+-- Ensure the parallel plan produces the correct results
+SELECT DISTINCT four FROM tenk1;
+
+CREATE OR REPLACE FUNCTION distinct_func(a INT) RETURNS INT AS $$
+BEGIN
+  RETURN a;
+END;
+$$ LANGUAGE plpgsql PARALLEL UNSAFE;
+
+-- Ensure we don't do parallel distinct with a parallel unsafe function
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT distinct_func(1) FROM tenk1;
+
+-- make the function parallel safe
+CREATE OR REPLACE FUNCTION distinct_func(a INT) RETURNS INT AS $$
+BEGIN
+  RETURN a;
+END;
+$$ LANGUAGE plpgsql PARALLEL SAFE;
+
+-- Ensure we do parallel distinct now that the function is parallel safe
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT distinct_func(1) FROM tenk1;
+
+RESET min_parallel_table_scan_size;
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+
 --
 -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
 -- very own regression file.
