diff --git a/src/backend/optimizer/README b/src/backend/optimizer/README index 2339347c24..41c120e0cd 100644 --- a/src/backend/optimizer/README +++ b/src/backend/optimizer/README @@ -1015,6 +1015,7 @@ UPPERREL_SETOP result of UNION/INTERSECT/EXCEPT, if any UPPERREL_PARTIAL_GROUP_AGG result of partial grouping/aggregation, if any UPPERREL_GROUP_AGG result of grouping/aggregation, if any UPPERREL_WINDOW result of window functions, if any +UPPERREL_PARTIAL_DISTINCT result of partial "SELECT DISTINCT", if any UPPERREL_DISTINCT result of "SELECT DISTINCT", if any UPPERREL_ORDERED result of ORDER BY, if any UPPERREL_FINAL result of any remaining top-level actions diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 2cd691191c..d89d215133 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -187,8 +187,11 @@ static void create_one_window_path(PlannerInfo *root, PathTarget *output_target, WindowFuncLists *wflists, List *activeWindows); +static void create_partial_distinct_paths(PlannerInfo *root, + RelOptInfo *input_rel); static RelOptInfo *create_distinct_paths(PlannerInfo *root, - RelOptInfo *input_rel); + RelOptInfo *input_rel, + bool parallel_paths); static RelOptInfo *create_ordered_paths(PlannerInfo *root, RelOptInfo *input_rel, PathTarget *target, @@ -1570,6 +1573,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) */ root->upper_targets[UPPERREL_FINAL] = final_target; root->upper_targets[UPPERREL_ORDERED] = final_target; + root->upper_targets[UPPERREL_PARTIAL_DISTINCT] = sort_input_target; root->upper_targets[UPPERREL_DISTINCT] = sort_input_target; root->upper_targets[UPPERREL_WINDOW] = sort_input_target; root->upper_targets[UPPERREL_GROUP_AGG] = grouping_target; @@ -1619,8 +1623,15 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) */ if (parse->distinctClause) { - current_rel = create_distinct_paths(root, - current_rel); + RelOptInfo *distinct_rel; + + /* handle normal paths */ + distinct_rel = create_distinct_paths(root, current_rel, false); + + /* handle partial paths */ + create_partial_distinct_paths(root, current_rel); + + current_rel = distinct_rel; } } /* end of if (setOperations) */ @@ -4216,6 +4227,102 @@ create_one_window_path(PlannerInfo *root, add_path(window_rel, path); } +/* + * create_partial_distinct_paths + * + * Process 'input_rel' partial paths and add unique / aggregate paths to the + * UPPERREL_PARTIAL_DISTINCT rel. For any paths created, add Gather / + * GatherMerge paths on top and add a final unique / aggregate path to remove + * any duplicates produced from combining the results from parallel workers. + */ +static void +create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel) +{ + RelOptInfo *partial_distinct_rel; + Query *parse; + List *distinctExprs; + double numDistinctRows; + Path *cheapest_partial_path; + ListCell *lc; + + /* nothing to do when there are no partial paths in the input rel */ + if (!input_rel->consider_parallel || input_rel->partial_pathlist == NIL) + return; + + parse = root->parse; + + /* can't do parallel DISTINCT ON */ + if (parse->hasDistinctOn) + return; + + partial_distinct_rel = fetch_upper_rel(root, UPPERREL_PARTIAL_DISTINCT, NULL); + partial_distinct_rel->reltarget = root->upper_targets[UPPERREL_PARTIAL_DISTINCT]; + partial_distinct_rel->consider_parallel = input_rel->consider_parallel; + + cheapest_partial_path = linitial(input_rel->partial_pathlist); + + distinctExprs = get_sortgrouplist_exprs(parse->distinctClause, + parse->targetList); + + /* estimate how many distinct rows we'll get from each worker */ + numDistinctRows = estimate_num_groups(root, distinctExprs, + cheapest_partial_path->rows, + NULL, NULL); + + /* first try adding unique paths atop of sorted paths */ + if (grouping_is_sortable(parse->distinctClause)) + { + foreach(lc, input_rel->partial_pathlist) + { + Path *path = (Path *)lfirst(lc); + + if (pathkeys_contained_in(root->distinct_pathkeys, path->pathkeys)) + { + add_partial_path(partial_distinct_rel, (Path *) + create_upper_unique_path(root, + partial_distinct_rel, + path, + list_length(root->distinct_pathkeys), + numDistinctRows)); + } + } + } + + /* now try hash aggregate paths, if enabled and hashing is possible */ + if (enable_hashagg && grouping_is_hashable(parse->distinctClause)) + { + add_partial_path(partial_distinct_rel, (Path *) + create_agg_path(root, + partial_distinct_rel, + cheapest_partial_path, + cheapest_partial_path->pathtarget, + AGG_HASHED, + AGGSPLIT_SIMPLE, + parse->distinctClause, + NIL, + NULL, + numDistinctRows)); + } + + /* Let extensions possibly add some more paths */ + if (create_upper_paths_hook) + (*create_upper_paths_hook) (root, UPPERREL_PARTIAL_DISTINCT, + input_rel, partial_distinct_rel, NULL); + + if (partial_distinct_rel->partial_pathlist != NIL) + { + generate_gather_paths(root, partial_distinct_rel, true); + set_cheapest(partial_distinct_rel); + + /* + * Finally, create Paths to distinctify the final result. This + * just requires removing any results that are duplicated due to + * combining results from parallel workers. + */ + create_distinct_paths(root, partial_distinct_rel, true); + } +} + /* * create_distinct_paths * @@ -4223,12 +4330,15 @@ create_one_window_path(PlannerInfo *root, * * input_rel: contains the source-data Paths * + * parallel_paths: true if we're processing a set of Gather/GatherMerge paths, + * false if we're processing normal paths. + * * Note: input paths should already compute the desired pathtarget, since * Sort/Unique won't project anything. */ static RelOptInfo * -create_distinct_paths(PlannerInfo *root, - RelOptInfo *input_rel) +create_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel, + bool parallel_paths) { Query *parse = root->parse; Path *cheapest_input_path = input_rel->cheapest_total_path; @@ -4392,19 +4502,27 @@ create_distinct_paths(PlannerInfo *root, errdetail("Some of the datatypes only support hashing, while others only support sorting."))); /* - * If there is an FDW that's responsible for all baserels of the query, - * let it consider adding ForeignPaths. + * Skip calling the FDW method and the hook when doing parallel paths. No + * need to call it again as we should already have done so when working on + * the serial paths. */ - if (distinct_rel->fdwroutine && - distinct_rel->fdwroutine->GetForeignUpperPaths) - distinct_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_DISTINCT, - input_rel, distinct_rel, - NULL); - - /* Let extensions possibly add some more paths */ - if (create_upper_paths_hook) - (*create_upper_paths_hook) (root, UPPERREL_DISTINCT, - input_rel, distinct_rel, NULL); + if (!parallel_paths) + { + /* + * If there is an FDW that's responsible for all baserels of the + * query, let it consider adding ForeignPaths. + */ + if (distinct_rel->fdwroutine && + distinct_rel->fdwroutine->GetForeignUpperPaths) + distinct_rel->fdwroutine->GetForeignUpperPaths(root, UPPERREL_DISTINCT, + input_rel, distinct_rel, + NULL); + + /* Let extensions possibly add some more paths */ + if (create_upper_paths_hook) + (*create_upper_paths_hook) (root, UPPERREL_DISTINCT, + input_rel, distinct_rel, NULL); + } /* Now choose the best path(s) */ set_cheapest(distinct_rel); diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 6e068f2c8b..60afc2c4ae 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -71,6 +71,8 @@ typedef enum UpperRelationKind * any */ UPPERREL_GROUP_AGG, /* result of grouping/aggregation, if any */ UPPERREL_WINDOW, /* result of window functions, if any */ + UPPERREL_PARTIAL_DISTINCT, /* result of partial "SELECT DISTINCT", if + * any */ UPPERREL_DISTINCT, /* result of "SELECT DISTINCT", if any */ UPPERREL_ORDERED, /* result of ORDER BY, if any */ UPPERREL_FINAL /* result of any remaining top-level actions */ diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out index 11c6f50fbf..8da8999954 100644 --- a/src/test/regress/expected/select_distinct.out +++ b/src/test/regress/expected/select_distinct.out @@ -210,6 +210,54 @@ DROP TABLE distinct_hash_1; DROP TABLE distinct_hash_2; DROP TABLE distinct_group_1; DROP TABLE distinct_group_2; +-- Test parallel DISTINCT +SET parallel_tuple_cost=0; +SET parallel_setup_cost=0; +SET min_parallel_table_scan_size=0; +-- Ensure we get a parallel plan +EXPLAIN (costs off) +SELECT DISTINCT twenty FROM tenk1; + QUERY PLAN +---------------------------------------------------- + Unique + -> Sort + Sort Key: twenty + -> Gather + Workers Planned: 2 + -> HashAggregate + Group Key: twenty + -> Parallel Seq Scan on tenk1 +(8 rows) + +-- Ensure the parallel plan produces the correct results +SELECT DISTINCT twenty FROM tenk1; + twenty +-------- + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 + 11 + 12 + 13 + 14 + 15 + 16 + 17 + 18 + 19 +(20 rows) + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; -- -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its -- very own regression file. diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql index 33102744eb..a589e08941 100644 --- a/src/test/regress/sql/select_distinct.sql +++ b/src/test/regress/sql/select_distinct.sql @@ -107,6 +107,22 @@ DROP TABLE distinct_hash_2; DROP TABLE distinct_group_1; DROP TABLE distinct_group_2; +-- Test parallel DISTINCT +SET parallel_tuple_cost=0; +SET parallel_setup_cost=0; +SET min_parallel_table_scan_size=0; + +-- Ensure we get a parallel plan +EXPLAIN (costs off) +SELECT DISTINCT twenty FROM tenk1; + +-- Ensure the parallel plan produces the correct results +SELECT DISTINCT twenty FROM tenk1; + +RESET parallel_setup_cost; +RESET parallel_tuple_cost; +RESET min_parallel_table_scan_size; + -- -- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its -- very own regression file.