From 682a1fa6f591fd687cb80c78a0968631e4e0987b Mon Sep 17 00:00:00 2001 From: "Paul A. Jungwirth" Date: Sat, 25 Nov 2023 09:12:52 -0800 Subject: [PATCH v1] Use statitics for estimating UNNEST(column) rows Extends estimate_array_length to consult DECHIST statistics when called with a Var. This improves planning with array_unnest_support (added in a391ff3c3d41) and should give more accurate results for other callers too. DECHIST has the average *distinct* element count which isn't exactly what we want, but it should still be an improvement over a fixed 10. --- src/backend/optimizer/path/costsize.c | 8 ++--- src/backend/utils/adt/array_typanalyze.c | 1 + src/backend/utils/adt/arrayfuncs.c | 2 +- src/backend/utils/adt/selfuncs.c | 39 +++++++++++++++++++----- src/include/utils/selfuncs.h | 2 +- src/test/regress/expected/arrays.out | 10 ++++++ src/test/regress/sql/arrays.sql | 4 +++ 7 files changed, 53 insertions(+), 13 deletions(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index d6ceafd51c5..8bcdcc7fd84 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -1254,7 +1254,7 @@ cost_tidscan(Path *path, PlannerInfo *root, ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) qual; Node *arraynode = (Node *) lsecond(saop->args); - ntuples += estimate_array_length(arraynode); + ntuples += estimate_array_length(root, arraynode); } else if (IsA(qual, CurrentOfExpr)) { @@ -4741,7 +4741,7 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context) Node *arraynode = (Node *) lsecond(saop->args); QualCost sacosts; QualCost hcosts; - int estarraylen = estimate_array_length(arraynode); + int estarraylen = estimate_array_length(context->root, arraynode); set_sa_opfuncid(saop); sacosts.startup = sacosts.per_tuple = 0; @@ -4779,7 +4779,7 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context) */ context->total.startup += sacosts.startup; context->total.per_tuple += sacosts.per_tuple * - estimate_array_length(arraynode) * 0.5; + estimate_array_length(context->root, arraynode) * 0.5; } } else if (IsA(node, Aggref) || @@ -4830,7 +4830,7 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context) context->total.startup += perelemcost.startup; if (perelemcost.per_tuple > 0) context->total.per_tuple += perelemcost.per_tuple * - estimate_array_length((Node *) acoerce->arg); + estimate_array_length(context->root, (Node *) acoerce->arg); } else if (IsA(node, RowCompareExpr)) { diff --git a/src/backend/utils/adt/array_typanalyze.c b/src/backend/utils/adt/array_typanalyze.c index 04b3764b686..a60d5fb5d24 100644 --- a/src/backend/utils/adt/array_typanalyze.c +++ b/src/backend/utils/adt/array_typanalyze.c @@ -604,6 +604,7 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc, /* * Prepare to fill stanumbers with the histogram, followed by the * average count. This array must be stored in anl_context. + * We use the average count in estimate_array_length. */ hist = (float4 *) MemoryContextAlloc(stats->anl_context, diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c index 631012a0f28..f902deff025 100644 --- a/src/backend/utils/adt/arrayfuncs.c +++ b/src/backend/utils/adt/arrayfuncs.c @@ -6337,7 +6337,7 @@ array_unnest_support(PG_FUNCTION_ARGS) /* We can use estimated argument values here */ arg1 = estimate_expression_value(req->root, linitial(args)); - req->rows = estimate_array_length(arg1); + req->rows = estimate_array_length(req->root, arg1); ret = (Node *) req; } } diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 35c9e3c86fe..807c4e5ba80 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -2131,7 +2131,7 @@ scalararraysel(PlannerInfo *root, * It's important that this agree with scalararraysel. */ int -estimate_array_length(Node *arrayexpr) +estimate_array_length(PlannerInfo *root, Node *arrayexpr) { /* look through any binary-compatible relabeling of arrayexpr */ arrayexpr = strip_array_coercion(arrayexpr); @@ -2152,11 +2152,36 @@ estimate_array_length(Node *arrayexpr) { return list_length(((ArrayExpr *) arrayexpr)->elements); } - else + else if (arrayexpr && IsA(arrayexpr, Var)) { - /* default guess --- see also scalararraysel */ - return 10; + /* + * It's a column, so use its average elem count. + * This is stored in the last stanumbers element of the DECHIST statistics. + * Actually that is the count of *distinct* elements, + * but this is still better than 10. + */ + VariableStatData vardata; + AttStatsSlot sslot; + int nelem = -1; + + examine_variable(root, arrayexpr, 0, &vardata); + + if (HeapTupleIsValid(vardata.statsTuple)) + { + if (get_attstatsslot(&sslot, vardata.statsTuple, STATISTIC_KIND_DECHIST, InvalidOid, ATTSTATSSLOT_NUMBERS)) + { + nelem = sslot.numbers[sslot.nnumbers - 1]; + free_attstatsslot(&sslot); + } + } + ReleaseVariableStats(vardata); + + if (nelem >= 0) + return nelem; } + + /* default guess --- see also scalararraysel */ + return 10; } /* @@ -6538,7 +6563,7 @@ genericcostestimate(PlannerInfo *root, if (IsA(rinfo->clause, ScalarArrayOpExpr)) { ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) rinfo->clause; - int alength = estimate_array_length(lsecond(saop->args)); + int alength = estimate_array_length(root, lsecond(saop->args)); if (alength > 1) num_sa_scans *= alength; @@ -6818,7 +6843,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, { ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause; Node *other_operand = (Node *) lsecond(saop->args); - int alength = estimate_array_length(other_operand); + int alength = estimate_array_length(root, other_operand); clause_op = saop->opno; found_saop = true; @@ -7412,7 +7437,7 @@ gincost_scalararrayopexpr(PlannerInfo *root, { counts->exactEntries++; counts->searchEntries++; - counts->arrayScans *= estimate_array_length(rightop); + counts->arrayScans *= estimate_array_length(root, rightop); return true; } diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h index 2f76c473db4..bf9678157f7 100644 --- a/src/include/utils/selfuncs.h +++ b/src/include/utils/selfuncs.h @@ -200,7 +200,7 @@ extern Selectivity scalararraysel(PlannerInfo *root, ScalarArrayOpExpr *clause, bool is_join_clause, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo); -extern int estimate_array_length(Node *arrayexpr); +extern int estimate_array_length(PlannerInfo *root, Node *arrayexpr); extern Selectivity rowcomparesel(PlannerInfo *root, RowCompareExpr *clause, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo); diff --git a/src/test/regress/expected/arrays.out b/src/test/regress/expected/arrays.out index 23404982f71..96a0c4a1e34 100644 --- a/src/test/regress/expected/arrays.out +++ b/src/test/regress/expected/arrays.out @@ -2699,3 +2699,13 @@ SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail ERROR: sample size must be between 0 and 6 SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail ERROR: sample size must be between 0 and 6 +-- test rowcount estimate of unnest(column) +analyze arrtest; +explain select elem from arrtest, unnest(a) x(elem); + QUERY PLAN +------------------------------------------------------------------- + Nested Loop (cost=0.00..1.15 rows=6 width=2) + -> Seq Scan on arrtest (cost=0.00..1.03 rows=3 width=29) + -> Function Scan on unnest x (cost=0.00..0.02 rows=2 width=2) +(3 rows) + diff --git a/src/test/regress/sql/arrays.sql b/src/test/regress/sql/arrays.sql index 50aa539fdc1..21507cf8129 100644 --- a/src/test/regress/sql/arrays.sql +++ b/src/test/regress/sql/arrays.sql @@ -825,3 +825,7 @@ SELECT array_dims(array_sample('[-1:2][2:3]={{1,2},{3,NULL},{5,6},{7,8}}'::int[] SELECT array_dims(array_sample('{{{1,2},{3,NULL}},{{5,6},{7,8}},{{9,10},{11,12}}}'::int[], 2)); SELECT array_sample('{1,2,3,4,5,6}'::int[], -1); -- fail SELECT array_sample('{1,2,3,4,5,6}'::int[], 7); --fail + +-- test rowcount estimate of unnest(column) +analyze arrtest; +explain select elem from arrtest, unnest(a) x(elem); -- 2.32.0 (Apple Git-132)