diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index d4da0e8dea9..f614a23d573 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -4394,7 +4394,8 @@ estimate_hash_bucket_stats(PlannerInfo *root, Node *hashkey, double nbuckets, { VariableStatData vardata; double estfract, - ndistinct; + ndistinct, + stanullfrac; bool isdefault; AttStatsSlot sslot; @@ -4444,6 +4445,17 @@ estimate_hash_bucket_stats(PlannerInfo *root, Node *hashkey, double nbuckets, return; } + /* Get fraction that are null */ + if (HeapTupleIsValid(vardata.statsTuple)) + { + Form_pg_statistic stats; + + stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple); + stanullfrac = stats->stanullfrac; + } + else + stanullfrac = 0.0; + /* * Adjust ndistinct to account for restriction clauses. Observe we are * assuming that the data distribution is affected uniformly by the @@ -4468,6 +4480,13 @@ estimate_hash_bucket_stats(PlannerInfo *root, Node *hashkey, double nbuckets, else estfract = 1.0 / ndistinct; + /* + * Adjust for null fraction. NULL keys are not inserted into the hash + * table, but inner_path_rows in final_cost_hashjoin includes them, so we + * must discount estfract to compensate. + */ + estfract *= (1.0 - stanullfrac); + /* * Clamp the bucketsize fraction to be not less than the MCV frequency, * since whichever bucket the MCV values end up in will have at least that