From acecea4421a69d70ba07786682237872a3deb50e Mon Sep 17 00:00:00 2001 From: Alexandra Wang Date: Mon, 26 Jan 2026 14:00:59 -0800 Subject: [PATCH v1 2/2] Automatically create join MCV statistics from FK constraints This commit adds automatic detection and creation of join MCV statistics during ANALYZE when the table has foreign key constraints and the referenced table has functional dependencies. Example: -- Given these tables: CREATE TABLE keywords (id INT PRIMARY KEY, keyword TEXT); CREATE TABLE movie_keywords ( movie_id INT, keyword_id INT REFERENCES keywords(id) ); -- And functional dependencies on referenced table: CREATE STATISTICS kw_deps (dependencies) ON id, keyword FROM keywords; -- ANALYZE automatically detects the FK and creates join MCV stats for -- movie_keywords.keyword_id and keywords.keyword. ANALYZE movie_keywords; Current limitation: currently, the functional dependency values are just assumed to be 1, which could lead to less accurate estimation. --- src/backend/statistics/extended_stats.c | 494 ++++++++++++++++++ .../statistics/extended_stats_internal.h | 11 + .../regress/expected/stats_ext_crossrel.out | 249 +++++++++ src/test/regress/sql/stats_ext_crossrel.sql | 140 +++++ 4 files changed, 894 insertions(+) diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index 8939f56cc67..5b8b4cf6c42 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -20,7 +20,11 @@ #include "access/genam.h" #include "access/htup_details.h" #include "access/table.h" +#include "access/tableam.h" +#include "catalog/dependency.h" #include "catalog/indexing.h" +#include "catalog/pg_constraint.h" +#include "catalog/pg_namespace.h" #include "catalog/pg_statistic_ext.h" #include "catalog/pg_statistic_ext_data.h" #include "commands/defrem.h" @@ -129,6 +133,22 @@ bms_to_int2vector(Bitmapset *bms) return buildint2vector(attnums, nmembers); } +/* Join MCV functions */ +static bool statext_join_mcv_exists(List *statslist, + Oid primary_rel, AttrNumber primary_joinkey_attr, + Oid other_rel, AttrNumber other_joinkey_attr, + AttrNumber filter_attr); +static void build_implicit_join_mcv_stats_from_fk(Relation onerel, List *statslist, + Relation pg_stext, bool inh, + int numrows, HeapTuple *rows, + int natts, VacAttrStats **vacattrstats); +static List *detect_join_stats_candidates(Relation rel); +static Oid create_implicit_join_stat(Relation pg_stext, + Oid referencing_rel, + AttrNumber referencing_attr, + Oid referenced_rel, + AttrNumber referenced_attr, + AttrNumber filter_attr); /* * Compute requested extended stats, using the rows sampled for the plain @@ -320,6 +340,14 @@ BuildRelationExtStatistics(Relation onerel, bool inh, double totalrows, MemoryContextReset(cxt); } + /* + * Now that we've built stats for explicit and previously existing + * implicit stats objects, detect and build new implicit FK-based join + * stats if available. + */ + build_implicit_join_mcv_stats_from_fk(onerel, statslist, pg_stext, inh, + numrows, rows, natts, vacattrstats); + MemoryContextSwitchTo(oldcxt); MemoryContextDelete(cxt); @@ -2807,3 +2835,469 @@ make_build_data(Relation rel, StatExtEntry *stat, int numrows, HeapTuple *rows, return result; } + +/* + * statext_join_mcv_exists + * Check if a join mcv statistic already exists in the statslist. + * + * This prevents duplicate creation during FK detection when the stat + * was already manually created or exists from a previous ANALYZE. + */ +static bool +statext_join_mcv_exists(List *statslist, + Oid primary_rel, AttrNumber primary_joinkey_attr, + Oid other_rel, AttrNumber other_joinkey_attr, + AttrNumber filter_attr) +{ + ListCell *lc; + + foreach(lc, statslist) + { + StatExtEntry *stat = (StatExtEntry *) lfirst(lc); + + if (!list_member_int(stat->types, STATS_EXT_JOIN_MCV)) + continue; + + if (stat->otherrel != other_rel) + continue; + + /* Check if join keys match: [target_joinkey, other_joinkey] */ + if (!stat->joinkeys || stat->joinkeys->dim1 != 2) + continue; + if (stat->joinkeys->values[0] != primary_joinkey_attr || + stat->joinkeys->values[1] != other_joinkey_attr) + continue; + + /* Check if filter column matches */ + if (bms_num_members(stat->columns) != 1) + continue; + if (!bms_is_member(filter_attr, stat->columns)) + continue; + + /* Found a match! */ + return true; + } + + return false; +} + +/* + * get_functional_dependents + * Find columns in referenced table that are functionally dependent on + * the referenced column. + * + * Scans pg_statistic_ext for the referenced table, loads any functional + * dependency statistics, and returns a list of AttrNumbers for columns + * that have a dependency: referenced_attr → column + * + * Returns: List of AttrNumbers + */ +static List * +get_functional_dependents(Oid referenced_rel, AttrNumber referenced_attr) +{ + List *dependents = NIL; + Relation pg_stext; + SysScanDesc scan; + ScanKeyData key; + HeapTuple tuple; + + /* Open pg_statistic_ext and scan for entries on referenced_rel */ + pg_stext = table_open(StatisticExtRelationId, AccessShareLock); + + ScanKeyInit(&key, + Anum_pg_statistic_ext_stxrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(referenced_rel)); + + scan = systable_beginscan(pg_stext, StatisticExtRelidIndexId, true, + NULL, 1, &key); + + while (HeapTupleIsValid(tuple = systable_getnext(scan))) + { + Form_pg_statistic_ext stat = (Form_pg_statistic_ext) GETSTRUCT(tuple); + MVDependencies *dependencies; + int i; + + /* Check if this stats object has dependencies */ + if (!statext_is_kind_built(tuple, STATS_EXT_DEPENDENCIES)) + continue; + + /* Load dependencies */ + dependencies = statext_dependencies_load(stat->oid, false); + if (!dependencies) + continue; + + /* Scan each dependency for pattern: referenced_attr → other_col */ + for (i = 0; i < dependencies->ndeps; i++) + { + MVDependency *dep = dependencies->deps[i]; + + /* + * Check if this is a simple dependency (2 attributes) where the + * first attribute is our referenced_attr + */ + if (dep->nattributes == 2 && + dep->attributes[0] == referenced_attr) + { + AttrNumber dependent_attr = dep->attributes[1]; + + /* Add to list if not already there */ + if (!list_member_int(dependents, dependent_attr)) + dependents = lappend_int(dependents, dependent_attr); + } + } + + pfree(dependencies); + } + + systable_endscan(scan); + table_close(pg_stext, AccessShareLock); + + return dependents; +} + +/* + * build_implicit_join_mcv_stats_from_fk + * Build implicit FK-based join MCV statistics for the relation being analyzed + */ +static void +build_implicit_join_mcv_stats_from_fk(Relation onerel, List *statslist, + Relation pg_stext, bool inh, + int numrows, HeapTuple *rows, + int natts, VacAttrStats **vacattrstats) +{ + List *candidates; + ListCell *lc_cand; + + elog(DEBUG1, "Join mcv stats: Starting FK-based candidate detection for relation %s", + RelationGetRelationName(onerel)); + + /* Detect candidates based on FK constraints and functional dependencies */ + candidates = detect_join_stats_candidates(onerel); + + foreach(lc_cand, candidates) + { + FKJoinStatsCandidate *candidate = (FKJoinStatsCandidate *) lfirst(lc_cand); + JoinMCVList *join_mcv; + Oid stat_oid; + int2vector *joinkeys; + int2vector *filter_attnums; + int16 joinkeys_array[2]; + int16 filter_array[1]; + + elog(DEBUG1, "Join mcv stats: Checking FK candidate for referencing_rel=%u attr=%d, " + "referenced_rel=%u attr=%d, filter_attr=%d", + candidate->referencing_rel, candidate->referencing_attr, + candidate->referenced_rel, candidate->referenced_attr, + candidate->filter_attr); + + /* + * Skip stat that already exists (either manually created or from a + * previous ANALYZE). + */ + if (statext_join_mcv_exists(statslist, + candidate->referencing_rel, + candidate->referencing_attr, + candidate->referenced_rel, + candidate->referenced_attr, + candidate->filter_attr)) + { + elog(DEBUG1, "Join mcv stats: Stat already exists, skipping FK candidate"); + continue; + } + + elog(DEBUG1, "Join mcv stats: Creating new implicit stat for FK candidate"); + + /* Create implicit statistics object first to get stat_oid */ + stat_oid = create_implicit_join_stat(pg_stext, + candidate->referencing_rel, + candidate->referencing_attr, + candidate->referenced_rel, + candidate->referenced_attr, + candidate->filter_attr); + + /* Construct joinkeys: [referencing_attr, referenced_attr] */ + joinkeys_array[0] = candidate->referencing_attr; + joinkeys_array[1] = candidate->referenced_attr; + joinkeys = buildint2vector(joinkeys_array, 2); + + /* Construct filter_attnums: [filter_attr] */ + filter_array[0] = candidate->filter_attr; + filter_attnums = buildint2vector(filter_array, 1); + + join_mcv = statext_join_mcv_build(stat_oid, + candidate->referencing_rel, + candidate->referenced_rel, + joinkeys, + filter_attnums, + numrows, + rows, + natts, + vacattrstats); + + elog(DEBUG1, "Join mcv stats: Built MCV list with %d items", + join_mcv ? join_mcv->nitems : 0); + + if (!join_mcv) + continue; + + statext_store(stat_oid, inh, + NULL, NULL, NULL, (Datum) 0, NULL, + join_mcv); + + pfree(join_mcv); + } + + list_free_deep(candidates); +} + +/* + * detect_join_stats_candidates + * Scan foreign key constraints on the analyzed table and identify + * candidates for join MCV statistics collection. + * + * For each FK constraint: + * 1. Get the FK column (referencing side) + * 2. Get the referenced table and column + * 3. Find columns in referenced table with functional dependencies + * 4. Create a candidate for each dependent column + * + * Returns: List of FKJoinStatsCandidate structs + */ +static List * +detect_join_stats_candidates(Relation rel) +{ + List *candidates = NIL; + Relation pg_constraint; + SysScanDesc scan; + ScanKeyData key; + HeapTuple tuple; + + /* Open pg_constraint and scan for FK constraints on this table */ + pg_constraint = table_open(ConstraintRelationId, AccessShareLock); + + ScanKeyInit(&key, + Anum_pg_constraint_conrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(RelationGetRelid(rel))); + + scan = systable_beginscan(pg_constraint, ConstraintRelidTypidNameIndexId, + true, NULL, 1, &key); + + while (HeapTupleIsValid(tuple = systable_getnext(scan))) + { + Form_pg_constraint con = (Form_pg_constraint) GETSTRUCT(tuple); + Datum conkey_datum; + Datum confkey_datum; + bool isnull; + ArrayType *conkey_array; + ArrayType *confkey_array; + int numfks; + AttrNumber *conkey; + AttrNumber *confkey; + List *dependents; + ListCell *lc; + int i; + + /* Only interested in foreign key constraints */ + if (con->contype != CONSTRAINT_FOREIGN) + continue; + + /* Get the FK column numbers */ + conkey_datum = heap_getattr(tuple, Anum_pg_constraint_conkey, + RelationGetDescr(pg_constraint), &isnull); + Assert(!isnull); + conkey_array = DatumGetArrayTypeP(conkey_datum); + + /* Get the referenced column numbers */ + confkey_datum = heap_getattr(tuple, Anum_pg_constraint_confkey, + RelationGetDescr(pg_constraint), &isnull); + Assert(!isnull); + confkey_array = DatumGetArrayTypeP(confkey_datum); + + /* Decode the arrays */ + numfks = ARR_DIMS(conkey_array)[0]; + conkey = (AttrNumber *) ARR_DATA_PTR(conkey_array); + confkey = (AttrNumber *) ARR_DATA_PTR(confkey_array); + + /* + * For each FK column, find functional dependents in referenced table + * TODO: For now we only handle single-column FKs + */ + for (i = 0; i < numfks; i++) + { + AttrNumber referencing_attr = conkey[i]; + AttrNumber referenced_attr = confkey[i]; + Oid referenced_rel = con->confrelid; + + /* Find columns dependent on the referenced column */ + dependents = get_functional_dependents(referenced_rel, referenced_attr); + + /* Create a candidate for each dependent column */ + foreach(lc, dependents) + { + AttrNumber filter_attr = lfirst_int(lc); + FKJoinStatsCandidate *candidate; + + candidate = (FKJoinStatsCandidate *) palloc(sizeof(FKJoinStatsCandidate)); + candidate->referencing_rel = RelationGetRelid(rel); + candidate->referencing_attr = referencing_attr; + candidate->referenced_rel = referenced_rel; + candidate->referenced_attr = referenced_attr; + candidate->filter_attr = filter_attr; + + candidates = lappend(candidates, candidate); + } + + list_free(dependents); + } + } + + systable_endscan(scan); + table_close(pg_constraint, AccessShareLock); + + return candidates; +} + +/* + * create_implicit_join_stat + * Create an implicit pg_statistic_ext entry for join MCV statistics + * + * This creates a statistics object in pg_statistic_ext to hold the + * join MCV data. The object is "implicit" in the sense that it's + * automatically created by ANALYZE, not explicitly via CREATE STATISTICS. + * + * Returns the OID of the created statistics object, or InvalidOid if creation + * failed or an equivalent object already exists. + */ +static Oid +create_implicit_join_stat(Relation pg_stext, + Oid referencing_rel, + AttrNumber referencing_attr, + Oid referenced_rel, + AttrNumber referenced_attr, + AttrNumber filter_attr) +{ + NameData stat_name; + Oid stat_oid; + Datum values[Natts_pg_statistic_ext]; + bool nulls[Natts_pg_statistic_ext]; + HeapTuple htup; + Datum kinds[1]; + ArrayType *stxkind; + int16 attnums[1]; + ArrayType *stxkeys; + char namebuf[NAMEDATALEN]; + HeapTuple classtuple; + Form_pg_class classform; + int16 joinkeys[2]; + int2vector *jk; + Oid stxowner; + ObjectAddress myself; + ObjectAddress parentobject; + + /* + * Generate a name for this statistics object in format: + * "_join_mcv_____" + */ + snprintf(namebuf, NAMEDATALEN, "_join_mcv_%u_%d_%u_%d_%d", + referencing_rel, referencing_attr, + referenced_rel, referenced_attr, filter_attr); + namestrcpy(&stat_name, namebuf); + + /* + * Check if a statistics object with this name already exists in this + * namespace. If so, we'll reuse it rather than creating a duplicate. + */ + stat_oid = GetSysCacheOid2(STATEXTNAMENSP, Anum_pg_statistic_ext_oid, + PointerGetDatum(&stat_name), + ObjectIdGetDatum(get_rel_namespace(referencing_rel))); + if (OidIsValid(stat_oid)) + return stat_oid; + + /* Create new statistics object */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + + /* Generate new OID */ + stat_oid = GetNewOidWithIndex(pg_stext, StatisticExtOidIndexId, + Anum_pg_statistic_ext_oid); + + values[Anum_pg_statistic_ext_oid - 1] = ObjectIdGetDatum(stat_oid); + values[Anum_pg_statistic_ext_stxrelid - 1] = ObjectIdGetDatum(referencing_rel); + values[Anum_pg_statistic_ext_stxname - 1] = NameGetDatum(&stat_name); + values[Anum_pg_statistic_ext_stxnamespace - 1] = + ObjectIdGetDatum(get_rel_namespace(referencing_rel)); + + /* Get owner from pg_class */ + classtuple = SearchSysCache1(RELOID, ObjectIdGetDatum(referencing_rel)); + if (!HeapTupleIsValid(classtuple)) + elog(ERROR, "cache lookup failed for relation %u", referencing_rel); + classform = (Form_pg_class) GETSTRUCT(classtuple); + stxowner = classform->relowner; + values[Anum_pg_statistic_ext_stxowner - 1] = ObjectIdGetDatum(stxowner); + ReleaseSysCache(classtuple); + + /* stxkeys contains the filter column (from referenced table) */ + attnums[0] = filter_attr; + stxkeys = construct_array_builtin((Datum *) attnums, 1, INT2OID); + values[Anum_pg_statistic_ext_stxkeys - 1] = PointerGetDatum(stxkeys); + + /* stxkind contains only STATS_EXT_JOIN_MCV */ + kinds[0] = CharGetDatum(STATS_EXT_JOIN_MCV); + stxkind = construct_array_builtin(kinds, 1, CHAROID); + values[Anum_pg_statistic_ext_stxkind - 1] = PointerGetDatum(stxkind); + + /* stxstattarget is NULL (use default) */ + nulls[Anum_pg_statistic_ext_stxstattarget - 1] = true; + + /* stxexprs is NULL (no expressions) */ + nulls[Anum_pg_statistic_ext_stxexprs - 1] = true; + + /* stxotherrel - the referenced/orther table */ + values[Anum_pg_statistic_ext_stxotherrel - 1] = ObjectIdGetDatum(referenced_rel); + + /* stxjoinkeys - join column pairs: [referencing_attr, referenced_attr] */ + joinkeys[0] = referencing_attr; + joinkeys[1] = referenced_attr; + jk = buildint2vector(joinkeys, 2); + values[Anum_pg_statistic_ext_stxjoinkeys - 1] = PointerGetDatum(jk); + + /* Insert the tuple */ + htup = heap_form_tuple(RelationGetDescr(pg_stext), values, nulls); + CatalogTupleInsert(pg_stext, htup); + heap_freetuple(htup); + + /* + * Add dependencies on columns used in the stats, so that the stats object + * goes away if any or all of them get dropped. + */ + ObjectAddressSet(myself, StatisticExtRelationId, stat_oid); + + /* Dependency on filter column (from referenced table) */ + ObjectAddressSubSet(parentobject, RelationRelationId, + referenced_rel, filter_attr); + recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO); + + /* Dependencies on join columns from both tables */ + ObjectAddressSubSet(parentobject, RelationRelationId, + referencing_rel, referencing_attr); + recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO); + + ObjectAddressSubSet(parentobject, RelationRelationId, + referenced_rel, referenced_attr); + recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO); + + /* + * Also add dependencies on namespace and owner. These are required + * because the stats object might have a different namespace and/or owner + * than the underlying table(s). + */ + ObjectAddressSet(parentobject, NamespaceRelationId, + get_rel_namespace(referencing_rel)); + recordDependencyOn(&myself, &parentobject, DEPENDENCY_NORMAL); + + recordDependencyOnOwner(StatisticExtRelationId, stat_oid, stxowner); + + return stat_oid; +} diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h index 982d1babaa8..6a9668b13ec 100644 --- a/src/include/statistics/extended_stats_internal.h +++ b/src/include/statistics/extended_stats_internal.h @@ -68,6 +68,17 @@ typedef struct StatsBuildData bool **nulls; } StatsBuildData; +/* candidate for implicit foreign key join MCV statistics collection */ +typedef struct FKJoinStatsCandidate +{ + Oid referencing_rel; + AttrNumber referencing_attr; /* FK column in referencing table */ + Oid referenced_rel; + AttrNumber referenced_attr; /* PK column in referenced table */ + AttrNumber filter_attr; /* filter column in referenced table + * (dependent on referenced_attr) */ +} FKJoinStatsCandidate; + extern MVNDistinct *statext_ndistinct_build(double totalrows, StatsBuildData *data); extern bytea *statext_ndistinct_serialize(MVNDistinct *ndistinct); extern MVNDistinct *statext_ndistinct_deserialize(bytea *data); diff --git a/src/test/regress/expected/stats_ext_crossrel.out b/src/test/regress/expected/stats_ext_crossrel.out index d41dd37db3f..2be46c87a16 100644 --- a/src/test/regress/expected/stats_ext_crossrel.out +++ b/src/test/regress/expected/stats_ext_crossrel.out @@ -283,6 +283,255 @@ CREATE STATISTICS bad_stats5 (mcv) ON lower(k.keyword) FROM movie_keywords2 mk J ERROR: expressions are not supported in join statistics CREATE STATISTICS bad_stats6 (mcv) ON k.keyword FROM (movie_keywords2 mk JOIN keywords2 k ON (mk.keyword_id = k.id)) JOIN keywords2 k2 ON (k.id = k2.id); ERROR: extended join statistics currently support only simple two-table joins +-- +-- Test automatic collection of join MCV statistics for FK joins +-- with functional dependencies. +-- +CREATE TABLE keywords ( + id INTEGER PRIMARY KEY, + keyword TEXT NOT NULL, + phonetic_code character varying(5) +); +CREATE TABLE movie_keywords ( + movie_id INTEGER PRIMARY KEY, + keyword_id INTEGER NOT NULL REFERENCES keywords(id) +); +-- Insert tightly correlated data into the "referenced" table +INSERT INTO keywords (id, keyword, phonetic_code) +SELECT + i, + 'keyword_' || i, + 'ph_' || i +FROM generate_series(1, 50) i; +-- Insert data into the referencing table with skewed distribution +INSERT INTO movie_keywords (movie_id, keyword_id) +SELECT + i, + CASE + WHEN i % 100 < 60 THEN (i % 10) + 1 -- 60% keyword_ids 1-10 (6% frequency per keyword) + WHEN i % 100 < 90 THEN (i % 10) + 11 -- 30% keyword_ids 11-20 (3% frequency per keyword) + ELSE (i % 10) + 21 -- 10% keyword_ids 21-30 (1% frequency per keyword) + END +FROM generate_series(1, 10000) i; +-- Create functional dependency statistics on the referenced table +CREATE STATISTICS keywords_deps_stat (dependencies) ON id, keyword, phonetic_code FROM keywords; +ANALYZE keywords; +-- Analyze the referencing table to trigger join MCV collection +ANALYZE movie_keywords; +-- Show the join statistics +SELECT s.stxrelid::regclass, + s.stxotherrel::regclass, + s.stxjoinkeys, + s.stxkeys, + s.stxkind, + s.stxstattarget, + s.stxexprs +FROM pg_statistic_ext s +WHERE s.stxrelid = 'movie_keywords'::regclass +ORDER BY s.oid; + stxrelid | stxotherrel | stxjoinkeys | stxkeys | stxkind | stxstattarget | stxexprs +----------------+-------------+-------------+---------+---------+---------------+---------- + movie_keywords | keywords | 2 1 | 2 | {c} | | + movie_keywords | keywords | 2 1 | 3 | {c} | | +(2 rows) + +-- Note: the MCV list items only store the filter values (join values are implicit) +SELECT m.index, + m.values, + m.nulls, + ROUND(m.frequency::numeric, 2) AS frequency +FROM pg_statistic_ext s + JOIN pg_statistic_ext_data d ON (s.oid = d.stxoid) + CROSS JOIN LATERAL pg_join_mcv_list_items(d.stxdjoinmcv) AS m +WHERE s.stxrelid = 'movie_keywords'::regclass + AND 'c' = ANY(s.stxkind) +ORDER BY s.stxname, m.index; + index | values | nulls | frequency +-------+--------------+-------+----------- + 0 | {keyword_1} | {f} | 0.06 + 1 | {keyword_2} | {f} | 0.06 + 2 | {keyword_3} | {f} | 0.06 + 3 | {keyword_4} | {f} | 0.06 + 4 | {keyword_5} | {f} | 0.06 + 5 | {keyword_6} | {f} | 0.06 + 6 | {keyword_7} | {f} | 0.06 + 7 | {keyword_8} | {f} | 0.06 + 8 | {keyword_9} | {f} | 0.06 + 9 | {keyword_10} | {f} | 0.06 + 10 | {keyword_11} | {f} | 0.03 + 11 | {keyword_12} | {f} | 0.03 + 12 | {keyword_13} | {f} | 0.03 + 13 | {keyword_14} | {f} | 0.03 + 14 | {keyword_15} | {f} | 0.03 + 15 | {keyword_16} | {f} | 0.03 + 16 | {keyword_17} | {f} | 0.03 + 17 | {keyword_18} | {f} | 0.03 + 18 | {keyword_19} | {f} | 0.03 + 19 | {keyword_20} | {f} | 0.03 + 20 | {keyword_21} | {f} | 0.01 + 21 | {keyword_22} | {f} | 0.01 + 22 | {keyword_23} | {f} | 0.01 + 23 | {keyword_24} | {f} | 0.01 + 24 | {keyword_25} | {f} | 0.01 + 25 | {keyword_26} | {f} | 0.01 + 26 | {keyword_27} | {f} | 0.01 + 27 | {keyword_28} | {f} | 0.01 + 28 | {keyword_29} | {f} | 0.01 + 29 | {keyword_30} | {f} | 0.01 + 0 | {ph_1} | {f} | 0.06 + 1 | {ph_2} | {f} | 0.06 + 2 | {ph_3} | {f} | 0.06 + 3 | {ph_4} | {f} | 0.06 + 4 | {ph_5} | {f} | 0.06 + 5 | {ph_6} | {f} | 0.06 + 6 | {ph_7} | {f} | 0.06 + 7 | {ph_8} | {f} | 0.06 + 8 | {ph_9} | {f} | 0.06 + 9 | {ph_10} | {f} | 0.06 + 10 | {ph_11} | {f} | 0.03 + 11 | {ph_12} | {f} | 0.03 + 12 | {ph_13} | {f} | 0.03 + 13 | {ph_14} | {f} | 0.03 + 14 | {ph_15} | {f} | 0.03 + 15 | {ph_16} | {f} | 0.03 + 16 | {ph_17} | {f} | 0.03 + 17 | {ph_18} | {f} | 0.03 + 18 | {ph_19} | {f} | 0.03 + 19 | {ph_20} | {f} | 0.03 + 20 | {ph_21} | {f} | 0.01 + 21 | {ph_22} | {f} | 0.01 + 22 | {ph_23} | {f} | 0.01 + 23 | {ph_24} | {f} | 0.01 + 24 | {ph_25} | {f} | 0.01 + 25 | {ph_26} | {f} | 0.01 + 26 | {ph_27} | {f} | 0.01 + 27 | {ph_28} | {f} | 0.01 + 28 | {ph_29} | {f} | 0.01 + 29 | {ph_30} | {f} | 0.01 +(60 rows) + +-- Ensure the join MCV statistics are used for single equality predicates +-- on the filter column of the referenced table +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.keyword = ''keyword_1'' AND k.id = mk.keyword_id +'); + estimated | actual +-----------+-------- + 600 | 600 +(1 row) + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.keyword = ''keyword_15'' AND k.id = mk.keyword_id +'); + estimated | actual +-----------+-------- + 300 | 300 +(1 row) + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.keyword = ''keyword_25'' AND k.id = mk.keyword_id +'); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.phonetic_code = ''ph_1'' AND k.id = mk.keyword_id +'); + estimated | actual +-----------+-------- + 600 | 600 +(1 row) + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.phonetic_code = ''ph_15'' AND k.id = mk.keyword_id +'); + estimated | actual +-----------+-------- + 300 | 300 +(1 row) + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.phonetic_code = ''ph_25'' AND k.id = mk.keyword_id +'); + estimated | actual +-----------+-------- + 100 | 100 +(1 row) + +-- No filter on referenced table, should not use join stats +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.id = mk.keyword_id +'); + estimated | actual +-----------+-------- + 10000 | 10000 +(1 row) + +-- OR Predicates (currently NOT supported by join MCV stats) +-- Expected: Will likely underestimate +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE (k.keyword = ''keyword_1'' OR k.keyword = ''keyword_2'') + AND k.id = mk.keyword_id +'); + estimated | actual +-----------+-------- + 400 | 1200 +(1 row) + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE (k.keyword = ''keyword_1'' OR k.keyword = ''keyword_15'') + AND k.id = mk.keyword_id +'); + estimated | actual +-----------+-------- + 400 | 900 +(1 row) + +-- Ensure the join MCV statistics are used for IN predicates +-- on the filter column of the referenced table +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.keyword IN (''keyword_1'', ''keyword_2'', ''keyword_3'') + AND k.id = mk.keyword_id +'); + estimated | actual +-----------+-------- + 1800 | 1800 +(1 row) + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.keyword IN (''keyword_1'', ''keyword_15'', ''keyword_25'') + AND k.id = mk.keyword_id +'); + estimated | actual +-----------+-------- + 1000 | 1000 +(1 row) + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.phonetic_code IN (''ph_1'', ''ph_15'') + AND k.id = mk.keyword_id +'); + estimated | actual +-----------+-------- + 900 | 900 +(1 row) + -- Cleanup DROP TABLE movie_keywords2 CASCADE; DROP TABLE keywords2 CASCADE; +DROP TABLE movie_keywords CASCADE; +DROP TABLE keywords CASCADE; diff --git a/src/test/regress/sql/stats_ext_crossrel.sql b/src/test/regress/sql/stats_ext_crossrel.sql index b771a30f34d..4ba9a9a926c 100644 --- a/src/test/regress/sql/stats_ext_crossrel.sql +++ b/src/test/regress/sql/stats_ext_crossrel.sql @@ -167,6 +167,146 @@ CREATE STATISTICS bad_stats4 (mcv) ON keyword FROM movie_keywords2 mk JOIN keywo CREATE STATISTICS bad_stats5 (mcv) ON lower(k.keyword) FROM movie_keywords2 mk JOIN keywords2 k ON (mk.keyword_id = k.id); CREATE STATISTICS bad_stats6 (mcv) ON k.keyword FROM (movie_keywords2 mk JOIN keywords2 k ON (mk.keyword_id = k.id)) JOIN keywords2 k2 ON (k.id = k2.id); +-- +-- Test automatic collection of join MCV statistics for FK joins +-- with functional dependencies. +-- + +CREATE TABLE keywords ( + id INTEGER PRIMARY KEY, + keyword TEXT NOT NULL, + phonetic_code character varying(5) +); + +CREATE TABLE movie_keywords ( + movie_id INTEGER PRIMARY KEY, + keyword_id INTEGER NOT NULL REFERENCES keywords(id) +); + +-- Insert tightly correlated data into the "referenced" table +INSERT INTO keywords (id, keyword, phonetic_code) +SELECT + i, + 'keyword_' || i, + 'ph_' || i +FROM generate_series(1, 50) i; + +-- Insert data into the referencing table with skewed distribution +INSERT INTO movie_keywords (movie_id, keyword_id) +SELECT + i, + CASE + WHEN i % 100 < 60 THEN (i % 10) + 1 -- 60% keyword_ids 1-10 (6% frequency per keyword) + WHEN i % 100 < 90 THEN (i % 10) + 11 -- 30% keyword_ids 11-20 (3% frequency per keyword) + ELSE (i % 10) + 21 -- 10% keyword_ids 21-30 (1% frequency per keyword) + END +FROM generate_series(1, 10000) i; + +-- Create functional dependency statistics on the referenced table +CREATE STATISTICS keywords_deps_stat (dependencies) ON id, keyword, phonetic_code FROM keywords; +ANALYZE keywords; + +-- Analyze the referencing table to trigger join MCV collection +ANALYZE movie_keywords; + +-- Show the join statistics +SELECT s.stxrelid::regclass, + s.stxotherrel::regclass, + s.stxjoinkeys, + s.stxkeys, + s.stxkind, + s.stxstattarget, + s.stxexprs +FROM pg_statistic_ext s +WHERE s.stxrelid = 'movie_keywords'::regclass +ORDER BY s.oid; + +-- Note: the MCV list items only store the filter values (join values are implicit) +SELECT m.index, + m.values, + m.nulls, + ROUND(m.frequency::numeric, 2) AS frequency +FROM pg_statistic_ext s + JOIN pg_statistic_ext_data d ON (s.oid = d.stxoid) + CROSS JOIN LATERAL pg_join_mcv_list_items(d.stxdjoinmcv) AS m +WHERE s.stxrelid = 'movie_keywords'::regclass + AND 'c' = ANY(s.stxkind) +ORDER BY s.stxname, m.index; + +-- Ensure the join MCV statistics are used for single equality predicates +-- on the filter column of the referenced table +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.keyword = ''keyword_1'' AND k.id = mk.keyword_id +'); + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.keyword = ''keyword_15'' AND k.id = mk.keyword_id +'); + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.keyword = ''keyword_25'' AND k.id = mk.keyword_id +'); + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.phonetic_code = ''ph_1'' AND k.id = mk.keyword_id +'); + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.phonetic_code = ''ph_15'' AND k.id = mk.keyword_id +'); + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.phonetic_code = ''ph_25'' AND k.id = mk.keyword_id +'); + +-- No filter on referenced table, should not use join stats +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.id = mk.keyword_id +'); + +-- OR Predicates (currently NOT supported by join MCV stats) +-- Expected: Will likely underestimate +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE (k.keyword = ''keyword_1'' OR k.keyword = ''keyword_2'') + AND k.id = mk.keyword_id +'); + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE (k.keyword = ''keyword_1'' OR k.keyword = ''keyword_15'') + AND k.id = mk.keyword_id +'); + +-- Ensure the join MCV statistics are used for IN predicates +-- on the filter column of the referenced table +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.keyword IN (''keyword_1'', ''keyword_2'', ''keyword_3'') + AND k.id = mk.keyword_id +'); + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.keyword IN (''keyword_1'', ''keyword_15'', ''keyword_25'') + AND k.id = mk.keyword_id +'); + +SELECT * FROM check_estimated_rows(' + SELECT * FROM movie_keywords mk, keywords k + WHERE k.phonetic_code IN (''ph_1'', ''ph_15'') + AND k.id = mk.keyword_id +'); + -- Cleanup DROP TABLE movie_keywords2 CASCADE; DROP TABLE keywords2 CASCADE; +DROP TABLE movie_keywords CASCADE; +DROP TABLE keywords CASCADE; -- 2.50.1 (Apple Git-155)