From 4881b97548ed6fa8acbef153562da617ea7e58cb Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@pgaddict.com>
Date: Fri, 16 Jan 2015 22:33:41 +0100
Subject: [PATCH 2/4] clause reduction using functional dependencies

During planning, use functional dependencies to decide
which clauses to skip during cardinality estimation.
Initial and rather simplistic implementation.
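
For illustration, a minimal usage sketch (mirroring the syntax exercised
by the regression tests added in this patch; the table and column names
are made up):

    CREATE TABLE t (a INT, b INT);
    ALTER TABLE t ADD STATISTICS (dependencies) ON (a, b);
    -- 'a' determines 'b' in this data set
    INSERT INTO t SELECT i/100, i/200 FROM generate_series(1, 10000) s(i);
    ANALYZE t;
    -- inspect the dependencies mined by ANALYZE
    SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
      FROM pg_mv_statistic WHERE starelid = 't'::regclass;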

FIX: second part of the rename to functional dependencies
FIX: don't build functional dependencies by default
FIX: build deps only when requested
FIX: use treat_as_join_clause() in clause_is_mv_compatible()

We don't want to process clauses that are used for joining,
but only simple WHERE clauses.
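
For example (an illustrative sketch, not taken from the patch):

    -- considered: simple restriction clauses of the form "column op constant"
    SELECT * FROM t WHERE a = 1 AND b < 10;

    -- skipped: join clauses, which treat_as_join_clause() filters out
    SELECT * FROM t1 JOIN t2 ON (t1.a = t2.a);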

FIX: use planner_rt_fetch() to identify relation

clause_is_mv_compatible() needs to identify the relation (so that
we can fetch the list of multivariate stats by OID).
planner_rt_fetch() seems like the appropriate way to get the
relation OID, but apparently it only works with simple Vars.
Maybe using examine_variable() would make this work with more
complex Vars too?

FIX: comment about functional dependencies and transitivity
FIX: comment about multi-column functional dependencies
FIX: test: functional dependencies / ANALYZE

Test analyzing functional dependencies (part of ANALYZE)
on several datasets (no dependencies, no transitive
dependencies, ...).
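
The data sets come from the new regression test; the integer divisions
and moduli determine which dependencies hold (columns are a, b, c):

    -- no dependencies (combinations of unrelated moduli)
    INSERT INTO functional_dependencies
         SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);

    -- a => b, a => c, b => c
    INSERT INTO functional_dependencies
         SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);

    -- a => b, a => c (but not b => c)
    INSERT INTO functional_dependencies
         SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);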

FIX: test: clause reduction using function dependencies / EXPLAIN

Checks that a query with conditions on two columns, where one (B)
is functionally dependent on the other one (A), correctly ignores
the clause on (B) and chooses a bitmap index scan instead of a plain
index scan (which is what happens otherwise, due to the assumption
of independence).
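
A condensed version of that scenario, as exercised by the new regression
test (the expected plan is shown as comments):

    CREATE INDEX fdeps_idx ON functional_dependencies (a, b);
    ANALYZE functional_dependencies;

    EXPLAIN (COSTS off)
     SELECT * FROM functional_dependencies WHERE a = 10 AND b = 5;
    -- Bitmap Heap Scan on functional_dependencies
    --   Recheck Cond: ((a = 10) AND (b = 5))
    --   ->  Bitmap Index Scan on fdeps_idx
    --         Index Cond: ((a = 10) AND (b = 5))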

FIX: comment about building multi-column dependencies (TODO)
FIX: support functional dependencies on all data types

Until now build_mv_dependencies() only supported data types
passed by value (i.e. not varlena types or types passed by
reference). This commit adds support for these data types
by using SortSupport to do the sorting.

However, this keeps the 'typbyval' assert in common.c:

    Assert(stats[i]->attrtype->typbyval);

because that code is shared by all types of multivariate stats
and we don't want to change the behavior for all of them yet.
If you want to play with functional dependencies on columns
of such data types, comment this assert out.
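
The new regression test exercises this with TEXT columns, roughly like
this (taken from the test, using the same generate_series trick as above):

    CREATE TABLE functional_dependencies (a TEXT, b TEXT, c TEXT);
    ALTER TABLE functional_dependencies ADD STATISTICS (dependencies) ON (a, b, c);
    INSERT INTO functional_dependencies
         SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
    ANALYZE functional_dependencies;   -- sorts the text values using SortSupport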

FIX: support NULL values in functional dependencies
FIX: typo in regression test of functional dependencies
FIX: added regression test for functional dependencies with TEXT
FIX: rework build_mv_dependencies() not to fail with mixed columns
FIX: readability improvement in build_mv_dependencies()
FIX: readability fixes in build_mv_dependencies()
FIX: regression test - dependencies with mix of data types / NULLs
FIX: minor formatting fixes in build_mv_dependencies()
FIX: comment about efficient building of multi-column dependencies
FIX: comment about proper NULL handling in build_mv_dependencies()
FIX: minor comment in build_mv_dependencies()
FIX: comment about handling NULLs like regular values (dependencies)
FIX: explanation of allocations in build_mv_dependencies()
FIX: move multisort typedefs/functions to common.h/c
FIX: check that at least some statistics were requested (dependencies)
FIX: comment about handling NULL values in dependencies
FIX: minor improvements in mvstat.h (functional dependencies)
FIX: add regression test for ADD STATISTICS options (dependencies)
FIX: comment about multivariate stats at clauselist_selectivity()
FIX: updated comments in clausesel.c (dependencies)
FIX: note in clauselist_selectivity()
FIX: fixed typo in tablecmds.c (comma -> semicolon)
FIX: make regression tests parallel-happy (functional dependencies)
---
 src/backend/commands/tablecmds.c              |   8 +-
 src/backend/optimizer/path/clausesel.c        | 476 +++++++++++++++++++++++++-
 src/backend/utils/mvstats/common.c            |  86 ++++-
 src/backend/utils/mvstats/common.h            |  22 ++
 src/backend/utils/mvstats/dependencies.c      | 170 +++++++--
 src/include/utils/mvstats.h                   |  23 +-
 src/test/regress/expected/mv_dependencies.out | 175 ++++++++++
 src/test/regress/parallel_schedule            |   3 +
 src/test/regress/serial_schedule              |   1 +
 src/test/regress/sql/mv_dependencies.sql      | 153 +++++++++
 10 files changed, 1076 insertions(+), 41 deletions(-)
 create mode 100644 src/test/regress/expected/mv_dependencies.out
 create mode 100644 src/test/regress/sql/mv_dependencies.sql

diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 3ec1a5a..3c82b89 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -11651,7 +11651,7 @@ static void ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
 	Relation	mvstatrel;
 
 	/* by default build everything */
-	bool 	build_dependencies = true;
+	bool 	build_dependencies = false;
 
 	Assert(IsA(def, StatisticsDef));
 
@@ -11713,6 +11713,12 @@ static void ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
 							opt->defname)));
 	}
 
+	/* check that at least some statistics were requested */
+	if (! build_dependencies)
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("no statistics type (dependencies) was requested")));
+
 	/* sort the attnums and build int2vector */
 	qsort(attnums, numcols, sizeof(int16), compare_int16);
 	stakeys = buildint2vector(attnums, numcols);
diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c
index dcac1c1..36e5bce 100644
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -24,6 +24,12 @@
 #include "utils/lsyscache.h"
 #include "utils/selfuncs.h"
 
+#include "utils/mvstats.h"
+#include "catalog/pg_collation.h"
+#include "utils/typcache.h"
+
+#include "parser/parsetree.h"
+
 
 /*
  * Data structure for accumulating info about possible range-query
@@ -43,6 +49,16 @@ static void addRangeClause(RangeQueryClause **rqlist, Node *clause,
 			   bool varonleft, bool isLTsel, Selectivity s2);
 
 
+static bool clause_is_mv_compatible(PlannerInfo *root, Node *clause, Oid varRelid,
+							 Oid *relid, AttrNumber *attnum, SpecialJoinInfo *sjinfo);
+
+static Bitmapset  *collect_mv_attnums(PlannerInfo *root, List *clauses,
+									  Oid varRelid, Oid *relid, SpecialJoinInfo *sjinfo);
+
+static List *clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
+								Oid varRelid, int nmvstats, MVStats mvstats,
+								SpecialJoinInfo *sjinfo);
+
 /****************************************************************************
  *		ROUTINES TO COMPUTE SELECTIVITIES
  ****************************************************************************/
@@ -61,7 +77,7 @@ static void addRangeClause(RangeQueryClause **rqlist, Node *clause,
  * subclauses.  However, that's only right if the subclauses have independent
  * probabilities, and in reality they are often NOT independent.  So,
  * we want to be smarter where we can.
-
+ *
  * Currently, the only extra smarts we have is to recognize "range queries",
  * such as "x > 34 AND x < 42".  Clauses are recognized as possible range
  * query components if they are restriction opclauses whose operators have
@@ -88,6 +104,76 @@ static void addRangeClause(RangeQueryClause **rqlist, Node *clause,
  *
  * Of course this is all very dependent on the behavior of
  * scalarltsel/scalargtsel; perhaps some day we can generalize the approach.
+ *
+ *
+ * Multivariate statistics
+ * -----------------------
+ * This also uses multivariate stats to estimate combinations of conditions,
+ * in a way that attempts to minimize the overhead when there are no
+ * suitable multivariate stats.
+ *
+ * The following checks are performed (in this order), and the optimizer
+ * falls back to regular stats on the first 'false'.
+ *
+ * NOTE: This explains how this works with all the patches applied, not
+ *       just the functional dependencies.
+ *
+ * (1) check that at least two columns are referenced from conditions
+ *     compatible with multivariate stats
+ *
+ *     If there are no conditions that might be handled by multivariate
+ *     stats, or if the conditions reference just a single column, it
+ *     makes no sense to use multivariate stats.
+ *
+ *     What conditions are compatible with multivariate stats is decided
+ *     by clause_is_mv_compatible(). At this moment, only simple conditions
+ *     of the form "column operator constant" (for simple comparison
+ *     operators), and IS NULL / IS NOT NULL are considered compatible
+ *     with multivariate statistics.
+ *
+ * (2) reduce the clauses using functional dependencies
+ *
+ *     This simply attempts to 'reduce' the clauses by applying functional
+ *     dependencies. For example if there are two clauses:
+ *
+ *         WHERE (a = 1) AND (b = 2)
+ *
+ *     and we know that 'a' determines the value of 'b', we may remove
+ *     the second condition (b = 2) when computing the selectivity.
+ *     This is of course tricky - see mvstats/dependencies.c for details.
+ *
+ *     After the reduction, step (1) is to be repeated.
+ *
+ * (3) check if there are multivariate stats built on the columns
+ *
+ *     If there are no multivariate statistics, we have to fall back to
+ *     the regular stats. We might perform checks (1) and (3) in the
+ *     reverse order, i.e. first check if there are multivariate
+ *     statistics and then collect the attributes only if needed. The
+ *     assumption is that checking the clauses is cheaper than querying
+ *     the catalog, so the clause check is performed first.
+ *
+ * (4) choose the stats matching the most columns (at least two)
+ *
+ *     If there are multiple instances of multivariate statistics (e.g.
+ *     built on different sets of columns), we choose the stats covering
+ *     the most columns from step (1). It may happen that all available
+ *     stats match just a single column - for example with conditions
+ *
+ *         WHERE a = 1 AND b = 2
+ *
+ *     and statistics built on (a,c) and (b,c). In such case just fall
+ *     back to the regular stats because it makes no sense to use the
+ *     multivariate statistics.
+ *
+ *     This selection criterion (the most columns) is certainly very
+ *     simple and definitely not optimal - it's easy to come up with
+ *     examples where other approaches work better. More about this
+ *     at choose_mv_statistics().
+ *
+ * (5) use the multivariate stats to estimate matching clauses
+ *
+ * (6) estimate the remaining clauses using the regular statistics
  */
 Selectivity
 clauselist_selectivity(PlannerInfo *root,
@@ -100,6 +186,14 @@ clauselist_selectivity(PlannerInfo *root,
 	RangeQueryClause *rqlist = NULL;
 	ListCell   *l;
 
+	/* processing mv stats */
+	Oid			relid = InvalidOid;
+	int			nmvstats = 0;
+	MVStats		mvstats = NULL;
+
+	/* attributes in mv-compatible clauses */
+	Bitmapset  *mvattnums = NULL;
+
 	/*
 	 * If there's exactly one clause, then no use in trying to match up pairs,
 	 * so just go directly to clause_selectivity().
@@ -108,6 +202,28 @@ clauselist_selectivity(PlannerInfo *root,
 		return clause_selectivity(root, (Node *) linitial(clauses),
 								  varRelid, jointype, sjinfo);
 
+	/* collect attributes referenced by mv-compatible clauses */
+	mvattnums = collect_mv_attnums(root, clauses, varRelid, &relid, sjinfo);
+
+	/*
+	 * If there are mv-compatible clauses, referencing at least two
+	 * different columns (otherwise it makes no sense to use mv stats),
+	 * try to reduce the clauses using functional dependencies, and
+	 * recollect the attributes from the reduced list.
+	 *
+	 * We don't need to select a single statistics object for this - we
+	 * can apply all the functional dependencies we have.
+	 */
+	if (bms_num_members(mvattnums) >= 2)
+	{
+		/* fetch info from the catalog (not the serialized stats yet) */
+		mvstats = list_mv_stats(relid, &nmvstats, true);
+
+		/* reduce clauses by applying functional dependencies rules */
+		clauses = clauselist_apply_dependencies(root, clauses, varRelid,
+												nmvstats, mvstats, sjinfo);
+	}
+
 	/*
 	 * Initial scan over clauses.  Anything that doesn't look like a potential
 	 * rangequery clause gets multiplied into s1 and forgotten. Anything that
@@ -782,3 +898,361 @@ clause_selectivity(PlannerInfo *root,
 
 	return s1;
 }
+
+/*
+ * Collect attributes from mv-compatible clauses.
+ * 
+static Bitmapset *
+collect_mv_attnums(PlannerInfo *root, List *clauses, Oid varRelid,
+				   Oid *relid, SpecialJoinInfo *sjinfo)
+{
+	Bitmapset  *attnums = NULL;
+	ListCell   *l;
+
+	/*
+	 * Walk through the clauses and identify the ones we can estimate
+	 * using multivariate stats, and remember the relid/columns. We'll
+	 * then cross-check if we have suitable stats, and only if needed
+	 * we'll split the clauses into multivariate and regular lists.
+	 *
+	 * For now we're only interested in RestrictInfo nodes with nested
+	 * OpExpr, using either a range or equality.
+	 */
+	foreach (l, clauses)
+	{
+		AttrNumber attnum;
+		Node	   *clause = (Node *) lfirst(l);
+
+		/* ignore the result for now - we only need the info */
+		if (clause_is_mv_compatible(root, clause, varRelid, relid, &attnum, sjinfo))
+			attnums = bms_add_member(attnums, attnum);
+	}
+
+	/*
+	 * If there are not at least two attributes referenced by the clause(s),
+	 * we can throw everything out (as we'll revert to simple stats).
+	 */
+	if (bms_num_members(attnums) <= 1)
+	{
+		if (attnums != NULL)
+			pfree(attnums);
+		attnums = NULL;
+		*relid = InvalidOid;
+	}
+
+	return attnums;
+}
+
+/*
+ * Determines whether the clause is compatible with multivariate stats,
+ * and if it is, returns some additional information - the OID of the
+ * relation and the attribute number referenced by the clause. This is
+ * then used to fetch related multivariate statistics.
+ *
+ * At this moment we only support basic conditions of the form
+ * 
+ *     variable OP constant
+ *
+ * where OP is one of [=,<,<=,>=,>] (determined by looking at the
+ * restriction selectivity estimator associated with the operator, just
+ * like in the single-dimensional case).
+ *
+ * TODO Support 'OR clauses' - shouldn't be all that difficult to
+ *      evaluate them using multivariate stats.
+ */
+static bool
+clause_is_mv_compatible(PlannerInfo *root, Node *clause, Oid varRelid,
+						Oid *relid, AttrNumber *attnum, SpecialJoinInfo *sjinfo)
+{
+
+	if (IsA(clause, RestrictInfo))
+	{
+		RestrictInfo *rinfo = (RestrictInfo *) clause;
+
+		/* Pseudoconstants are not really interesting here. */
+		if (rinfo->pseudoconstant)
+			return false;
+
+		/* no support for OR clauses at this point */
+		if (rinfo->orclause)
+			return false;
+
+		/* get the actual clause from the RestrictInfo (it's not an OR clause) */
+		clause = (Node*)rinfo->clause;
+
+		/* only simple opclauses are compatible with multivariate stats */
+		if (! is_opclause(clause))
+			return false;
+
+		/* we don't support join conditions at this moment */
+		if (treat_as_join_clause(clause, rinfo, varRelid, sjinfo))
+			return false;
+
+		/* is it 'variable op constant' ? */
+		if (list_length(((OpExpr *) clause)->args) == 2)
+		{
+			OpExpr	   *expr = (OpExpr *) clause;
+			bool		varonleft = true;
+			bool		ok;
+
+			ok = (bms_membership(rinfo->clause_relids) == BMS_SINGLETON) &&
+				(is_pseudo_constant_clause_relids(lsecond(expr->args),
+												rinfo->right_relids) ||
+				(varonleft = false,
+				is_pseudo_constant_clause_relids(linitial(expr->args),
+												rinfo->left_relids)));
+
+			if (ok)
+			{
+				RangeTblEntry * rte;
+				Var * var = (varonleft) ? linitial(expr->args) : lsecond(expr->args);
+
+				/*
+				 * Simple variables only - otherwise planner_rt_fetch() seems to fail
+				 * (return NULL).
+				 *
+				 * TODO Maybe using examine_variable() would fix that?
+				 */
+				if (! (IsA(var, Var) && (varRelid == 0 || varRelid == var->varno)))
+					return false;
+
+				/*
+				 * Only consider this variable if (varRelid == 0) or when the varno
+				 * matches varRelid (see explanation at clause_selectivity).
+				 *
+				 * FIXME I suspect this may not really be necessary. The (varRelid == 0)
+				 *       part seems to be enforced by treat_as_join_clause().
+				 */
+				if (! ((varRelid == 0) || (varRelid == var->varno)))
+					return false;
+
+				/* Also skip special varno values, and system attributes ... */
+				if ((IS_SPECIAL_VARNO(var->varno)) || (! AttrNumberIsForUserDefinedAttr(var->varattno)))
+					return false;
+
+				/* Lookup info about the base relation (we need to pass the OID out) */
+				rte = planner_rt_fetch(var->varno, root);
+				*relid = rte->relid;
+
+				/*
+				 * If it's not a "<" or ">" or "=" operator, just ignore the
+				 * clause. Otherwise note the relid and attnum for the variable.
+				 * This uses the function for estimating selectivity, not the
+				 * operator directly (a bit awkward, but well ...).
+				 */
+				switch (get_oprrest(expr->opno))
+				{
+					case F_SCALARLTSEL:
+					case F_SCALARGTSEL:
+					case F_EQSEL:
+						*attnum = var->varattno;
+						return true;
+				}
+			}
+		}
+	}
+
+	return false;
+
+}
+
+/*
+ * Performs reduction of clauses using functional dependencies, i.e.
+ * removes clauses that are considered redundant. It simply walks
+ * through dependencies, and checks whether the dependency 'matches'
+ * the clauses, i.e. if there's a clause matching the condition. If yes,
+ * all clauses matching the implied part of the dependency are removed
+ * from the list.
+ *
+ * This simply looks at attnums referenced by the clauses, not at the
+ * type of the operator (equality, inequality, ...). This may not be the
+ * right way to do it - it certainly works best for equalities, which is
+ * naturally consistent with functional dependencies (implications).
+ * It's not clear that other operators are handled sensibly - for
+ * example for inequalities, like
+ *
+ *     WHERE (A >= 10) AND (B <= 20)
+ *
+ * and a trivial case where [A == B], resulting in a symmetric pair of
+ * rules [A => B], [B => A], it's rather clear we can't remove either of
+ * those clauses.
+ *
+ * That only highlights that functional dependencies are most suitable
+ * for label-like data, where using non-equality operators is very rare.
+ * Using the common city/zipcode example, clauses like
+ *
+ *     (zipcode <= 12345)
+ *
+ * or
+ *
+ *     (cityname >= 'Washington')
+ *
+ * are rare. So restricting the reduction to equality should not harm
+ * the usefulness / applicability.
+ *
+ * Another caveat is that this assumes 'compatible' clauses. For
+ * example with a mismatched zip code and city name, this is unable
+ * to identify the discrepancy and still eliminates one of the clauses. The
+ * usual approach (multiplying both selectivities) thus produces a more
+ * accurate estimate, although mostly by luck - the multiplication
+ * comes from the assumption of statistical independence of the two
+ * conditions (which is not valid in this case), but moves the
+ * estimate in the right direction (towards 0%).
+ *
+ * This might be somewhat improved by cross-checking the selectivities
+ * against MCV and/or histogram.
+ *
+ * The implementation needs to be careful about cyclic rules, i.e. rules
+ * like [A => B] and [B => A] at the same time. This must not reduce
+ * clauses on both attributes at the same time.
+ *
+ * Technically we might consider selectivities here too, somehow. E.g.
+ * when (A => B) and (B => A), we might use the clauses with minimum
+ * selectivity.
+ *
+ * TODO Consider restricting the reduction to equality clauses. Or maybe
+ *      use equality classes somehow?
+ *
+ * TODO Merge this docs to dependencies.c, as it's saying mostly the
+ *      same things as the comments there.
+ */
+static List *
+clauselist_apply_dependencies(PlannerInfo *root, List *clauses, Oid varRelid,
+					int nmvstats, MVStats mvstats, SpecialJoinInfo *sjinfo)
+{
+	int i;
+	ListCell *lc;
+	List * reduced_clauses = NIL;
+	Oid	relid;
+
+	/*
+	 * preallocate space for all clauses, including non-mv-compatible,
+	 * so that we don't need to reallocate the arrays repeatedly
+	 */
+	bool	   *reduced   = (bool*)palloc0(list_length(clauses) * sizeof(bool));
+	AttrNumber *mvattnums = (AttrNumber*)palloc0(list_length(clauses) * sizeof(AttrNumber));
+	Node	  **mvclauses = (Node**)palloc0(list_length(clauses) * sizeof(Node*));
+	int nmvclauses = 0;	/* number of mv-compatible clauses */
+
+	/*
+	 * Walk through the clauses - copy the ones that are not mv-compatible
+	 * directly into the result list, and store the mv-compatible ones into
+	 * an array of clauses (remembering the attnum in another array).
+	 */
+	foreach (lc, clauses)
+	{
+		AttrNumber attnum;
+		Node	   *clause = (Node *) lfirst(lc);
+		if (! clause_is_mv_compatible(root, clause, varRelid, &relid, &attnum, sjinfo))
+			reduced_clauses = lappend(reduced_clauses, clause);
+		else
+		{
+			mvclauses[nmvclauses] = clause;
+			mvattnums[nmvclauses] = attnum;
+			nmvclauses++;
+		}
+	}
+
+	Assert(nmvclauses >= 2);
+
+	/* walk through all the mvstats, and try to apply all the rules */
+	for (i = 0; i < nmvstats; i++)
+	{
+		int j;
+		MVDependencies dependencies = NULL;
+
+		/* skip stats without functional dependencies built */
+		if (! mvstats[i].deps_built)
+			continue;
+
+		/* fetch dependencies */
+		dependencies = deserialize_mv_dependencies(fetch_mv_dependencies(mvstats[i].mvoid));
+		if (dependencies == NULL)
+			continue;
+
+		/*
+		 * Walk through the dependencies and eliminate all the implied
+		 * clauses, i.e. when there's a rule [A => B], and if we find
+		 * a clause referencing column A (not yet eliminated), eliminate
+		 * all clauses referencing "B".
+		 *
+		 * This is imperfect for a number of reasons. First, this greedy
+		 * approach does not guarantee eliminating the most clauses.
+		 * For example consider dependency [A => B] and [B => A], and
+		 * three clauses referencing A, A and B, i.e. something like
+		 *
+		 *     WHERE (A >= 10) AND (A <= 20) AND (B = 20)
+		 *
+		 * Then by considering the dependency [A => B] a single clause
+		 * on B is eliminated, while by considering [B => A], both
+		 * clauses on A are eliminated.
+		 *
+		 * Which of those happens depends on the order in which the
+		 * dependencies are applied - that order may come either from the
+		 * ordering within a single pg_mv_statistic record, or from the
+		 * rules being placed in different records.
+		 *
+		 * Possible solutions:
+		 *
+		 * (a) backtracking/recursion, with tracking of how many clauses
+		 *     were eliminated
+		 *
+		 * (b) building adjacency matrix (where A and B are adjacent
+		 *     when [A => B]), and multiplying it to construct
+		 *     transitive implications. I.e. by having [A=>B] and [B=>C]
+		 *     this also results in [A=>C]. Then we can simply choose
+		 *     the attribute that eliminates the most clauses (and
+		 *     repeat).
+		 *
+		 * We don't expect to have enough clauses for this to result in long
+		 * runtimes.
+		 *
+		 * This may also merge all the dependencies, possibly leading to
+		 * longer sequences of transitive dependencies.
+		 *
+		 * E.g. rule [A=>B] in one pg_mv_statistic record and [B=>C] in
+		 * another one results in [A=>C], which can't be deduced if the
+		 * records are considered separately.
+		 */
+		for (j = 0; j < dependencies->ndeps; j++)
+		{
+			int k;
+			bool applicable = false;
+
+			/* is there a clause on 'A' that was not yet eliminated? */
+			for (k = 0; k < nmvclauses; k++)
+			{
+				/* clause on 'A' and not yet eliminated */
+				if ((! reduced[k]) && (mvattnums[k] == dependencies->deps[j]->a))
+				{
+					applicable = true; /* we can apply this rule */
+					break;
+				}
+			}
+
+			/* if the rule is not applicable, skip to the next one */
+			if (! applicable)
+				continue;
+
+			/* eliminate all clauses on 'B' */
+			for (k = 0; k < nmvclauses; k++)
+			{
+				if (mvattnums[k] == dependencies->deps[j]->b)
+					reduced[k] = true;
+			}
+		}
+	}
+
+	/* now walk through the clauses, and keep those that were not reduced */
+	for (i = 0; i < nmvclauses; i++)
+	{
+		if (! reduced[i])
+			reduced_clauses = lappend(reduced_clauses, mvclauses[i]);
+	}
+
+	pfree(reduced);
+	pfree(mvclauses);
+	pfree(mvattnums);
+
+	return reduced_clauses;
+}
diff --git a/src/backend/utils/mvstats/common.c b/src/backend/utils/mvstats/common.c
index 36757d5..0edaaa6 100644
--- a/src/backend/utils/mvstats/common.c
+++ b/src/backend/utils/mvstats/common.c
@@ -50,7 +50,8 @@ build_mv_stats(Relation onerel, int numrows, HeapTuple *rows,
 		/*
 		 * Analyze functional dependencies of columns.
 		 */
-		deps = build_mv_dependencies(numrows, rows, attrs, natts, vacattrstats);
+		if (mvstats[i].deps_enabled)
+			deps = build_mv_dependencies(numrows, rows, attrs, natts, vacattrstats);
 
 		/* store the histogram / MCV list in the catalog */
 		update_mv_stats(mvstats[i].mvoid, deps);
@@ -147,6 +148,7 @@ list_mv_stats(Oid relid, int *nstats, bool built_only)
 
 		result[*nstats].mvoid = HeapTupleGetOid(htup);
 		result[*nstats].stakeys = buildint2vector(stats->stakeys.values, stats->stakeys.dim1);
+		result[*nstats].deps_enabled = stats->deps_enabled;
 		result[*nstats].deps_built = stats->deps_built;
 		*nstats += 1;
 	}
@@ -270,3 +272,85 @@ compare_scalars_memcmp_2(const void *a, const void *b)
 {
 	return memcmp(a, b, sizeof(Datum));
 }
+
+
+/* initialize multi-dimensional sort */
+MultiSortSupport
+multi_sort_init(int ndims)
+{
+	MultiSortSupport mss;
+
+	Assert(ndims >= 2);
+
+	mss = (MultiSortSupport)palloc0(offsetof(MultiSortSupportData, ssup)
+									+ sizeof(SortSupportData)*ndims);
+
+	mss->ndims = ndims;
+
+	return mss;
+}
+
+/*
+ * add sort info for dimension 'dim' (index into vacattrstats) to mss,
+ * at position 'sortdim'
+ */
+void
+multi_sort_add_dimension(MultiSortSupport mss, int sortdim,
+						int dim, VacAttrStats **vacattrstats)
+{
+	/* first, lookup StdAnalyzeData for the dimension (attribute) */
+	SortSupportData ssup;
+	StdAnalyzeData *tmp = (StdAnalyzeData *)vacattrstats[dim]->extra_data;
+
+	Assert(mss != NULL);
+	Assert(sortdim < mss->ndims);
+
+	/* initialize sort support, etc. */
+	memset(&ssup, 0, sizeof(ssup));
+	ssup.ssup_cxt = CurrentMemoryContext;
+
+	/* We always use the default collation for statistics */
+	ssup.ssup_collation = DEFAULT_COLLATION_OID;
+	ssup.ssup_nulls_first = false;
+
+	PrepareSortSupportFromOrderingOp(tmp->ltopr, &ssup);
+
+	mss->ssup[sortdim] = ssup;
+}
+
+/* compare all the dimensions in the selected order */
+int
+multi_sort_compare(const void *a, const void *b, void *arg)
+{
+	int i;
+	SortItem *ia = (SortItem*)a;
+	SortItem *ib = (SortItem*)b;
+
+	MultiSortSupport mss = (MultiSortSupport)arg;
+
+	for (i = 0; i < mss->ndims; i++)
+	{
+		int	compare;
+
+		compare = ApplySortComparator(ia->values[i], ia->isnull[i],
+									  ib->values[i], ib->isnull[i],
+									  &mss->ssup[i]);
+
+		if (compare != 0)
+			return compare;
+
+	}
+
+	/* equal by default */
+	return 0;
+}
+
+/* compare selected dimension */
+int
+multi_sort_compare_dim(int dim, const SortItem *a, const SortItem *b,
+					   MultiSortSupport mss)
+{
+	return ApplySortComparator(a->values[dim], a->isnull[dim],
+							   b->values[dim], b->isnull[dim],
+							   &mss->ssup[dim]);
+}
diff --git a/src/backend/utils/mvstats/common.h b/src/backend/utils/mvstats/common.h
index f511c4e..b98ceb7 100644
--- a/src/backend/utils/mvstats/common.h
+++ b/src/backend/utils/mvstats/common.h
@@ -59,6 +59,28 @@ typedef struct
 	int		   *tupnoLink;
 } CompareScalarsContext;
 
+/* multi-sort */
+typedef struct MultiSortSupportData {
+	int				ndims;		/* number of dimensions supported by the sort */
+	SortSupportData	ssup[1];	/* sort support data for each dimension */
+} MultiSortSupportData;
+
+typedef MultiSortSupportData* MultiSortSupport;
+
+typedef struct SortItem {
+	Datum  *values;
+	bool   *isnull;
+} SortItem;
+
+MultiSortSupport multi_sort_init(int ndims);
+
+void multi_sort_add_dimension(MultiSortSupport mss, int sortdim,
+							  int dim, VacAttrStats **vacattrstats);
+
+int multi_sort_compare(const void *a, const void *b, void *arg);
+
+int multi_sort_compare_dim(int dim, const SortItem *a,
+						   const SortItem *b, MultiSortSupport mss);
 
 VacAttrStats ** lookup_var_attr_stats(int2vector *attrs,
 									  int natts, VacAttrStats **vacattrstats);
diff --git a/src/backend/utils/mvstats/dependencies.c b/src/backend/utils/mvstats/dependencies.c
index b900efd..93a2fa6 100644
--- a/src/backend/utils/mvstats/dependencies.c
+++ b/src/backend/utils/mvstats/dependencies.c
@@ -15,6 +15,7 @@
  */
 
 #include "common.h"
+#include "utils/lsyscache.h"
 
 /*
  * Mine functional dependencies between columns, in the form (A => B),
@@ -56,6 +57,20 @@
  * columns on the 'left' side, i.e. a condition for the dependency.
  * That is dependencies [A,B] => C and so on.
  *
+ * TODO The implementation may/should be smart enough not to mine both
+ *      [A => B] and [A,C => B], because the second dependency is a
+ *      consequence of the first one (if values of A determine values
+ *      of B, adding another column won't change that). The ANALYZE
+ *      should first analyze 1:1 dependencies, then 2:1 dependencies
+ *      (and skip the already identified ones), etc.
+ *
+ * For example the dependency [city name => zip code] is much weaker
+ * than [city name, state name => zip code], because there may be
+ * multiple cities with the same name in various states. It's not
+ * perfect though - there are probably cities with the same name within
+ * the same state, but hopefully that is a relatively rare occurrence.
+ * More about this in the section about dependency mining.
+ *
  * Handling multiple columns on the right side is not necessary, as such
  * dependencies may be decomposed into a set of dependencies with
  * the same meaning, one for each column on the right side. For example
@@ -163,19 +178,61 @@
  * FIXME Not sure if this handles NULL values properly (not sure how to
  *       do that). We assume that NULL means 0 for now, handling it just
  *       like any other value.
+ *
+ * FIXME This builds a complete set of dependencies, i.e. including
+ *       transitive dependencies - if we identify [A => B] and [B => C],
+ *       we're likely to identify [A => C] too. It might be better to
+ *       keep only the minimal set of dependencies, i.e. prune all the
+ *       dependencies that we can recreate by transitivity.
+ *
+ *       There are two conceptual ways to do that:
+ *
+ *       (a) generate all the rules, and then prune the rules that may
+ *           be recreated by combining other dependencies, or
+ *
+ *       (b) perform the 'is this a combination of other dependencies'
+ *           check before actually doing the work
+ *
+ *       The second option has the advantage that we don't really need
+ *       to perform the sort/count. It's not sufficient alone, though,
+ *       because we may discover the dependencies in the wrong order.
+ *       For example [A => B], [A => C] and then [B => C]. None of those
+ *       dependencies is a combination of the already known ones, yet
+ *       [A => C] is a combination of [A => B] and [B => C].
+ *
+ * TODO Not sure the current NULL handling makes much sense. It's
+ *      handled like a regular value (NULL == NULL), so all NULLs in
+ *      a single column form a single group. Maybe that's not the right
+ *      thing to do, especially with equality conditions - in that case
+ *      NULLs are irrelevant. So maybe the right solution would be to
+ *      just ignore NULL values here?
+ *
+ *      However simply "ignoring" the NULL values does not seem like
+ *      a good idea - imagine columns A and B, where for each value of
+ *      A, values in B are constant (same for the whole group) or NULL.
+ *      Let's say only 10% of B values in each group are not NULL. Then
+ *      ignoring the NULL values will result in a 10x misestimate (and
+ *      it's trivial to construct arbitrarily large errors). So maybe handling
+ *      NULL values just like a regular value is the right thing here.
+ *
+ *      Or maybe NULL values should be treated differently on each side
+ *      of the dependency? E.g. as ignored on the left (condition) and
+ *      as regular values on the right - this seems consistent with how
+ *      equality clauses work, as an equality clause implies 'NOT NULL'.
+ *      So if we say [A => B] then it may also imply "NOT NULL" on the
+ *      right side.
  */
 MVDependencies
 build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs,
 					  int natts, VacAttrStats **vacattrstats)
 {
 	int i;
-	bool isNull;
-	Size len = 2 * sizeof(Datum);	/* only simple associations a => b */
 	int numattrs = attrs->dim1;
 
 	/* result */
 	int ndeps = 0;
 	MVDependencies	dependencies = NULL;
+	MultiSortSupport mss = multi_sort_init(2);	/* 2 dimensions for now */
 
 	/* TODO Maybe this should be somehow related to the number of
 	 *      distinct columns in the two columns we're currently analyzing.
@@ -195,8 +252,24 @@ build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs,
 	 */
 	VacAttrStats **stats = lookup_var_attr_stats(attrs, natts, vacattrstats);
 
-	/* We'll reuse the same array for all the combinations */
-	Datum * values = (Datum*)palloc0(numrows * 2 * sizeof(Datum));
+	/*
+	 * We'll reuse the same array for all the 2-column combinations.
+	 *
+	 * It's possible to sort the sample rows directly, but this seemed
+	 * somewhat simpler / less error-prone. Another option would be to
+	 * allocate the arrays for each SortItem separately, but that'd be
+	 * significant overhead (not just CPU, but especially memory bloat).
+	 */
+	SortItem * items = (SortItem*)palloc0(numrows * sizeof(SortItem));
+
+	Datum *values = (Datum*)palloc0(sizeof(Datum) * numrows * 2);
+	bool  *isnull = (bool*)palloc0(sizeof(bool) * numrows * 2);
+
+	for (i = 0; i < numrows; i++)
+	{
+		items[i].values = &values[i * 2];
+		items[i].isnull = &isnull[i * 2];
+	}
 
 	Assert(numattrs >= 2);
 
@@ -213,9 +286,12 @@ build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs,
 	 */
 	for (dima = 0; dima < numattrs; dima++)
 	{
+		/* prepare the sort function for the first dimension */
+		multi_sort_add_dimension(mss, 0, dima, vacattrstats);
+
 		for (dimb = 0; dimb < numattrs; dimb++)
 		{
-			Datum val_a, val_b;
+			SortItem current;
 
 			/* number of groups supporting / contradicting the dependency */
 			int n_supporting = 0;
@@ -232,14 +308,27 @@ build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs,
 			if (dima == dimb)
 				continue;
 
+			/* prepare the sort function for the second dimension */
+			multi_sort_add_dimension(mss, 1, dimb, vacattrstats);
+
+			/* reset the values and isnull flags */
+			memset(values, 0, sizeof(Datum) * numrows * 2);
+			memset(isnull, 0, sizeof(bool)  * numrows * 2);
+
 			/* accumulate all the data for both columns into an array and sort it */
 			for (i = 0; i < numrows; i++)
 			{
-				values[i*2]   = heap_getattr(rows[i], attrs->values[dima], stats[dima]->tupDesc, &isNull);
-				values[i*2+1] = heap_getattr(rows[i], attrs->values[dimb], stats[dimb]->tupDesc, &isNull);
+				items[i].values[0]
+					= heap_getattr(rows[i], attrs->values[dima],
+									stats[dima]->tupDesc, &items[i].isnull[0]);
+
+				items[i].values[1]
+					= heap_getattr(rows[i], attrs->values[dimb],
+									stats[dimb]->tupDesc, &items[i].isnull[1]);
 			}
 
-			qsort_arg((void *) values, numrows, sizeof(Datum) * 2, compare_scalars_memcmp, &len);
+			qsort_arg((void *) items, numrows, sizeof(SortItem),
+					  multi_sort_compare, mss);
 
 			/*
 			 * Walk through the array, split it into rows according to
@@ -254,13 +343,13 @@ build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs,
 			 */
 
 			/* start with values from the first row */
-			val_a = values[0];
-			val_b = values[1];
+			current = items[0];
 			group_size  = 1;
 
 			for (i = 1; i < numrows; i++)
 			{
-				if (values[2*i] != val_a)	/* end of the group */
+				/* end of the group */
+				if (multi_sort_compare_dim(0, &items[i], &current, mss) != 0)
 				{
 					/*
 					 * If there are no contradicting rows, count it as
@@ -271,36 +360,49 @@ build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs,
 					 * impossible to identify [unique,unique] cases, but
 					 * that's probably a different case. This is more
 					 * about [zip => city] associations etc.
+					 *
+					 * If there are violations, count the group/rows as
+					 * a violation.
+					 *
+					 * It may be neither, if the group is too small (does
+					 * not contain at least min_group_size rows).
 					 */
-					n_supporting += ((n_violations == 0) && (group_size >= min_group_size)) ? 1 : 0;
-					n_contradicting += (n_violations != 0) ? 1 : 0;
-
-					n_supporting_rows += ((n_violations == 0) && (group_size >= min_group_size)) ? group_size : 0;
-					n_contradicting_rows += (n_violations > 0) ? group_size : 0;
+					if ((n_violations == 0) && (group_size >= min_group_size))
+					{
+						n_supporting +=  1;
+						n_supporting_rows += group_size;
+					}
+					else if (n_violations > 0)
+					{
+						n_contradicting +=  1;
+						n_contradicting_rows += group_size;
+					}
 
 					/* current values start a new group */
-					val_a = values[2*i];
-					val_b = values[2*i+1];
 					n_violations = 0;
-					group_size = 1;
+					group_size = 0;
 				}
-				else
+				/* mismatch of a B value is contradicting */
+				else if (multi_sort_compare_dim(1, &items[i], &current, mss) != 0)
 				{
-					if (values[2*i+1] != val_b)	/* mismatch of a B value is contradicting */
-					{
-						val_b = values[2*i+1];
-						n_violations += 1;
-					}
-
-					group_size += 1;
+					n_violations += 1;
 				}
+
+				current = items[i];
+				group_size += 1;
 			}
 
-			/* handle the last group */
-			n_supporting += ((n_violations == 0) && (group_size >= min_group_size)) ? 1 : 0;
-			n_contradicting += (n_violations != 0) ? 1 : 0;
-			n_supporting_rows += ((n_violations == 0) && (group_size >= min_group_size)) ? group_size : 0;
-			n_contradicting_rows += (n_violations > 0) ? group_size : 0;
+			/* handle the last group (just like above) */
+			if ((n_violations == 0) && (group_size >= min_group_size))
+			{
+				n_supporting += 1;
+				n_supporting_rows += group_size;
+			}
+			else if (n_violations)
+			{
+				n_contradicting += 1;
+				n_contradicting_rows += group_size;
+			}
 
 			/*
 			 * See if the number of rows supporting the association is at least
@@ -338,7 +440,11 @@ build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs,
 		}
 	}
 
+	pfree(items);
 	pfree(values);
+	pfree(isnull);
+	pfree(stats);
+	pfree(mss);
 
 	return dependencies;
 }
diff --git a/src/include/utils/mvstats.h b/src/include/utils/mvstats.h
index 2b59c2d..a074253 100644
--- a/src/include/utils/mvstats.h
+++ b/src/include/utils/mvstats.h
@@ -18,24 +18,34 @@
 
 /*
  * Basic info about the stats, used when choosing what to use
- * 
- * TODO Add info about what statistics is available (histogram, MCV,
- *      hashed MCV, functional dependencies).
  */
 typedef struct MVStatsData {
 	Oid			mvoid;		/* OID of the stats in pg_mv_statistic */
 	int2vector *stakeys;	/* attnums for columns in the stats */
+
+	/* statistics requested in ALTER TABLE ... ADD STATISTICS */
+	bool		deps_enabled;	/* analyze functional dependencies */
+
+	/* available statistics (computed by ANALYZE) */
 	bool		deps_built;	/* functional dependencies available */
 } MVStatsData;
 
 typedef struct MVStatsData *MVStats;
 
+/*
+ * Degree of how much MCV item / histogram bucket matches a clause.
+ * This is then considered when computing the selectivity.
+ */
+#define MVSTATS_MATCH_NONE		0		/* no match at all */
+#define MVSTATS_MATCH_PARTIAL	1		/* partial match */
+#define MVSTATS_MATCH_FULL		2		/* full match */
 
 #define MVSTATS_MAX_DIMENSIONS	8		/* max number of attributes */
 
-/* An associative rule, tracking [a => b] dependency.
- *
- * TODO Make this work with multiple columns on both sides.
+
+/*
+ * Functional dependencies, tracking column-level relationships (values
+ * in one column determine values in another one).
  */
 typedef struct MVDependencyData {
 	int16	a;
@@ -61,6 +71,7 @@ typedef MVDependenciesData* MVDependencies;
  *      stats specified using flags (or something like that).
  */
 MVStats list_mv_stats(Oid relid, int *nstats, bool built_only);
+bytea * fetch_mv_rules(Oid mvoid);
 
 bytea * fetch_mv_dependencies(Oid mvoid);
 
diff --git a/src/test/regress/expected/mv_dependencies.out b/src/test/regress/expected/mv_dependencies.out
new file mode 100644
index 0000000..dbfb5cf
--- /dev/null
+++ b/src/test/regress/expected/mv_dependencies.out
@@ -0,0 +1,175 @@
+-- data type passed by value
+CREATE TABLE functional_dependencies (
+    a INT,
+    b INT,
+    c INT
+);
+-- unknown column
+ALTER TABLE functional_dependencies ADD STATISTICS (dependencies) ON (unknown_column);
+ERROR:  column "unknown_column" referenced in statistics does not exist
+-- single column
+ALTER TABLE functional_dependencies ADD STATISTICS (dependencies) ON (a);
+ERROR:  multivariate stats require 2 or more columns
+-- single column, duplicated
+ALTER TABLE functional_dependencies ADD STATISTICS (dependencies) ON (a, a);
+ERROR:  duplicate column name in statistics definition
+-- two columns, one duplicated
+ALTER TABLE functional_dependencies ADD STATISTICS (dependencies) ON (a, a, b);
+ERROR:  duplicate column name in statistics definition
+-- unknown option
+ALTER TABLE functional_dependencies ADD STATISTICS (unknown_option) ON (a, b, c);
+ERROR:  unrecognized STATISTICS option "unknown_option"
+-- correct command
+ALTER TABLE functional_dependencies ADD STATISTICS (dependencies) ON (a, b, c);
+-- random data (no functional dependencies)
+INSERT INTO functional_dependencies
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built | pg_mv_stats_dependencies_show 
+--------------+------------+-------------------------------
+ t            | f          | 
+(1 row)
+
+TRUNCATE functional_dependencies;  
+-- a => b, a => c, b => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built | pg_mv_stats_dependencies_show 
+--------------+------------+-------------------------------
+ t            | t          | 1 => 2, 1 => 3, 2 => 3
+(1 row)
+
+TRUNCATE functional_dependencies;  
+-- a => b, a => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built | pg_mv_stats_dependencies_show 
+--------------+------------+-------------------------------
+ t            | t          | 1 => 2, 1 => 3
+(1 row)
+
+TRUNCATE functional_dependencies;  
+-- check explain (expect bitmap index scan, not plain index scan)
+INSERT INTO functional_dependencies 
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+CREATE INDEX fdeps_idx ON functional_dependencies (a, b);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built | pg_mv_stats_dependencies_show 
+--------------+------------+-------------------------------
+ t            | t          | 1 => 2, 1 => 3, 2 => 3
+(1 row)
+
+EXPLAIN (COSTS off)
+ SELECT * FROM functional_dependencies WHERE a = 10 AND b = 5;
+                 QUERY PLAN                  
+---------------------------------------------
+ Bitmap Heap Scan on functional_dependencies
+   Recheck Cond: ((a = 10) AND (b = 5))
+   ->  Bitmap Index Scan on fdeps_idx
+         Index Cond: ((a = 10) AND (b = 5))
+(4 rows)
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+DROP TABLE functional_dependencies;
+-- varlena type (text)
+CREATE TABLE functional_dependencies (
+    a TEXT,
+    b TEXT,
+    c TEXT
+);
+ALTER TABLE functional_dependencies ADD STATISTICS (dependencies) ON (a, b, c);
+-- random data (no functional dependencies)
+INSERT INTO functional_dependencies
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built | pg_mv_stats_dependencies_show 
+--------------+------------+-------------------------------
+ t            | f          | 
+(1 row)
+
+TRUNCATE functional_dependencies;  
+-- a => b, a => c, b => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built | pg_mv_stats_dependencies_show 
+--------------+------------+-------------------------------
+ t            | t          | 1 => 2, 1 => 3, 2 => 3
+(1 row)
+
+TRUNCATE functional_dependencies;  
+-- a => b, a => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built | pg_mv_stats_dependencies_show 
+--------------+------------+-------------------------------
+ t            | t          | 1 => 2, 1 => 3
+(1 row)
+
+TRUNCATE functional_dependencies;  
+-- check explain (expect bitmap index scan, not plain index scan)
+INSERT INTO functional_dependencies 
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+CREATE INDEX fdeps_idx ON functional_dependencies (a, b);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built | pg_mv_stats_dependencies_show 
+--------------+------------+-------------------------------
+ t            | t          | 1 => 2, 1 => 3, 2 => 3
+(1 row)
+
+EXPLAIN (COSTS off)
+ SELECT * FROM functional_dependencies WHERE a = '10' AND b = '5';
+                         QUERY PLAN                         
+------------------------------------------------------------
+ Bitmap Heap Scan on functional_dependencies
+   Recheck Cond: ((a = '10'::text) AND (b = '5'::text))
+   ->  Bitmap Index Scan on fdeps_idx
+         Index Cond: ((a = '10'::text) AND (b = '5'::text))
+(4 rows)
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+DROP TABLE functional_dependencies;
+-- NULL values (mix of int and text columns)
+CREATE TABLE functional_dependencies (
+    a INT,
+    b TEXT,
+    c INT,
+    d TEXT
+);
+ALTER TABLE functional_dependencies ADD STATISTICS (dependencies) ON (a, b, c, d);
+INSERT INTO functional_dependencies
+     SELECT
+         mod(i, 100),
+         (CASE WHEN mod(i, 200) = 0 THEN NULL ELSE mod(i,200) END),
+         mod(i, 400),
+         (CASE WHEN mod(i, 300) = 0 THEN NULL ELSE mod(i,600) END)
+     FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+ deps_enabled | deps_built |     pg_mv_stats_dependencies_show      
+--------------+------------+----------------------------------------
+ t            | t          | 2 => 1, 3 => 1, 3 => 2, 4 => 1, 4 => 2
+(1 row)
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+DROP TABLE functional_dependencies;
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index e0ae2f2..c41762c 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -109,3 +109,6 @@ test: event_trigger
 
 # run stats by itself because its delay may be insufficient under heavy load
 test: stats
+
+# run tests of multivariate stats
+test: mv_dependencies
diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule
index 7f762bd..3845b0f 100644
--- a/src/test/regress/serial_schedule
+++ b/src/test/regress/serial_schedule
@@ -152,3 +152,4 @@ test: with
 test: xml
 test: event_trigger
 test: stats
+test: mv_dependencies
diff --git a/src/test/regress/sql/mv_dependencies.sql b/src/test/regress/sql/mv_dependencies.sql
new file mode 100644
index 0000000..5d1ad52
--- /dev/null
+++ b/src/test/regress/sql/mv_dependencies.sql
@@ -0,0 +1,153 @@
+-- data type passed by value
+CREATE TABLE functional_dependencies (
+    a INT,
+    b INT,
+    c INT
+);
+
+-- unknown column
+ALTER TABLE functional_dependencies ADD STATISTICS (dependencies) ON (unknown_column);
+
+-- single column
+ALTER TABLE functional_dependencies ADD STATISTICS (dependencies) ON (a);
+
+-- single column, duplicated
+ALTER TABLE functional_dependencies ADD STATISTICS (dependencies) ON (a, a);
+
+-- two columns, one duplicated
+ALTER TABLE functional_dependencies ADD STATISTICS (dependencies) ON (a, a, b);
+
+-- unknown option
+ALTER TABLE functional_dependencies ADD STATISTICS (unknown_option) ON (a, b, c);
+
+-- correct command
+ALTER TABLE functional_dependencies ADD STATISTICS (dependencies) ON (a, b, c);
+
+-- random data (no functional dependencies)
+INSERT INTO functional_dependencies
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+TRUNCATE functional_dependencies;  
+
+-- a => b, a => c, b => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+TRUNCATE functional_dependencies;  
+
+-- a => b, a => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+TRUNCATE functional_dependencies;  
+
+-- check explain (expect bitmap index scan, not plain index scan)
+INSERT INTO functional_dependencies 
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+CREATE INDEX fdeps_idx ON functional_dependencies (a, b);
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+EXPLAIN (COSTS off)
+ SELECT * FROM functional_dependencies WHERE a = 10 AND b = 5;
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+DROP TABLE functional_dependencies;
+
+-- varlena type (text)
+CREATE TABLE functional_dependencies (
+    a TEXT,
+    b TEXT,
+    c TEXT
+);
+
+ALTER TABLE functional_dependencies ADD STATISTICS (dependencies) ON (a, b, c);
+
+-- random data (no functional dependencies)
+INSERT INTO functional_dependencies
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+TRUNCATE functional_dependencies;  
+
+-- a => b, a => c, b => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+TRUNCATE functional_dependencies;  
+
+-- a => b, a => c
+INSERT INTO functional_dependencies
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+TRUNCATE functional_dependencies;  
+
+-- check explain (expect bitmap index scan, not plain index scan)
+INSERT INTO functional_dependencies 
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+CREATE INDEX fdeps_idx ON functional_dependencies (a, b);
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+EXPLAIN (COSTS off)
+ SELECT * FROM functional_dependencies WHERE a = '10' AND b = '5';
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+DROP TABLE functional_dependencies;
+
+-- NULL values (mix of int and text columns)
+CREATE TABLE functional_dependencies (
+    a INT,
+    b TEXT,
+    c INT,
+    d TEXT
+);
+
+ALTER TABLE functional_dependencies ADD STATISTICS (dependencies) ON (a, b, c, d);
+
+INSERT INTO functional_dependencies
+     SELECT
+         mod(i, 100),
+         (CASE WHEN mod(i, 200) = 0 THEN NULL ELSE mod(i,200) END),
+         mod(i, 400),
+         (CASE WHEN mod(i, 300) = 0 THEN NULL ELSE mod(i,600) END)
+     FROM generate_series(1,10000) s(i);
+
+ANALYZE functional_dependencies;
+
+SELECT deps_enabled, deps_built, pg_mv_stats_dependencies_show(stadeps)
+  FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'functional_dependencies'::regclass;
+DROP TABLE functional_dependencies;
-- 
2.0.5

