From d6d169988a8ccef1e41e9620599bdfc83a192433 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tv@fuzzy.cz>
Date: Sun, 11 Jan 2015 20:15:37 +0100
Subject: [PATCH 3/4] multivariate MCV lists

- extends the pg_mv_statistic catalog (adds 'mcv' fields)
- builds MCV lists during ANALYZE
- simple estimation while planning queries

FIX: don't build MCV list by default
FIX: analyze MCV lists only when requested
FIX: improvements in clauselist_mv_selectivity_mcvlist()
FIX: comment about upper selectivity boundaries for MCV lists
FIX: switch MCV build to multi_sort functions (from dependencies)

This adds support for proper sorting (per data type), arbitrary
data types (as long as they have a '<' operator) and NULL values
when building MCV lists. It still needs a fair amount of love, and
does nothing about serializing/deserializing the MCV lists.

FIX: comment about using max_mcv_items (ADD STATISTICS option)
FIX: initial support for all data types and NULL in MCV lists

This changes serialize_mcvlist/update_mv_stats in a somewhat strange
way (passing VacAttrStats all over the place). This needs to be
improved, somehow, before rebasing into the MCV part. Otherwise it'll
cause needless conflicts.

FIX: fixed MCV build / removed debugging WARNING log message
FIX: refactoring lookup_var_attr_stats() - moving to common.c, static

This only includes changes in the common part + functional dependencies.

FIX: refactoring lookup_var_attr_stats() / MCV lists
FIX: a set of regression tests for MCV lists

This is mostly equivalent to a combination of all the regression
tests for functional dependencies.

One of the tests (EXPLAIN with TEXT columns) currently fails,
producing an Index Scan instead of a Bitmap Index Scan. Will investigate.

FIX: comment about memory corruption in deserializing MCV list
FIX: correct MCV spelling in a few places (MVC -> MCV)
FIX: get rid of the custom comparators in mcv.c
FIX: use USE_ASSERT_CHECKING for assert-only variable (MCV)
FIX: check 'mcv' and 'max_mcv_items' options in ADD STATISTICS
FIX: proper handling of the 'max_mcv_items' option (constants etc.)
FIX: check that either dependencies or MCV were requested
FIX: improved comments / docs for MCV lists
FIX: move DimensionInfo to common.h
FIX: move MCV list definitions after functional dependencies
FIX: incorrect memcpy() when building MCV list, causing segfaults
FIX: replace variables by macros in MCV serialize/deserialize
FIX: rework clauselist_mv_split() to call clause_is_mv_compatible()

The previous version mostly duplicated the code, making it difficult
to add more clause types etc.

FIX: fixed estimation of equality clauses using MCV lists
FIX: add 'IS [NOT] NULL' support to MCV lists
FIX: add regression test for ADD STATISTICS options (MCV list)
FIX: added regression test for IS [NOT] NULL with MCV lists
FIX: updated comments in clausesel.c (mcv)
FIX: obsolete Assert in mcv code (indexes -> ITEM_INDEXES)
FIX: make regression tests parallel-happy (MCV lists)
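
For reference, a rough usage sketch. The exact ADD STATISTICS syntax
below is only an approximation based on the option names handled in
ATExecAddStatistics() - see src/test/regress/sql/mv_mcv.sql for the
authoritative form:

    CREATE TABLE t (a INT, b INT);

    -- build an MCV list on (a,b); max_mcv_items has to fall between
    -- MVSTAT_MCVLIST_MIN_ITEMS and MVSTAT_MCVLIST_MAX_ITEMS
    ALTER TABLE t ADD STATISTICS (mcv true, max_mcv_items 1024) ON (a, b);
    ANALYZE t;

    -- inspect the serialized MCV list through the pg_mv_stats view
    SELECT tablename, attnums, mcvbytes, mcvinfo FROM pg_mv_stats;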
---
 src/backend/catalog/system_views.sql     |    4 +-
 src/backend/commands/tablecmds.c         |   47 +-
 src/backend/optimizer/path/clausesel.c   |  788 ++++++++++++++++++++++-
 src/backend/utils/mvstats/Makefile       |    2 +-
 src/backend/utils/mvstats/common.c       |   65 +-
 src/backend/utils/mvstats/common.h       |   12 +-
 src/backend/utils/mvstats/dependencies.c |   13 +-
 src/backend/utils/mvstats/mcv.c          | 1002 ++++++++++++++++++++++++++++++
 src/include/catalog/pg_mv_statistic.h    |   18 +-
 src/include/catalog/pg_proc.h            |    2 +
 src/include/utils/mvstats.h              |   68 +-
 src/test/regress/expected/mv_mcv.out     |  210 +++++++
 src/test/regress/expected/rules.out      |    4 +-
 src/test/regress/parallel_schedule       |    2 +-
 src/test/regress/serial_schedule         |    1 +
 src/test/regress/sql/mv_mcv.sql          |  181 ++++++
 16 files changed, 2370 insertions(+), 49 deletions(-)
 create mode 100644 src/backend/utils/mvstats/mcv.c
 create mode 100644 src/test/regress/expected/mv_mcv.out
 create mode 100644 src/test/regress/sql/mv_mcv.sql

diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index da957fc..8acf160 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -158,7 +158,9 @@ CREATE VIEW pg_mv_stats AS
         C.relname AS tablename,
         S.stakeys AS attnums,
         length(S.stadeps) as depsbytes,
-        pg_mv_stats_dependencies_info(S.stadeps) as depsinfo
+        pg_mv_stats_dependencies_info(S.stadeps) as depsinfo,
+        length(S.stamcv) AS mcvbytes,
+        pg_mv_stats_mcvlist_info(S.stamcv) AS mcvinfo
     FROM (pg_mv_statistic S JOIN pg_class C ON (C.oid = S.starelid))
         LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace);
 
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 3c82b89..1f08c1c 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -11651,7 +11651,13 @@ static void ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
 	Relation	mvstatrel;
 
 	/* by default build everything */
-	bool 	build_dependencies = false;
+	bool 	build_dependencies = false,
+			build_mcv = false;
+
+	int32 	max_mcv_items = -1;
+
+	/* options required because of other options */
+	bool	require_mcv = false;
 
 	Assert(IsA(def, StatisticsDef));
 
@@ -11706,6 +11712,29 @@ static void ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
 
 		if (strcmp(opt->defname, "dependencies") == 0)
 			build_dependencies = defGetBoolean(opt);
+		else if (strcmp(opt->defname, "mcv") == 0)
+			build_mcv = defGetBoolean(opt);
+		else if (strcmp(opt->defname, "max_mcv_items") == 0)
+		{
+			max_mcv_items = defGetInt32(opt);
+
+			/* this option requires 'mcv' to be enabled */
+			require_mcv = true;
+
+			/* sanity check */
+			if (max_mcv_items < MVSTAT_MCVLIST_MIN_ITEMS)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("max number of MCV items must be at least %d",
+								MVSTAT_MCVLIST_MIN_ITEMS)));
+
+			else if (max_mcv_items > MVSTAT_MCVLIST_MAX_ITEMS)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("number of MCV items cannot exceed %d",
+								MVSTAT_MCVLIST_MAX_ITEMS)));
+
+		}
 		else
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
@@ -11714,10 +11743,16 @@ static void ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
 	}
 
 	/* check that at least some statistics were requested */
-	if (! build_dependencies)
+	if (! (build_dependencies || build_mcv))
 		ereport(ERROR,
 				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("no statistics type (dependencies) was requested")));
+				 errmsg("no statistics type (dependencies, mcv) was requested")));
+
+	/* now do some checking of the options */
+	if (require_mcv && (! build_mcv))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("option 'mcv' is required by other option(s)")));
 
 	/* sort the attnums and build int2vector */
 	qsort(attnums, numcols, sizeof(int16), compare_int16);
@@ -11733,9 +11768,13 @@ static void ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
 	values[Anum_pg_mv_statistic_starelid-1] = ObjectIdGetDatum(RelationGetRelid(rel));
 
 	values[Anum_pg_mv_statistic_stakeys -1] = PointerGetDatum(stakeys);
+
 	values[Anum_pg_mv_statistic_deps_enabled -1] = BoolGetDatum(build_dependencies);
+	values[Anum_pg_mv_statistic_mcv_enabled   -1] = BoolGetDatum(build_mcv);
+	values[Anum_pg_mv_statistic_mcv_max_items    -1] = Int32GetDatum(max_mcv_items);
 
-	nulls[Anum_pg_mv_statistic_stadeps -1] = true;
+	nulls[Anum_pg_mv_statistic_stadeps -1]  = true;
+	nulls[Anum_pg_mv_statistic_stamcv   -1] = true;
 
 	/* insert the tuple into pg_mv_statistic */
 	mvstatrel = heap_open(MvStatisticRelationId, RowExclusiveLock);
diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c
index 36e5bce..1446fa0 100644
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -59,6 +59,18 @@ static List *clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
 								Oid varRelid, int nmvstats, MVStats mvstats,
 								SpecialJoinInfo *sjinfo);
 
+static int choose_mv_statistics(int nmvstats, MVStats mvstats,
+								Bitmapset *attnums);
+static List *clauselist_mv_split(PlannerInfo *root, SpecialJoinInfo *sjinfo,
+								 List *clauses, Oid varRelid,
+								 List **mvclauses, MVStats mvstats);
+
+static Selectivity clauselist_mv_selectivity(PlannerInfo *root,
+						List *clauses, MVStats mvstats);
+static Selectivity clauselist_mv_selectivity_mcvlist(PlannerInfo *root,
+									List *clauses, MVStats mvstats,
+									bool *fullmatch, Selectivity *lowsel);
+
 /****************************************************************************
  *		ROUTINES TO COMPUTE SELECTIVITIES
  ****************************************************************************/
@@ -195,8 +207,8 @@ clauselist_selectivity(PlannerInfo *root,
 	Bitmapset  *mvattnums = NULL;
 
 	/*
-	 * If there's exactly one clause, then no use in trying to match up pairs,
-	 * so just go directly to clause_selectivity().
+	 * If there's exactly one clause, then no use in trying to match up
+	 * pairs, so just go directly to clause_selectivity().
 	 */
 	if (list_length(clauses) == 1)
 		return clause_selectivity(root, (Node *) linitial(clauses),
@@ -222,6 +234,46 @@ clauselist_selectivity(PlannerInfo *root,
 		/* reduce clauses by applying functional dependencies rules */
 		clauses = clauselist_apply_dependencies(root, clauses, varRelid,
 												nmvstats, mvstats, sjinfo);
+
+		/*
+		 * recollect attributes from mv-compatible clauses (maybe we've
+		 * recollect attributes from the mv-compatible clauses (we may have
+		 * removed so many clauses that only a single mv-compatible attnum
+		 * remains)
+		mvattnums = collect_mv_attnums(root, clauses, varRelid, &relid, sjinfo);
+	}
+
+	/*
+	 * If there still are at least two columns, we'll try to select
+	 * suitable multivariate statistics.
+	 */
+	if (bms_num_members(mvattnums) >= 2)
+	{
+		/* fetch info from the catalog (not the serialized stats yet) */
+		mvstats = list_mv_stats(relid, &nmvstats, true);
+
+		/* see choose_mv_statistics() for details */
+		if (nmvstats > 0)
+		{
+			int idx = choose_mv_statistics(nmvstats, mvstats, mvattnums);
+
+			if (idx >= 0)	/* we found matching stats */
+			{
+				MVStats mvstat = &mvstats[idx];
+
+				/* clauses compatible with multi-variate stats */
+				List	*mvclauses = NIL;
+
+				/* split the clauselist into regular and mv-clauses */
+				clauses = clauselist_mv_split(root, sjinfo, clauses,
+										varRelid, &mvclauses, mvstat);
+
+				/* we've chosen the statistics matching the clauses */
+				Assert(mvclauses != NIL);
+
+				/* compute the multivariate stats */
+				s1 *= clauselist_mv_selectivity(root, mvclauses, mvstat);
+			}
+		}
 	}
 
 	/*
@@ -899,6 +951,192 @@ clause_selectivity(PlannerInfo *root,
 	return s1;
 }
 
+
+/*
+ * Estimate selectivity for the list of MV-compatible clauses, using the
+ * chosen multivariate statistics.
+ *
+ * When we hit a single bucket, we don't know what portion of it actually
+ * matches the clauses (e.g. equality), and we use 1/2 the bucket by
+ * default. However, the MV histograms are usually less detailed than
+ * the per-column ones, meaning the sum of buckets is often quite high
+ * (thanks to combining a lot of "partially hit" buckets).
+ *
+ * There are several ways to improve this, usually with cases when it
+ * won't really help. Also, the more complex the process, the worse
+ * the failures (i.e. misestimates).
+ *
+ * (1) Use the MV histogram only as a way to combine multiple
+ *     per-column histograms, essentially rewriting
+ *
+ *       P(A & B) = P(A) * P(B|A)
+ *
+ *     where P(B|A) may be computed using a proper "slice" of the
+ *     histogram, by first selecting only buckets where A is true, and
+ *     then using the boundaries to 'restrict' the per-column histogram.
+ *
+ *     With more clauses, it gets more complicated, of course
+ *
+ *       P(A & B & C) = P(A & C) * P(B|A & C)
+ *                    = P(A) * P(C|A) * P(B|A & C)
+ *
+ *     and so on.
+ * 
+ *     Of course, the question is how well and efficiently we can
+ *     compute the conditional probabilities - whether this approach
+ *     can improve the estimates (instead of amplifying the errors).
+ *
+ *     Also, this does not eliminate the need for histogram on [A,B,C].
+ *
+ * (2) Use multiple smaller (and more accurate) histograms, and combine
+ *     them using a process similar to the above. E.g. by assuming that
+ *     B and C are independent, we can rewrite
+ *
+ *       P(B|A & C) = P(B|A)
+ * 
+ *     so we can rewrite the whole formula to
+ * 
+ *       P(A & B & C) = P(A) * P(C|A) * P(B|A)
+ * 
+ *     and we're OK with two 2D histograms [A,C] and [A,B].
+ *
+ *     It'd be nice to perform some sort of statistical test (Fisher
+ *     or another chi-squared test) to identify independent components
+ *     and automatically separate them into smaller histograms.
+ *
+ * (3) Using the estimated number of distinct values in a bucket to
+ *     decide the selectivity of equality in the bucket (instead of
+ *     blindly using 1/2 of the bucket, we may use 1/ndistinct).
+ *     Of course, if the ndistinct estimate is way off, or when the
+ *     distribution is not uniform (one distinct item gets many more
+ *     rows than the others), this will fail. Also, we don't have an
+ *     ndistinct estimate available at this moment (but it shouldn't be that
+ *     difficult to compute as ndistinct and ntuples should be available).
+ *
+ * TODO Clamp the selectivity by min of the per-clause selectivities
+ *      (i.e. the selectivity of the most restrictive clause), because
+ *      that's the maximum we can ever get from ANDed list of clauses.
+ *      This may probably prevent issues with hitting too many buckets
+ *      and low precision histograms.
+ *
+ * TODO We may support some additional conditions, most importantly
+ *      those matching multiple columns (e.g. "a = b" or "a < b").
+ *      Ultimately we could track multi-table histograms for join
+ *      cardinality estimation.
+ *
+ * TODO Currently this is only estimating all clauses, or clauses
+ *      matching varRelid (when it's not 0). I'm not sure what the
+ *      purpose of varRelid is, but my assumption is that it's used for
+ *      join conditions and such. In that case we can use those clauses
+ *      to restrict the other (i.e. filter the histogram buckets first,
+ *      before estimating the other clauses). This is essentially equal
+ *      to computing P(A|B) where "B" are the clauses not matching the
+ *      varRelid.
+ * 
+ * TODO Further thoughts on processing equality clauses - maybe it'd be
+ *      better to look for stats (with MCV) covered by the equality
+ *      clauses, because then we have a chance to find an exact match
+ *      in the MCV list, which is pretty much the best we can do. We may
+ *      also look at the least frequent MCV item, and use it as an upper
+ *      boundary for the selectivity (had there been a more frequent
+ *      item, it'd be in the MCV list).
+ *
+ *      These conditions may then be used as a condition for the other
+ *      selectivities, i.e. we may estimate P(A,B) first, and then
+ *      compute P(C|A,B) from another histogram. This may be useful when
+ *      we can estimate P(A,B) accurately (e.g. because it's a complete
+ *      equality match evaluated on MCV list), and then compute the
+ *      conditional probability P(C|A,B), giving us the requested stats
+ *
+ *          P(A,B,C) = P(A,B) * P(C|A,B)
+ *
+ * TODO There are several options for 'sanity clamping' the estimates.
+ *
+ *      First, if we have selectivities for each condition, then
+ * 
+ *          P(A,B) <= MIN(P(A), P(B))
+ *
+ *      Because additional conditions (connected by AND) can only lower
+ *      the probability.
+ *
+ *      So we can do some basic sanity checks using the single-variate
+ *      stats (the ones we have right now).
+ *
+ *      Second, when we have multivariate stats with a MCV list, then
+ *
+ *      (a) if we have a full equality condition (one equality condition
+ *          on each column) and we found a match in the MCV list, this is
+ *          the selectivity (and it's supposed to be exact)
+ *
+ *      (b) if we have a full equality condition and we haven't found a
+ *          match in the MCV list, then the selectivity is below the
+ *          lowest selectivity in the MCV list
+ *
+ *      (c) if we have an equality condition (not full), we can still
+ *          search the MCV for matches and use the sum of probabilities
+ *          as a lower boundary for the histogram (if there are no
+ *          matches in the MCV list, then we have no boundary)
+ *
+ *      Third, if there are multiple multivariate stats for a set of
+ *      clauses, we may compute all of them and then somehow aggregate
+ *      them - e.g. by choosing the minimum, median or average. The
+ *      multi-variate stats are susceptible to overestimation (because
+ *      we take 50% of the bucket for partial matches). Some stats may
+ *      give better estimates than others, but it's very difficult to
+ *      determine in advance which one is the best (it depends
+ *      on the number of buckets, number of additional columns not
+ *      referenced in the clauses etc.) so we may compute all and then
+ *      choose a sane aggregation (minimum seems like a good approach).
+ *      Of course, this may result in longer / more expensive estimation
+ *      (CPU-wise), but it may be worth it.
+ *
+ *      There are ways to address this, though. First, it's possible to
+ *      add a GUC choosing between a 'simple' estimate (using the single
+ *      statistics expected to give the best estimate) and a 'full' one
+ *      (combining the multiple estimates).
+ * 
+ *          multivariate_estimates = (simple|full)
+ *
+ *      Also, this might be enabled at a table level, by something like
+ * 
+ *          ALTER TABLE ... SET STATISTICS (simple|full)
+ * 
+ *      Which would make it possible to use this only for the tables
+ *      where the simple approach does not work.
+ * 
+ *      Also, there are ways to optimize this algorithmically. E.g. we
+ *      may try to get an estimate from a matching MCV list first, and
+ *      if we happen to get a "full equality match" we may stop computing
+ *      the estimates from other stats (for this condition) because
+ *      that's probably the best estimate we can really get.
+ *
+ * TODO When applying the clauses to the histogram/MCV list, we can do
+ *      that from the most selective clauses first, because that'll
+ *      eliminate the buckets/items sooner (so we'll be able to skip
+ *      them without inspection, which is more expensive).
+ */
+static Selectivity
+clauselist_mv_selectivity(PlannerInfo *root, List *clauses, MVStats mvstats)
+{
+	bool fullmatch = false;
+
+	/*
+	 * Lowest frequency in the MCV list (may be used as an upper bound
+	 * for full equality conditions that did not match any MCV item).
+	 */
+	Selectivity mcv_low = 0.0;
+
+	/* TODO Evaluate simple 1D selectivities, use the smallest one as
+	 *      an upper bound, product as lower bound, and sort the
+	 *      clauses in ascending order by selectivity (to optimize the
+	 *      MCV/histogram evaluation). 
+	 */
+
+	/* Evaluate the MCV selectivity */
+	return clauselist_mv_selectivity_mcvlist(root, clauses, mvstats,
+										   &fullmatch, &mcv_low);
+}
+
 /*
  * Collect attributes from mv-compatible clauses.
  * 
@@ -945,6 +1183,175 @@ collect_mv_attnums(PlannerInfo *root, List *clauses, Oid varRelid,
 }
 
 /*
+ * We're looking for statistics matching at least 2 attributes,
+ * referenced in the clauses compatible with multivariate statistics.
+ * The current selection criterion is very simple - we choose the
+ * statistics referencing the most attributes.
+ * 
+ * If there are multiple statistics referencing the same number of
+ * columns (from the clauses), the one with fewer source columns
+ * (as listed in the ADD STATISTICS when creating the statistics) wins.
+ * Otherwise the first one wins.
+ *
+ * This is a very simple criterion, and it has several weaknesses:
+ *
+ * (a) does not consider the accuracy of the statistics
+ *
+ *     If there are two histograms built on the same set of columns,
+ *     but one has 100 buckets and the other one has 1000 buckets (thus
+ *     likely providing better estimates), this is not currently
+ *     considered.
+ *
+ * (b) does not consider the type of statistics
+ *
+ *     If there are three statistics - one containing just a MCV list,
+ *     another one with just a histogram and a third one with both,
+ *     this is not considered.
+ *
+ * (c) does not consider the number of clauses
+ *
+ *     As explained, only the number of referenced attributes counts,
+ *     so if there are multiple clauses on a single attribute, this
+ *     still counts as a single attribute.
+ *
+ * (d) does not consider type of condition
+ *
+ *     Some clauses may work better with some statistics - for example
+ *     equality clauses probably work better with MCV lists than with
+ *     histograms. But IS [NOT] NULL conditions may often work better
+ *     with histograms (thanks to NULL-buckets).
+ *
+ * So for example with five WHERE conditions
+ *
+ *     WHERE (a = 1) AND (b = 1) AND (c = 1) AND (d = 1) AND (e = 1)
+ *
+ * and statistics on (a,b), (a,b,e) and (a,b,c,d), the last one will be
+ * selected as it references the most columns.
+ *
+ * Once we have selected the multivariate statistics, we split the list
+ * of clauses into two parts - conditions that are compatible with the
+ * selected stats, and conditions that are estimated using simple statistics.
+ *
+ * From the example above, conditions
+ *
+ *     (a = 1) AND (b = 1) AND (c = 1) AND (d = 1)
+ *
+ * will be estimated using the multivariate statistics (a,b,c,d) while
+ * the last condition (e = 1) will get estimated using the regular ones.
+ *
+ * There are various alternative selection criteria (e.g. counting
+ * conditions instead of just referenced attributes), but eventually
+ * the best option should be to combine multiple statistics. But that's
+ * much harder to do correctly.
+ *
+ * TODO Select multiple statistics and combine them when computing
+ *      the estimate.
+ *
+ * TODO This will probably have to consider compatibility of clauses,
+ *      because 'dependencies' will probably work only with equality
+ *      clauses.
+ */
+static int
+choose_mv_statistics(int nmvstats, MVStats mvstats, Bitmapset *attnums)
+{
+	int i, j;
+
+	int choice = -1;
+	int current_matches = 1;					/* goal #1: maximize */
+	int current_dims = (MVSTATS_MAX_DIMENSIONS+1);	/* goal #2: minimize */
+
+	/*
+	 * Walk through the statistics (simple array with nmvstats elements)
+	 * and for each one count the referenced attributes (encoded in
+	 * the 'attnums' bitmap).
+	 */
+	for (i = 0; i < nmvstats; i++)
+	{
+		/* columns matching this statistics */
+		int matches = 0;
+
+		int2vector * attrs = mvstats[i].stakeys;
+		int	numattrs = mvstats[i].stakeys->dim1;
+
+		/* count the columns covered by this statistics */
+		for (j = 0; j < numattrs; j++)
+			if (bms_is_member(attrs->values[j], attnums))
+				matches++;
+
+		/*
+		 * Use this statistics when it improves the number of matches or
+		 * when it matches the same number of attributes but is smaller.
+		 */
+		if ((matches > current_matches) ||
+			((matches == current_matches) && (current_dims > numattrs)))
+		{
+			choice = i;
+			current_matches = matches;
+			current_dims = numattrs;
+		}
+	}
+
+	return choice;
+}
+
+/*
+ * This splits the clause list into two parts - one containing clauses
+ * that will be evaluated using the chosen multivariate statistics, and
+ * the remaining clauses (either not mv-compatible, or not related to
+ * the chosen statistics).
+ */
+static List *
+clauselist_mv_split(PlannerInfo *root, SpecialJoinInfo *sjinfo,
+					List *clauses, Oid varRelid, List **mvclauses,
+					MVStats mvstats)
+{
+	int i;
+	ListCell *l;
+	List	 *non_mvclauses = NIL;
+
+	/* FIXME is there a better way to get info on int2vector? */
+	int2vector * attrs = mvstats->stakeys;
+	int	numattrs = mvstats->stakeys->dim1;
+
+	/* erase the list of mv-compatible clauses */
+	*mvclauses = NIL;
+
+	foreach (l, clauses)
+	{
+		bool		match = false;	/* by default not mv-compatible */
+		AttrNumber	attnum;
+		Node	   *clause = (Node *) lfirst(l);
+
+		if (clause_is_mv_compatible(root, clause, varRelid, NULL, &attnum, sjinfo))
+		{
+			/* Is the attribute part of the selected stats? */
+			for (i = 0; i < numattrs; i++)
+				if (attrs->values[i] == attnum)
+					match = true;
+		}
+
+		if (match)
+		{
+			/*
+			 * The clause matches the selected stats, so extract the
+			 * clause from the RestrictInfo and put it to the
+			 * multivariate list. We'll use it directly.
+			 */
+			RestrictInfo * rinfo = (RestrictInfo *) clause;
+			*mvclauses = lappend(*mvclauses, (Node*)rinfo->clause);
+		}
+		else
+			non_mvclauses = lappend(non_mvclauses, clause);
+	}
+
+	/*
+	 * Return the remaining clauses, to be estimated using regular
+	 * (single-column) statistics, as they are not compatible with
+	 * the chosen multivariate statistics.
+	 */
+	return non_mvclauses;
+}
+
+/*
  * Determines whether the clause is compatible with multivariate stats,
  * and if it is, returns some additional information - varno (index
  * into simple_rte_array) and a bitmap of attributes. This is then
@@ -981,21 +1388,23 @@ clause_is_mv_compatible(PlannerInfo *root, Node *clause, Oid varRelid,
 		/* get the actual clause from the RestrictInfo (it's not an OR clause) */
 		clause = (Node*)rinfo->clause;
 
-		/* only simple opclauses are compatible with multivariate stats */
-		if (! is_opclause(clause))
-			return false;
-
 		/* we don't support join conditions at this moment */
 		if (treat_as_join_clause(clause, rinfo, varRelid, sjinfo))
 			return false;
 
-		/* is it 'variable op constant' ? */
-		if (list_length(((OpExpr *) clause)->args) == 2)
+		/*
+		 * Only simple opclauses and IS [NOT] NULL tests are compatible with
+		 * multivariate stats at this point.
+		 */
+		if ((is_opclause(clause))
+			&& (list_length(((OpExpr *) clause)->args) == 2))
 		{
 			OpExpr	   *expr = (OpExpr *) clause;
 			bool		varonleft = true;
 			bool		ok;
 
+			/* is it 'variable op constant' ? */
+
 			ok = (bms_membership(rinfo->clause_relids) == BMS_SINGLETON) &&
 				(is_pseudo_constant_clause_relids(lsecond(expr->args),
 												rinfo->right_relids) ||
@@ -1032,8 +1441,11 @@ clause_is_mv_compatible(PlannerInfo *root, Node *clause, Oid varRelid,
 					return false;
 
 				/* Lookup info about the base relation (we need to pass the OID out) */
-				rte = planner_rt_fetch(var->varno, root);
-				*relid = rte->relid;
+				if (relid != NULL)
+				{
+					rte = planner_rt_fetch(var->varno, root);
+					*relid = rte->relid;
+				}
 
 				/*
 				 * If it's not a "<" or ">" or "=" operator, just ignore the
@@ -1051,6 +1463,45 @@ clause_is_mv_compatible(PlannerInfo *root, Node *clause, Oid varRelid,
 					}
 			}
 		}
+		else if (IsA(clause, NullTest)
+				 && IsA(((NullTest*)clause)->arg, Var))
+		{
+			RangeTblEntry * rte;
+			Var * var = (Var*)((NullTest*)clause)->arg;
+
+			/*
+			 * Simple variables only - otherwise planner_rt_fetch seems to fail
+			 * (returns NULL).
+			 *
+			 * TODO Maybe using examine_variable() would fix that?
+			 */
+			if (! (IsA(var, Var) && (varRelid == 0 || varRelid == var->varno)))
+				return false;
+
+			/*
+			 * Only consider this variable if (varRelid == 0) or when the varno
+			 * matches varRelid (see explanation at clause_selectivity).
+			 *
+			 * FIXME I suspect this may not really be necessary. The (varRelid == 0)
+			 *       part seems to be enforced by treat_as_join_clause().
+			 */
+			if (! ((varRelid == 0) || (varRelid == var->varno)))
+				return false;
+
+			/* Also skip special varno values, and system attributes ... */
+			if ((IS_SPECIAL_VARNO(var->varno)) || (! AttrNumberIsForUserDefinedAttr(var->varattno)))
+				return false;
+
+			/* Lookup info about the base relation (we need to pass the OID out) */
+			if (relid != NULL)
+			{
+				rte = planner_rt_fetch(var->varno, root);
+				*relid = rte->relid;
+			}
+			*attnum = var->varattno;
+
+			return true;
+		}
 	}
 
 	return false;
@@ -1256,3 +1707,320 @@ clauselist_apply_dependencies(PlannerInfo *root, List *clauses, Oid varRelid,
 
 	return reduced_clauses;
 }
+
+/*
+ * Estimate selectivity of clauses using a MCV list.
+ *
+ * If there's no MCV list for the stats, the function returns 0.0.
+ *
+ * While computing the estimate, the function checks whether all the
+ * columns were matched with an equality condition. If that's the case,
+ * we can skip processing the histogram, as there can be no rows in
+ * it with the same values - all the rows matching the condition are
+ * represented by the MCV item. This can only happen with equality
+ * on all the attributes.
+ *
+ * The algorithm works like this:
+ *
+ *   1) mark all items as 'match'
+ *   2) walk through all the clauses
+ *   3) for a particular clause, walk through all the items
+ *   4) skip items that are already 'no match'
+ *   5) check clause for items that still match
+ *   6) sum frequencies for items to get selectivity
+ *
+ * The function also returns the frequency of the least frequent item
+ * on the MCV list, which may be useful for clamping estimate from the
+ * histogram (all items not present in the MCV list are less frequent).
+ * This however seems useful only for cases with conditions on all
+ * attributes.
+ *
+ * TODO This only handles AND-ed clauses, but it might work for OR-ed
+ *      lists too - it just needs to reverse the logic a bit. I.e. start
+ *      with 'no match' for all items, and mark the items as a match
+ *      as the clauses are processed (and skip items that are 'match').
+ */
+static Selectivity
+clauselist_mv_selectivity_mcvlist(PlannerInfo *root, List *clauses,
+								  MVStats mvstats, bool *fullmatch,
+								  Selectivity *lowsel)
+{
+	int i;
+	Selectivity s = 0.0;
+	ListCell * l;
+	MCVList mcvlist = NULL;
+	int	nmatches = 0;
+
+	char * matches = NULL;			/* match/mismatch for each MCV item */
+	Bitmapset *eqmatches = NULL;	/* attributes with equality matches */
+
+	/* there's no MCV list built yet */
+	if (! mvstats->mcv_built)
+		return 0.0;
+
+	mcvlist = deserialize_mv_mcvlist(fetch_mv_mcvlist(mvstats->mvoid));
+
+	Assert(mcvlist != NULL);
+	Assert(clauses != NIL);
+	Assert(mcvlist->nitems > 0);
+	Assert(list_length(clauses) >= 2);
+
+	/* by default all the MCV items match the clauses fully */
+	matches = palloc0(sizeof(char) * mcvlist->nitems);
+	memset(matches, MVSTATS_MATCH_FULL, sizeof(char)*mcvlist->nitems);
+
+	/* frequency of the lowest MCV item */
+	*lowsel = 1.0;
+
+	/* number of matching MCV items */
+	nmatches = mcvlist->nitems;
+
+	/*
+	 * Loop through the list of clauses, and for each of them evaluate
+	 * all the MCV items not yet eliminated by the preceding clauses.
+	 *
+	 * FIXME This would probably deserve a refactoring, I guess. Unify
+	 *       the two loops and put the checks inside, or something like
+	 *       that.
+	 */
+	foreach (l, clauses)
+	{
+		Node * clause = (Node*)lfirst(l);
+
+		/* if there are no remaining matches possible, we can stop */
+		if (nmatches == 0)
+			break;
+
+		/* it's either OpClause, or NullTest */
+		if (is_opclause(clause))
+		{
+			OpExpr * expr = (OpExpr*)clause;
+			bool		varonleft = true;
+			bool		ok;
+
+			/* operator */
+			FmgrInfo	opproc;
+
+			fmgr_info(get_opcode(expr->opno), &opproc);
+
+			ok = (NumRelids(clause) == 1) &&
+				 (is_pseudo_constant_clause(lsecond(expr->args)) ||
+				 (varonleft = false,
+				  is_pseudo_constant_clause(linitial(expr->args))));
+
+			if (ok)
+			{
+
+				FmgrInfo	ltproc, gtproc;
+				Var * var = (varonleft) ? linitial(expr->args) : lsecond(expr->args);
+				Const * cst = (varonleft) ? lsecond(expr->args) : linitial(expr->args);
+				bool isgt = (! varonleft);
+
+				/*
+				 * TODO Fetch only when really needed (probably for equality only)
+				 * TODO Technically either lt/gt is sufficient.
+				 * 
+				 * FIXME The code in analyze.c creates histograms only for types
+				 *       with enough ordering (by calling get_sort_group_operators).
+				 *       Is this the same assumption, i.e. are we certain that we
+				 *       get the ltproc/gtproc every time we ask? Or are there types
+				 *       where get_sort_group_operators returns ltopr and here we
+				 *       get nothing?
+				 */
+				TypeCacheEntry *typecache
+					= lookup_type_cache(var->vartype,
+										TYPECACHE_EQ_OPR | TYPECACHE_LT_OPR | TYPECACHE_GT_OPR);
+
+				/* FIXME proper matching attribute to dimension */
+				int idx = mv_get_index(var->varattno, mvstats->stakeys);
+
+				fmgr_info(get_opcode(typecache->lt_opr), &ltproc);
+				fmgr_info(get_opcode(typecache->gt_opr), &gtproc);
+
+				/*
+				 * Walk through the MCV items and evaluate the current clause. We can
+				 * skip items that were already ruled out, and terminate if there are
+				 * no remaining MCV items that might possibly match.
+				 */
+				for (i = 0; i < mcvlist->nitems; i++)
+				{
+					bool tmp;
+					MCVItem item = mcvlist->items[i];
+
+					/*
+					 * find the lowest selectivity in the MCV
+					 * FIXME Maybe not the best place do do this (in for all clauses).
+					 * FIXME Maybe not the best place to do this (it runs for every clause).
+					if (item->frequency < *lowsel)
+						*lowsel = item->frequency;
+
+					/* if there are no more matches, we can stop processing this clause */
+					if (nmatches == 0)
+						break;
+
+					/* skip MCV items that were already ruled out */
+					if (matches[i] == MVSTATS_MATCH_NONE)
+						continue;
+
+					/* TODO consider bsearch here (list is sorted by values)
+					 * TODO handle other operators too (LT, GT)
+					 * TODO identify "full match" when the clauses fully
+					 *      match the whole MCV list (so that checking the
+					 *      histogram is not needed)
+					 */
+					if (get_oprrest(expr->opno) == F_EQSEL)
+					{
+						/*
+						 * We don't care about isgt in equality, because it does not
+						 * matter whether it's (var = const) or (const = var).
+						 */
+						tmp = DatumGetBool(FunctionCall2Coll(&opproc,
+															 DEFAULT_COLLATION_OID,
+															 cst->constvalue,
+															 item->values[idx]));
+						if (! tmp)
+							matches[i] = MVSTATS_MATCH_NONE;
+						else
+							eqmatches = bms_add_member(eqmatches, idx);
+					}
+					else if (get_oprrest(expr->opno) == F_SCALARLTSEL)	/* column < constant */
+					{
+
+						if (! isgt)	/* (var < const) */
+						{
+							/*
+							 * Check whether the constant is below the MCV item's value - in
+							 * that case the item cannot match, because (var < const) fails.
+							 */
+							tmp = DatumGetBool(FunctionCall2Coll(&opproc,
+																 DEFAULT_COLLATION_OID,
+																 cst->constvalue,
+																 item->values[idx]));
+
+							if (tmp)
+							{
+								matches[i] = MVSTATS_MATCH_NONE; /* no match */
+								continue;
+							}
+
+						} /* (get_oprrest(expr->opno) == F_SCALARLTSEL) */
+						else	/* (const < var) */
+						{
+							/*
+							 * Check whether the MCV item's value is below the constant - in
+							 * that case the item cannot match, because (const < var) fails.
+							 */
+							tmp = DatumGetBool(FunctionCall2Coll(&opproc,
+																 DEFAULT_COLLATION_OID,
+																 item->values[idx],
+																 cst->constvalue));
+							if (tmp)
+							{
+								matches[i] = MVSTATS_MATCH_NONE; /* no match */
+								continue;
+							}
+						}
+					}
+					else if (get_oprrest(expr->opno) == F_SCALARGTSEL)	/* column > constant */
+					{
+
+						if (! isgt)	/* (var > const) */
+						{
+							/*
+							 * Check whether the constant is above the MCV item's value - in
+							 * that case the item cannot match, because (var > const) fails.
+							 */
+							tmp = DatumGetBool(FunctionCall2Coll(&opproc,
+																 DEFAULT_COLLATION_OID,
+																 cst->constvalue,
+																 item->values[idx]));
+							if (tmp)
+							{
+								matches[i] = MVSTATS_MATCH_NONE; /* no match */
+								continue;
+							}
+
+						}
+						else /* (const > var) */
+						{
+							/*
+							 * Check whether the MCV item's value is above the constant - in
+							 * that case the item cannot match, because (const > var) fails.
+							 */
+							tmp = DatumGetBool(FunctionCall2Coll(&opproc,
+																 DEFAULT_COLLATION_OID,
+																 item->values[idx],
+																 cst->constvalue));
+							if (tmp)
+							{
+								matches[i] = MVSTATS_MATCH_NONE; /* no match */
+								continue;
+							}
+						}
+
+					} /* (get_oprrest(expr->opno) == F_SCALARGTSEL) */
+				}
+			}
+		}
+		else if (IsA(clause, NullTest))
+		{
+			NullTest * expr = (NullTest*)clause;
+			Var * var = (Var*)(expr->arg);
+
+			/* FIXME proper matching attribute to dimension */
+			int idx = mv_get_index(var->varattno, mvstats->stakeys);
+
+			/*
+			 * Walk through the MCV items and evaluate the current clause. We can
+			 * skip items that were already ruled out, and terminate if there are
+			 * no remaining MCV items that might possibly match.
+			 */
+			for (i = 0; i < mcvlist->nitems; i++)
+			{
+				MCVItem item = mcvlist->items[i];
+
+				/*
+				 * find the lowest selectivity in the MCV
+				 * FIXME Maybe not the best place to do this (it runs for every clause).
+				 */
+				if (item->frequency < *lowsel)
+					*lowsel = item->frequency;
+
+				/* if there are no more matches, we can stop processing this clause */
+				if (nmatches == 0)
+					break;
+
+				/* skip MCV items that were already ruled out */
+				if (matches[i] == MVSTATS_MATCH_NONE)
+					continue;
+
+				/* if the clause mismatches the MCV item, set it as MATCH_NONE */
+				if ((expr->nulltesttype == IS_NULL)
+					&& (! mcvlist->items[i]->isnull[idx]))
+						matches[i] = MVSTATS_MATCH_NONE;
+				else if ((expr->nulltesttype == IS_NOT_NULL) &&
+						 (mcvlist->items[i]->isnull[idx]))
+						matches[i] = MVSTATS_MATCH_NONE;
+			}
+		}
+	}
+
+	/* sum frequencies for all the matching MCV items */
+	for (i = 0; i < mcvlist->nitems; i++)
+	{
+		if (matches[i] != MVSTATS_MATCH_NONE)
+			s += mcvlist->items[i]->frequency;
+	}
+
+	/*
+	 * If all the columns were matched by equality, it's a full match.
+	 * In this case at most a single MCV item can match the clauses
+	 * (two matching items would have to contain the same combination
+	 * of values, which cannot happen in a deduplicated MCV list).
+	 */
+	*fullmatch = (bms_num_members(eqmatches) == mcvlist->ndimensions);
+
+	pfree(matches);
+	pfree(mcvlist);
+
+	return s;
+}
diff --git a/src/backend/utils/mvstats/Makefile b/src/backend/utils/mvstats/Makefile
index 099f1ed..3c0aff4 100644
--- a/src/backend/utils/mvstats/Makefile
+++ b/src/backend/utils/mvstats/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mvstats
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = common.o dependencies.o
+OBJS = common.o mcv.o dependencies.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mvstats/common.c b/src/backend/utils/mvstats/common.c
index 0edaaa6..69ab805 100644
--- a/src/backend/utils/mvstats/common.c
+++ b/src/backend/utils/mvstats/common.c
@@ -16,6 +16,10 @@
 
 #include "common.h"
 
+static VacAttrStats ** lookup_var_attr_stats(int2vector *attrs,
+											 int natts,
+											 VacAttrStats **vacattrstats);
+
 /*
  * Compute requested multivariate stats, using the rows sampled for the
  * plain (single-column) stats.
@@ -40,10 +44,15 @@ build_mv_stats(Relation onerel, int numrows, HeapTuple *rows,
 	for (i = 0; i < nmvstats; i++)
 	{
 		MVDependencies	deps  = NULL;
+		MCVList		mcvlist   = NULL;
+		int numrows_filtered  = 0;
 
 		/* int2 vector of attnums the stats should be computed on */
 		int2vector * attrs = mvstats[i].stakeys;
 
+		/* filter only the interesting vacattrstats records */
+		VacAttrStats **stats = lookup_var_attr_stats(attrs, natts, vacattrstats);
+
 		/* check allowed number of dimensions */
 		Assert((attrs->dim1 >= 2) && (attrs->dim1 <= MVSTATS_MAX_DIMENSIONS));
 
@@ -51,10 +60,14 @@ build_mv_stats(Relation onerel, int numrows, HeapTuple *rows,
 		 * Analyze functional dependencies of columns.
 		 */
 		if (mvstats->deps_enabled)
-			deps = build_mv_dependencies(numrows, rows, attrs, natts, vacattrstats);
+			deps = build_mv_dependencies(numrows, rows, attrs, stats);
+
+		/* build the MCV list */
+		if (mvstats->mcv_enabled)
+			mcvlist = build_mv_mcvlist(numrows, rows, attrs, stats, &numrows_filtered);
 
 		/* store the histogram / MCV list in the catalog */
-		update_mv_stats(mvstats[i].mvoid, deps);
+		update_mv_stats(mvstats[i].mvoid, deps, mcvlist, attrs, stats);
 	}
 }
 
@@ -63,7 +76,7 @@ build_mv_stats(Relation onerel, int numrows, HeapTuple *rows,
  * matching the attrs vector (to make it easy to work with when
  * computing multivariate stats).
  */
-VacAttrStats **
+static VacAttrStats **
 lookup_var_attr_stats(int2vector *attrs, int natts, VacAttrStats **vacattrstats)
 {
 	int i, j;
@@ -136,7 +149,7 @@ list_mv_stats(Oid relid, int *nstats, bool built_only)
 		 * Skip statistics that were not computed yet (if only stats
 		 * that were already built were requested)
 		 */
-		if (built_only && (! stats->deps_built))
+		if (built_only && (! (stats->mcv_built || stats->deps_built)))
 			continue;
 
 		/* double the array size if needed */
@@ -149,7 +162,9 @@ list_mv_stats(Oid relid, int *nstats, bool built_only)
 		result[*nstats].mvoid = HeapTupleGetOid(htup);
 		result[*nstats].stakeys = buildint2vector(stats->stakeys.values, stats->stakeys.dim1);
 		result[*nstats].deps_enabled = stats->deps_enabled;
+		result[*nstats].mcv_enabled = stats->mcv_enabled;
 		result[*nstats].deps_built = stats->deps_built;
+		result[*nstats].mcv_built = stats->mcv_built;
 		*nstats += 1;
 	}
 
@@ -164,7 +179,9 @@ list_mv_stats(Oid relid, int *nstats, bool built_only)
 }
 
 void
-update_mv_stats(Oid mvoid, MVDependencies dependencies)
+update_mv_stats(Oid mvoid,
+				MVDependencies dependencies, MCVList mcvlist,
+				int2vector *attrs, VacAttrStats **stats)
 {
 	HeapTuple	stup,
 				oldtup;
@@ -189,15 +206,26 @@ update_mv_stats(Oid mvoid, MVDependencies dependencies)
 			= PointerGetDatum(serialize_mv_dependencies(dependencies));
 	}
 
+	if (mcvlist != NULL)
+	{
+		bytea * data = serialize_mv_mcvlist(mcvlist, attrs, stats);
+		nulls[Anum_pg_mv_statistic_stamcv -1]    = (data == NULL);
+		values[Anum_pg_mv_statistic_stamcv  - 1] = PointerGetDatum(data);
+	}
+
 	/* always replace the value (either by bytea or NULL) */
 	replaces[Anum_pg_mv_statistic_stadeps -1] = true;
+	replaces[Anum_pg_mv_statistic_stamcv -1] = true;
 
 	/* always change the availability flags */
 	nulls[Anum_pg_mv_statistic_deps_built -1] = false;
+	nulls[Anum_pg_mv_statistic_mcv_built -1] = false;
 
 	replaces[Anum_pg_mv_statistic_deps_built-1] = true;
+	replaces[Anum_pg_mv_statistic_mcv_built  -1] = true;
 
 	values[Anum_pg_mv_statistic_deps_built-1] = BoolGetDatum(dependencies != NULL);
+	values[Anum_pg_mv_statistic_mcv_built  -1] = BoolGetDatum(mcvlist != NULL);
 
 	/* Is there already a pg_mv_statistic tuple for this attribute? */
 	oldtup = SearchSysCache1(MVSTATOID,
@@ -225,6 +253,21 @@ update_mv_stats(Oid mvoid, MVDependencies dependencies)
 	heap_close(sd, RowExclusiveLock);
 }
 
+
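+/*
+ * mv_get_index
+ *		Find the dimension index of an attribute within the (sorted)
+ *		stakeys vector, i.e. the number of statistics keys preceding
+ *		the attnum (assuming the attnum is present in the vector).
+ */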
+int
+mv_get_index(AttrNumber varattno, int2vector * stakeys)
+{
+	int i, idx = 0;
+
+	for (i = 0; i < stakeys->dim1; i++)
+	{
+		if (stakeys->values[i] < varattno)
+			idx += 1;
+		else
+			break;
+	}
+
+	return idx;
+}
+
 /* multi-variate stats comparator */
 
 /*
@@ -235,11 +278,15 @@ update_mv_stats(Oid mvoid, MVDependencies dependencies)
 int
 compare_scalars_simple(const void *a, const void *b, void *arg)
 {
-	Datum		da = *(Datum*)a;
-	Datum		db = *(Datum*)b;
-	SortSupport ssup= (SortSupport) arg;
+	return compare_datums_simple(*(Datum*)a,
+								 *(Datum*)b,
+								 (SortSupport)arg);
+}
 
-	return ApplySortComparator(da, false, db, false, ssup);
+int
+compare_datums_simple(Datum a, Datum b, SortSupport ssup)
+{
+	return ApplySortComparator(a, false, b, false, ssup);
 }
 
 /*
diff --git a/src/backend/utils/mvstats/common.h b/src/backend/utils/mvstats/common.h
index b98ceb7..fca2782 100644
--- a/src/backend/utils/mvstats/common.h
+++ b/src/backend/utils/mvstats/common.h
@@ -59,6 +59,14 @@ typedef struct
 	int		   *tupnoLink;
 } CompareScalarsContext;
 
+/* (de)serialization info */
+typedef struct DimensionInfo {
+	int		nvalues;	/* number of deduplicated values */
+	int		nbytes;		/* number of bytes (serialized) */
+	int		typlen;		/* pg_type.typlen */
+	bool	typbyval;	/* pg_type.typbyval */
+} DimensionInfo;
+
 /* multi-sort */
 typedef struct MultiSortSupportData {
 	int				ndims;		/* number of dimensions supported by the */
@@ -82,10 +90,8 @@ int multi_sort_compare(const void *a, const void *b, void *arg);
 int multi_sort_compare_dim(int dim, const SortItem *a,
 						   const SortItem *b, MultiSortSupport mss);
 
-VacAttrStats ** lookup_var_attr_stats(int2vector *attrs,
-									  int natts, VacAttrStats **vacattrstats);
-
 /* comparators, used when constructing multivariate stats */
+int compare_datums_simple(Datum a, Datum b, SortSupport ssup);
 int compare_scalars_simple(const void *a, const void *b, void *arg);
 int compare_scalars_partition(const void *a, const void *b, void *arg);
 int compare_scalars_memcmp(const void *a, const void *b, void *arg);
diff --git a/src/backend/utils/mvstats/dependencies.c b/src/backend/utils/mvstats/dependencies.c
index 93a2fa6..0543690 100644
--- a/src/backend/utils/mvstats/dependencies.c
+++ b/src/backend/utils/mvstats/dependencies.c
@@ -224,7 +224,7 @@
  */
 MVDependencies
 build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs,
-					  int natts, VacAttrStats **vacattrstats)
+					  VacAttrStats **stats)
 {
 	int i;
 	int numattrs = attrs->dim1;
@@ -245,13 +245,6 @@ build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs,
 	/* dimension indexes we'll check for associations [a => b] */
 	int dima, dimb;
 
-	/* info for the interesting attributes only
-	 *
-	 * TODO Compute this only once and pass it to all the methods
-	 *      that need it.
-	 */
-	VacAttrStats **stats = lookup_var_attr_stats(attrs, natts, vacattrstats);
-
 	/*
 	 * We'll reuse the same array for all the 2-column combinations.
 	 *
@@ -287,7 +280,7 @@ build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs,
 	for (dima = 0; dima < numattrs; dima++)
 	{
 		/* prepare the sort function for the first dimension */
-		multi_sort_add_dimension(mss, 0, dima, vacattrstats);
+		multi_sort_add_dimension(mss, 0, dima, stats);
 
 		for (dimb = 0; dimb < numattrs; dimb++)
 		{
@@ -309,7 +302,7 @@ build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs,
 				continue;
 
 			/* prepare the sort function for the second dimension */
-			multi_sort_add_dimension(mss, 1, dimb, vacattrstats);
+			multi_sort_add_dimension(mss, 1, dimb, stats);
 
 			/* reset the values and isnull flags */
 			memset(values, 0, sizeof(Datum) * numrows * 2);
diff --git a/src/backend/utils/mvstats/mcv.c b/src/backend/utils/mvstats/mcv.c
new file mode 100644
index 0000000..2b3d171
--- /dev/null
+++ b/src/backend/utils/mvstats/mcv.c
@@ -0,0 +1,1002 @@
+/*-------------------------------------------------------------------------
+ *
+ * mcv.c
+ *	  POSTGRES multivariate MCV lists
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mvstats/mcv.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "common.h"
+
+/*
+ * Multivariate MCVs (most-common values lists) are a straightforward
+ * extension of regular MCV list by tracking combinations of values for
+ * several attributes (columns), including NULL flags, and frequency
+ * of the combination.
+ *
+ * For columns with a small number of distinct values, this works quite
+ * well and may represent the distribution pretty exactly. For columns
+ * with a large number of distinct values (e.g. stored as FLOAT), this
+ * does not work that well.
+ *
+ * If we can represent the distribution as a MCV list, we can estimate
+ * some clauses (e.g. equality clauses) much more accurately than by
+ * using histograms, for example.
+ *
+ * Discrete distributions are also easier to combine into a larger
+ * distribution (but this is not yet implemented).
+ *
+ *
+ * TODO For types that don't reasonably support ordering (either because
+ *      the type does not support that or when the user adds some option
+ *      to the ADD STATISTICS command - e.g. UNSORTED_STATS), building
+ *      the histogram may be pointless and inefficient. This is esp.
+ *      true for varlena types that may be quite large and a large MCV
+ *      list may be a better choice, because it makes equality estimates
+ *      more accurate. Due to the unsorted nature, range queries on those
+ *      attributes are rather useless anyway.
+ *
+ *      Another thing is that by restricting to MCV list and equality
+ *      conditions, we can use hash values instead of long varlena values.
+ *      The equality estimation will be very accurate.
+ *
+ *      This however complicates matching the columns to available
+ *      statistics, as it will require matching clauses (not columns) to
+ *      stats. And it may get quite complex - e.g. what if there are
+ *      multiple clauses, each compatible with different stats subset?
+ * 
+ *
+ * Selectivity estimation
+ * ----------------------
+ * The estimation, implemented in clauselist_mv_selectivity_mcvlist(),
+ * is quite simple in principle - walk through the MCV items and sum
+ * frequencies of all the items that match all the clauses.
+ *
+ * The current implementation uses MCV lists to estimate these types
+ * of clauses (think of WHERE conditions):
+ *
+ *  (a) equality clauses    WHERE (a = 1) AND (b = 2)
+ *
+ *  (b) inequality clauses  WHERE (a < 1) AND (b >= 2)
+ *
+ *  (c) NULL clauses        WHERE (a IS NULL) AND (b IS NOT NULL)
+ *
+ * It's possible to add more clause types, for example:
+ *
+ *  (d) multi-var clauses   WHERE (a > b)
+ *
+ * and so on. These are tasks for the future, not yet implemented.
+ *
+ *
+ * Estimating equality clauses
+ * ---------------------------
+ * When computing selectivity estimate for equality clauses
+ *
+ *      (a = 1) AND (b = 2)
+ *
+ * we can do this estimate pretty exactly assuming that two conditions
+ * are met:
+ *
+ *      (1) there's an equality condition on each attribute
+ *
+ *      (2) we find a matching item in the MCV list
+ *
+ * In that case we know the MCV item represents all the tuples matching
+ * the clauses, and the selectivity estimate is complete. This is what
+ * we call 'full match'.
+ *
+ * When only (1) holds, but there's no matching MCV item, we don't know
+ * whether there are no such rows or they are just not very frequent. We can
+ * however use the frequency of the least frequent MCV item as an upper
+ * bound for the selectivity.
+ *
+ * If the equality conditions match only a subset of the attributes
+ * the MCV list is built on, we can't get a full match - we may get
+ * multiple MCV items matching the clauses, and even if we get a single
+ * match there may be rows that did not get into the MCV list. In
+ * this case we can still use the frequency of the least frequent MCV
+ * item to clamp the 'additional' selectivity not accounted for by the
+ * matching items.
+ *
+ * If there's no histogram, because the MCV list approximates the
+ * distribution accurately (not because the histogram was disabled),
+ * it does not really matter whether there are equality conditions on
+ * all the columns - we can do pretty accurate estimation using the MCV.
+ * 
+ * TODO For a combination of equality conditions (not full-match case)
+ *      we probably can clamp the selectivity by the minimum of
+ *      selectivities for each condition. For example if we know the
+ *      number of distinct values for each column, we can use 1/ndistinct
+ *      as a per-column estimate. Or rather 1/ndistinct + selectivity
+ *      derived from the MCV list.
+ *
+ *      If we know the estimate of number of combinations of the columns
+ *      (i.e. ndistinct(A,B)), we may estimate the average frequency of
+ *      items in the remaining 10% as [10% / ndistinct(A,B)].
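+ *
+ * As a worked example (with hypothetical numbers, just to illustrate
+ * the full-match logic above): suppose the MCV list on (a,b) contains
+ * the items {(1,1): 0.25, (1,2): 0.10, (2,1): 0.04}. The clauses
+ * (a = 1) AND (b = 2) form an equality condition on every attribute
+ * and match the second item exactly, so the estimate is 0.10. The
+ * clauses (a = 3) AND (b = 3) match no item, so the selectivity is
+ * bounded from above by 0.04, the frequency of the least frequent item.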
+ *
+ *
+ * Bounding estimates
+ * ------------------
+ * In general the MCV lists may not provide estimates as accurate as
+ * for the full-match equality case, but may provide some useful
+ * lower/upper boundaries for the estimation error.
+ *
+ * With equality clauses we can do a few more tricks to narrow this
+ * error range (see the previous section and TODO), but with inequality
+ * clauses (or generally non-equality clauses), it's rather difficult.
+ * There's nothing like a 'full match' - we have to consider both the
+ * MCV items and the remaining part every time. We can't use the minimum
+ * selectivity of MCV items, as the clauses may match multiple items.
+ *
+ * For example with a MCV list on columns (A, B), covering 90% of the
+ * table (computed while building the MCV list), about ~10% of the table
+ * is not represented by the MCV list. So even if the conditions match
+ * all the remaining rows (not represented by the MCV items), we can't
+ * get selectivity higher than those 10%. We may use 1/2 the remaining
+ * selectivity as an estimate (minimizing average error).
+ *
+ * TODO Most of these ideas (error limiting) are not yet implemented.
+ *
+ *
+ * General TODO
+ * ------------
+ *
+ * FIXME Use max_mcv_items from ALTER TABLE ADD STATISTICS command.
+ *
+ * TODO Add support for clauses referencing multiple columns (a < b).
+ *
+ * TODO It's possible to build a special case of MCV list, storing not
+ *      the actual values but only 32/64-bit hash. This is only useful
+ *      for estimating equality clauses and for large varlena types,
+ *      which are very impractical for plain MCV list because of size.
+ *      But for those data types we really want just the equality
+ *      clauses, so it's actually a good solution.
+ *
+ * TODO Currently there's no logic to consider building only a MCV list
+ *      (and not building the histogram at all), except for making this
+ *      decision manually in ADD STATISTICS.
+ */
+
+/*
+ * Each serialized item needs to store (in this order):
+ *
+ * - indexes              (ndim * sizeof(int32))
+ * - null flags           (ndim * sizeof(bool))
+ * - frequency            (sizeof(double))
+ *
+ * So in total:
+ *
+ *   ndim * (sizeof(int32) + sizeof(bool)) + sizeof(double)
+ */
+#define ITEM_SIZE(ndims)	\
+	(ndims * (sizeof(int32) + sizeof(bool)) + sizeof(double))
+
+/* pointers into a flat serialized item of ITEM_SIZE(n) bytes */
+#define ITEM_INDEXES(item)			((int32*)item)
+#define ITEM_NULLS(item,ndims)		((bool*)(ITEM_INDEXES(item) + ndims))
+#define ITEM_FREQUENCY(item,ndims)	((double*)(ITEM_NULLS(item,ndims) + ndims))
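+
+/*
+ * For example, with ndims = 2 this works out as 2 * (4 + 1) + 8 = 18
+ * bytes per serialized item (assuming the usual 4-byte int32, 1-byte
+ * bool and 8-byte double).
+ */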
+
+/*
+ * Builds MCV list from sample rows, and removes rows represented by
+ * the MCV list from the sample (the number of remaining sample rows is
+ * returned by the numrows_filtered parameter).
+ *
+ * The method is quite simple - in short it does about these steps:
+ *
+ *       (1) sort the data (default collation, '<' for the data type)
+ *
+ *       (2) count distinct groups, decide how many to keep
+ *
+ *       (3) build the MCV list using the threshold determined in (2)
+ *
+ *       (4) remove rows represented by the MCV from the sample
+ *
+ * For more details, see the comments in the code.
+ *
+ * FIXME Single-dimensional MCV is sorted by frequency (descending). We
+ *       should do that too, because when walking through the list we
+ *       want to check the most frequent items first.
+ *
+ * TODO We're using Datum (8B), even for narrower data types (e.g. int4
+ *      or float4). Maybe we could save some space here, but the bytea
+ *      compression should handle it just fine.
+ *
+ * TODO This probably should not use ndistinct directly (as computed
+ *      from the sample), but rather an estimate of the number of
+ *      distinct values in the whole table, no?
+ */
+MCVList
+build_mv_mcvlist(int numrows, HeapTuple *rows, int2vector *attrs,
+					  VacAttrStats **stats, int *numrows_filtered)
+{
+	int i, j;
+	int numattrs = attrs->dim1;
+	int ndistinct = 0;
+	int mcv_threshold = 0;
+	int count = 0;
+	int nitems = 0;
+
+	MCVList	mcvlist = NULL;
+
+	/* Sort by multiple columns (using array of SortSupport) */
+	MultiSortSupport mss = multi_sort_init(numattrs);
+
+	/*
+	 * Preallocate the values/isnull arrays as single chunks, and point
+	 * each item at the appropriate part of those arrays.
+	 */
+	SortItem   *items  = (SortItem*)palloc0(numrows * sizeof(SortItem));
+	Datum	   *values = (Datum*)palloc0(sizeof(Datum) * numrows * numattrs);
+	bool	   *isnull = (bool*)palloc0(sizeof(bool) * numrows * numattrs);
+
+	/* keep all the rows by default (as if there was no MCV list) */
+	*numrows_filtered = numrows;
+
+	for (i = 0; i < numrows; i++)
+	{
+		items[i].values = &values[i * numattrs];
+		items[i].isnull = &isnull[i * numattrs];
+	}
+
+	/* load the values/null flags from sample rows */
+	for (j = 0; j < numrows; j++)
+		for (i = 0; i < numattrs; i++)
+			items[j].values[i] = heap_getattr(rows[j], attrs->values[i],
+								stats[i]->tupDesc, &items[j].isnull[i]);
+
+	/* prepare the sort functions for all the attributes */
+	for (i = 0; i < numattrs; i++)
+		multi_sort_add_dimension(mss, i, i, stats);
+
+	/* do the sort, using the multi-sort */
+	qsort_arg((void *) items, numrows, sizeof(SortItem),
+				multi_sort_compare, mss);
+
+	/*
+	 * Count the number of distinct groups - just walk through the
+	 * sorted list and count the number of key changes. We use this to
+	 * determine the threshold (125% of the average frequency).
+	 */
+	ndistinct = 1;
+	for (i = 1; i < numrows; i++)
+		if (multi_sort_compare(&items[i], &items[i-1], mss) != 0)
+			ndistinct += 1;
+
+	/*
+	 * Determine how many groups actually exceed the threshold, and then
+	 * walk the array again and collect them into an array. We'll always
+	 * require at least 4 rows per group.
+	 *
+	 * But if we can fit all the distinct values in the MCV list (i.e.
+	 * if there are fewer distinct groups than MVSTAT_MCVLIST_MAX_ITEMS),
+	 * we'll require only 2 rows per group.
+	 *
+	 * TODO For now the threshold is the same as in the single-column
+	 *      case (average + 25%), but maybe that's worth revisiting
+	 *      for the multivariate case.
+	 *
+	 * TODO We can do this only if we believe we got all the distinct
+	 *      values of the table.
+	 *
+	 * FIXME This should really reference mcv_max_items (from catalog)
+	 *       instead of the constant MVSTAT_MCVLIST_MAX_ITEMS.
+	 */
+	mcv_threshold = 1.25 * numrows / ndistinct;
+	mcv_threshold = (mcv_threshold < 4) ? 4  : mcv_threshold;
+
+	if (ndistinct <= MVSTAT_MCVLIST_MAX_ITEMS)
+		mcv_threshold = 2;
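+	/*
+	 * Example (illustrative numbers): with numrows = 30000 and
+	 * ndistinct = 9000, the threshold is 1.25 * 30000 / 9000 ~ 4 rows;
+	 * had there been at most MVSTAT_MCVLIST_MAX_ITEMS distinct groups,
+	 * it would drop to 2 rows per group.
+	 */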
+
+	/*
+	 * Walk through the sorted data again, and see how many groups
+	 * reach the mcv_threshold (and become an item in the MCV list).
+	 */
+	count = 1;
+	for (i = 1; i <= numrows; i++)
+	{
+		/* last row or new group, so check if we exceed mcv_threshold */
+		if ((i == numrows) || (multi_sort_compare(&items[i], &items[i-1], mss) != 0))
+		{
+			/* group hits the threshold, count the group as MCV item */
+			if (count >= mcv_threshold)
+				nitems += 1;
+
+			count = 1;
+		}
+		else	/* within group, so increase the number of items */
+			count += 1;
+	}
+
+	/* we know the number of MCV list items, so let's build the list */
+	if (nitems > 0)
+	{
+		/* allocate the MCV list structure, set parameters we know */
+		mcvlist = (MCVList)palloc0(sizeof(MCVListData));
+
+		mcvlist->magic = MVSTAT_MCV_MAGIC;
+		mcvlist->type = MVSTAT_MCV_TYPE_BASIC;
+		mcvlist->ndimensions = numattrs;
+		mcvlist->nitems = nitems;
+
+		/*
+		 * Preallocate the Datum/isnull arrays (not as a single chunk,
+		 * as we'll pass this outside this function, so it needs to be
+		 * easy to pfree() the data, and we wouldn't know where the
+		 * arrays start).
+		 *
+		 * TODO Maybe the reasoning that we can't allocate a single
+		 *      piece because we're passing it out is bogus? Who'd
+		 *      free a single item of the MCV list, anyway?
+		 *
+		 * TODO Maybe with a proper encoding (stuffing all the values
+		 *      into a list-level array) this will no longer be true?
+		 */
+		mcvlist->items = (MCVItem*)palloc0(sizeof(MCVItem)*nitems);
+
+		for (i = 0; i < nitems; i++)
+		{
+			mcvlist->items[i] = (MCVItem)palloc0(sizeof(MCVItemData));
+			mcvlist->items[i]->values = (Datum*)palloc0(sizeof(Datum)*numattrs);
+			mcvlist->items[i]->isnull = (bool*)palloc0(sizeof(bool)*numattrs);
+		}
+
+		/*
+		 * Repeat the same loop as above, but this time copy the data
+		 * into the MCV list (for items exceeding the threshold).
+		 *
+		 * TODO Maybe we could simply remember indexes of the last item
+		 *      in each group (from the previous loop)?
+		 */
+		count = 1;
+		nitems = 0;
+		for (i = 1; i <= numrows; i++)
+		{
+			/* last row or a new group */
+			if ((i == numrows) || (multi_sort_compare(&items[i], &items[i-1], mss) != 0))
+			{
+				/* count the MCV item if exceeding the threshold (and copy into the array) */
+				if (count >= mcv_threshold)
+				{
+					/* just pointer to the proper place in the list */
+					MCVItem item = mcvlist->items[nitems];
+
+					/* copy values from the _previous_ group (its last item) */
+					memcpy(item->values, items[(i-1)].values, sizeof(Datum) * numattrs);
+					memcpy(item->isnull, items[(i-1)].isnull, sizeof(bool)  * numattrs);
+
+					/* and finally the group frequency */
+					item->frequency = (double)count / numrows;
+
+					/* next item */
+					nitems += 1;
+				}
+
+				count = 1;
+			}
+			else	/* same group, just increase the number of items */
+				count += 1;
+		}
+
+		/* make sure the loops are consistent */
+		Assert(nitems == mcvlist->nitems);
+
+		/*
+		 * Remove the rows matching the MCV list (i.e. keep only rows
+		 * that are not represented by the MCV list).
+		 *
+		 * FIXME This implementation is rather naive, effectively O(N^2).
+		 *       As the MCV list grows, the check will take longer and
+		 *       longer. And as the number of sampled rows increases (by
+		 *       increasing statistics target), it will take longer and
+		 *       longer. One option is to sort the MCV items first and
+		 *       then perform a binary search.
+		 *
+		 *       A better option would be keeping the ID of the row in
+		 *       the sort item, and then just walk through the items and
+		 *       mark rows to remove (in a bitmap of the same size).
+		 *       There's no space for that in SortItem at the moment,
+		 *       but it's trivial to add a 'private' pointer, or to use
+		 *       another structure with an extra field (starting with
+		 *       SortItem, so that the comparators etc. still work).
+		 *
+		 *       Another option is to use the sorted array of items
+		 *       (because that's how we sorted the source data), and
+		 *       simply do a bsearch() into it. If we find a matching
+		 *       item, the row belongs to the MCV list.
+		 */
+		if (nitems == ndistinct) /* all rows are covered by MCV items */
+			*numrows_filtered = 0;
+		else /* (nitems < ndistinct) && (nitems > 0) */
+		{
+			int nfiltered = 0;
+			HeapTuple *rows_filtered = (HeapTuple*)palloc0(sizeof(HeapTuple) * numrows);
+
+			/* used for the searches */
+			SortItem item, mcvitem;
+
+			item.values = (Datum*)palloc0(numattrs * sizeof(Datum));
+			item.isnull = (bool*)palloc0(numattrs * sizeof(bool));
+
+			/*
+			 * FIXME we don't need to allocate this, we can reference
+			 *       the MCV item directly ...
+			 */
+			mcvitem.values = (Datum*)palloc0(numattrs * sizeof(Datum));
+			mcvitem.isnull = (bool*)palloc0(numattrs * sizeof(bool));
+
+			/* walk through the tuples, compare the values to MCV items */
+			for (i = 0; i < numrows; i++)
+			{
+				bool	match = false;
+
+				/* collect the key values from the row */
+				for (j = 0; j < numattrs; j++)
+					item.values[j] = heap_getattr(rows[i], attrs->values[j],
+										stats[j]->tupDesc, &item.isnull[j]);
+
+				/* scan through the MCV list for matches */
+				for (j = 0; j < mcvlist->nitems; j++)
+				{
+					/*
+					 * TODO Create a SortItem/MCVItem comparator so that
+					 *      we don't need to do memcpy() like crazy.
+					 */
+					memcpy(mcvitem.values, mcvlist->items[j]->values,
+							numattrs * sizeof(Datum));
+					memcpy(mcvitem.isnull, mcvlist->items[j]->isnull,
+							numattrs * sizeof(bool));
+
+					if (multi_sort_compare(&item, &mcvitem, mss) == 0)
+					{
+						match = true;
+						break;
+					}
+				}
+
+				/* if no match in the MCV list, copy the row into the filtered ones */
+				if (! match)
+					memcpy(&rows_filtered[nfiltered++], &rows[i], sizeof(HeapTuple));
+			}
+
+			/* replace the rows and remember how many rows we kept */
+			memcpy(rows, rows_filtered, sizeof(HeapTuple) * nfiltered);
+			*numrows_filtered = nfiltered;
+
+			/* free all the data used here */
+			pfree(rows_filtered);
+			pfree(item.values);
+			pfree(item.isnull);
+			pfree(mcvitem.values);
+			pfree(mcvitem.isnull);
+		}
+	}
+
+	pfree(values);
+	pfree(items);
+	pfree(isnull);
+
+	return mcvlist;
+}
+
+
+/* fetch the MCV list (as a bytea) from the pg_mv_statistic catalog */
+bytea *
+fetch_mv_mcvlist(Oid mvoid)
+{
+	Relation	indrel;
+	SysScanDesc indscan;
+	ScanKeyData skey;
+	HeapTuple	htup;
+	bytea	   *mcvlist = NULL;
+
+	/* Prepare to scan pg_mv_statistic for the row with OID = mvoid. */
+	ScanKeyInit(&skey,
+				ObjectIdAttributeNumber,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(mvoid));
+
+	indrel = heap_open(MvStatisticRelationId, AccessShareLock);
+	indscan = systable_beginscan(indrel, MvStatisticOidIndexId, true,
+								 NULL, 1, &skey);
+
+	while (HeapTupleIsValid(htup = systable_getnext(indscan)))
+	{
+		bool isnull = false;
+		Datum tmp  = SysCacheGetAttr(MVSTATOID, htup,
+								   Anum_pg_mv_statistic_stamcv, &isnull);
+
+		Assert(!isnull);
+
+		mcvlist = DatumGetByteaP(tmp);
+
+		break;
+	}
+
+	systable_endscan(indscan);
+
+	heap_close(indrel, AccessShareLock);
+
+	/* TODO Maybe save the list into the relcache, as RelationGetIndexList
+	 *      does (it was the inspiration for this function)? */
+
+	return mcvlist;
+}
+
+/* print some basic info about the MCV list
+ *
+ * TODO Add info about what part of the table this covers.
+ */
+Datum
+pg_mv_stats_mcvlist_info(PG_FUNCTION_ARGS)
+{
+	bytea	   *data = PG_GETARG_BYTEA_P(0);
+	char	   *result;
+
+	MCVList	mcvlist = deserialize_mv_mcvlist(data);
+
+	result = palloc0(128);
+	snprintf(result, 128, "nitems=%d", mcvlist->nitems);
+
+	pfree(mcvlist);
+
+	PG_RETURN_TEXT_P(cstring_to_text(result));
+}
+
+/* used to pass context into bsearch() */
+static SortSupport ssup_private = NULL;
+
+static int bsearch_comparator(const void * a, const void * b);
+
+/*
+ * Serialize MCV list into a bytea value. The basic algorithm is simple:
+ *
+ * (1) perform deduplication for each attribute (separately)
+ *     (a) collect all (non-NULL) attribute values from all MCV items
+ *     (b) sort the data (using 'lt' from VacAttrStats)
+ *     (c) remove duplicate values from the array
+ *
+ * (2) serialize the arrays into a bytea value
+ *
+ * (3) process all MCV list items
+ *     (a) replace values with indexes into the arrays
+ *
+ * Each attribute has to be processed separately, because we're mixing
+ * different datatypes, and we don't know what equality means for them.
+ * We're also mixing pass-by-value and pass-by-ref types, and so on.
+ *
+ * We'll use 32-bit values for the indexes in step (3), although we
+ * could probably use just 16 bits, as max_mcv_items does not allow
+ * more than 8k items in the MCV list (well, we might increase that to
+ * 32k and still fit into a signed 16-bit value). But let's be lazy and
+ * rely on the varlena compression to kick in; most of those bytes will
+ * be 0x00, so it should compress nicely.
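+ *
+ * To illustrate step (3) with made-up values: if the deduplicated
+ * array for dimension 0 is {10, 20, 40} and an MCV item has the value
+ * 20 in that dimension, the serialized item stores just the index 1.
+ * NULL values skip the lookup entirely and are represented only by
+ * the null flag.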
+ *
+ * FIXME This probably leaks memory, or at least uses it inefficiently
+ *       (many small palloc() calls instead of a large one).
+ *
+ * TODO Consider using 16-bit values for the indexes in step (3).
+ *
+ * TODO Consider packing boolean flags (NULL) for each item into 'char'
+ *      or a longer type (instead of using an array of bool items).
+ */
+bytea *
+serialize_mv_mcvlist(MCVList mcvlist, int2vector *attrs,
+					 VacAttrStats **stats)
+{
+	int	i, j;
+	int	ndims = mcvlist->ndimensions;
+	int	itemsize = ITEM_SIZE(ndims);
+
+	Size	total_length = 0;
+
+	char   *item = palloc0(itemsize);
+
+	/* serialized items (indexes into arrays, etc.) */
+	bytea  *output;
+	char   *data = NULL;
+
+	/* values per dimension (and number of non-NULL values) */
+	Datum **values = (Datum**)palloc0(sizeof(Datum*) * ndims);
+	int	   *counts = (int*)palloc0(sizeof(int) * ndims);
+
+	/* info about dimensions (for deserialize) */
+	DimensionInfo * info
+				= (DimensionInfo *)palloc0(sizeof(DimensionInfo)*ndims);
+
+	/* sort support data */
+	SortSupport	ssup = (SortSupport)palloc0(sizeof(SortSupportData)*ndims);
+
+	/* collect and deduplicate values for each dimension */
+	for (i = 0; i < ndims; i++)
+	{
+		int count;
+		StdAnalyzeData *tmp = (StdAnalyzeData *)stats[i]->extra_data;
+
+		/* keep important info about the data type */
+		info[i].typlen   = stats[i]->attrtype->typlen;
+		info[i].typbyval = stats[i]->attrtype->typbyval;
+
+		/* allocate space for all values, including NULLs (won't use them) */
+		values[i] = (Datum*)palloc0(sizeof(Datum) * mcvlist->nitems);
+
+		for (j = 0; j < mcvlist->nitems; j++)
+		{
+			if (! mcvlist->items[j]->isnull[i])	/* skip NULL values */
+			{
+				values[i][counts[i]] = mcvlist->items[j]->values[i];
+				counts[i] += 1;
+			}
+		}
+
+		/* there are just NULL values in this dimension */
+		if (counts[i] == 0)
+			continue;
+
+		/* sort and deduplicate */
+		ssup[i].ssup_cxt = CurrentMemoryContext;
+		ssup[i].ssup_collation = DEFAULT_COLLATION_OID;
+		ssup[i].ssup_nulls_first = false;
+
+		PrepareSortSupportFromOrderingOp(tmp->ltopr, &ssup[i]);
+
+		qsort_arg(values[i], counts[i], sizeof(Datum),
+										compare_scalars_simple, &ssup[i]);
+
+		/*
+		 * Walk through the array and eliminate duplicate values, but
+		 * keep the ordering (so that we can do bsearch later). We know
+		 * there's at least 1 item, so we can skip the first element.
+		 */
+		count = 1;	/* number of deduplicated items */
+		for (j = 1; j < counts[i]; j++)
+		{
+			/* if it's different from the previous value, we need to keep it */
+			if (compare_datums_simple(values[i][j-1], values[i][j], &ssup[i]) != 0)
+			{
+				/* XXX: not needed if (count == j) */
+				values[i][count] = values[i][j];
+				count += 1;
+			}
+		}
+
+		/* keep info about the deduplicated count */
+		info[i].nvalues = count;
+
+		/* compute size of the serialized data */
+		if (info[i].typbyval)
+			/*
+			 * passed by value, so just Datum array (int4, int8, ...)
+			 *
+			 * TODO Might save a few bytes here, by storing just typlen
+			 *      bytes instead of the whole Datum (8B on 64-bit builds).
+			 */
+			info[i].nbytes = info[i].nvalues * sizeof(Datum);
+		else if (info[i].typlen > 0)
+			/* passed by reference, but fixed length (name, tid, ...) */
+			info[i].nbytes = info[i].nvalues * info[i].typlen;
+		else if (info[i].typlen == -1)
+			/* varlena, so just use VARSIZE_ANY */
+			for (j = 0; j < info[i].nvalues; j++)
+				info[i].nbytes += VARSIZE_ANY(values[i][j]);
+		else if (info[i].typlen == -2)
+			/* cstring, so strlen + 1 (the \0 terminator is stored too) */
+			for (j = 0; j < info[i].nvalues; j++)
+				info[i].nbytes += strlen(DatumGetPointer(values[i][j])) + 1;
+		else
+			elog(ERROR, "unknown data type typbyval=%d typlen=%d",
+				info[i].typbyval, info[i].typlen);
+	}
+
+	/*
+	 * Now we finally know how much space we'll need for the serialized
+	 * MCV list, as it contains these fields:
+	 *
+	 * - length (4B) for varlena
+	 * - magic (4B)
+	 * - type (4B)
+	 * - ndimensions (4B)
+	 * - nitems (4B)
+	 * - info (ndim * sizeof(DimensionInfo))
+	 * - arrays of values for each dimension
+	 * - serialized items (nitems * itemsize)
+	 *
+	 * So the 'header' size is 20B + ndim * sizeof(DimensionInfo), and
+	 * then we'll place the data.
+	 */
+	total_length = (sizeof(int32) + offsetof(MCVListData, items)
+					+ ndims * sizeof(DimensionInfo)
+					+ mcvlist->nitems * itemsize);
+
+	for (i = 0; i < ndims; i++)
+		total_length += info[i].nbytes;
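+
+	/*
+	 * For illustration (made-up numbers): with ndims = 2 and
+	 * nitems = 100 this is 20 + 2 * sizeof(DimensionInfo) bytes of
+	 * header, 100 * ITEM_SIZE(2) = 1800 bytes of items, plus however
+	 * many bytes the two deduplicated value arrays need.
+	 */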
+
+	/* enforce arbitrary limit of 1MB */
+	if (total_length > 1024 * 1024)
+		elog(ERROR, "serialized MCV exceeds 1MB (%ld)", total_length);
+
+	/* allocate space for the serialized MCV list, set header fields */
+	output = (bytea*)palloc0(total_length);
+	SET_VARSIZE(output, total_length);
+
+	/* we'll use 'data' to keep track of where to write next */
+	data = VARDATA(output);
+
+	memcpy(data, mcvlist, offsetof(MCVListData, items));
+	data += offsetof(MCVListData, items);
+
+	memcpy(data, info, sizeof(DimensionInfo) * ndims);
+	data += sizeof(DimensionInfo) * ndims;
+
+	/* value array for each dimension */
+	for (i = 0; i < ndims; i++)
+	{
+#ifdef USE_ASSERT_CHECKING
+		char *tmp = data;
+#endif
+		for (j = 0; j < info[i].nvalues; j++)
+		{
+			if (info[i].typbyval)
+			{
+				/* passed by value / Datum */
+				memcpy(data, &values[i][j], sizeof(Datum));
+				data += sizeof(Datum);
+			}
+			else if (info[i].typlen > 0)
+			{
+				/* passed by reference, but fixed length (name, tid, ...) */
+				memcpy(data, &values[i][j], info[i].typlen);
+				data += info[i].typlen;
+			}
+			else if (info[i].typlen == -1)
+			{
+				/* varlena */
+				memcpy(data, DatumGetPointer(values[i][j]),
+							VARSIZE_ANY(values[i][j]));
+				data += VARSIZE_ANY(values[i][j]);
+			}
+			else if (info[i].typlen == -2)
+			{
+				/* cstring (don't forget the \0 terminator!) */
+				memcpy(data, DatumGetPointer(values[i][j]),
+							strlen(DatumGetPointer(values[i][j])) + 1);
+				data += strlen(DatumGetPointer(values[i][j])) + 1;
+			}
+		}
+		Assert((data - tmp) == info[i].nbytes);
+	}
+
+	/* and finally, the MCV items */
+	for (i = 0; i < mcvlist->nitems; i++)
+	{
+		/* don't write beyond the allocated space */
+		Assert(data <= (char*)output + total_length - itemsize);
+
+		/* reset the values for each item */
+		memset(item, 0, itemsize);
+
+		for (j = 0; j < ndims; j++)
+		{
+			/* do the lookup only for non-NULL values */
+			if (! mcvlist->items[i]->isnull[j])
+			{
+				Datum * v = NULL;
+				ssup_private = &ssup[j];
+
+				v = (Datum*)bsearch(&mcvlist->items[i]->values[j],
+								values[j], info[j].nvalues, sizeof(Datum),
+								bsearch_comparator);
+
+				if (v == NULL)
+					elog(ERROR, "value for dim %d not found in array", j);
+
+				/* compute index within the array */
+				ITEM_INDEXES(item)[j] = (v - values[j]);
+
+				/* check the index is within expected bounds */
+				Assert(ITEM_INDEXES(item)[j] >= 0);
+				Assert(ITEM_INDEXES(item)[j] < info[j].nvalues);
+			}
+		}
+
+		/* copy NULL and frequency flags into the item */
+		memcpy(ITEM_NULLS(item, ndims),
+				mcvlist->items[i]->isnull, sizeof(bool) * ndims);
+		memcpy(ITEM_FREQUENCY(item, ndims),
+				&mcvlist->items[i]->frequency, sizeof(double));
+
+		/* copy the item into the array */
+		memcpy(data, item, itemsize);
+
+		data += itemsize;
+	}
+
+	/* at this point we expect to match the total_length exactly */
+	Assert((data - (char*)output) == total_length);
+
+	return output;
+}
+
+/* inverse to serialize_mv_mcvlist() - see the comment there */
+MCVList
+deserialize_mv_mcvlist(bytea * data)
+{
+	int		i, j;
+	Size	expected_size;
+	MCVList mcvlist;
+	char   *tmp;
+
+	int		ndims, nitems, itemsize;
+	DimensionInfo *info = NULL;
+
+	int32  *indexes = NULL;
+	Datum **values = NULL;
+
+	if (data == NULL)
+		return NULL;
+
+	if (VARSIZE_ANY_EXHDR(data) < offsetof(MCVListData,items))
+		elog(ERROR, "invalid MCV Size %ld (expected at least %ld)",
+			 VARSIZE_ANY_EXHDR(data), offsetof(MCVListData,items));
+
+	/* read the MCV list header */
+	mcvlist = (MCVList)palloc0(sizeof(MCVListData));
+
+	/* initialize pointer to the data part (skip the varlena header) */
+	tmp = VARDATA(data);
+
+	/* get the header and perform basic sanity checks */
+	memcpy(mcvlist, tmp, offsetof(MCVListData,items));
+	tmp += offsetof(MCVListData,items);
+
+	if (mcvlist->magic != MVSTAT_MCV_MAGIC)
+		elog(ERROR, "invalid MCV magic %d (expected %d)",
+			 mcvlist->magic, MVSTAT_MCV_MAGIC);
+
+	if (mcvlist->type != MVSTAT_MCV_TYPE_BASIC)
+		elog(ERROR, "invalid MCV type %d (expected %d)",
+			 mcvlist->type, MVSTAT_MCV_TYPE_BASIC);
+
+	nitems = mcvlist->nitems;
+	ndims = mcvlist->ndimensions;
+	itemsize = ITEM_SIZE(ndims);
+
+	Assert(nitems > 0);
+	Assert((ndims >= 2) && (ndims <= MVSTATS_MAX_DIMENSIONS));
+
+	/*
+	 * Compute the size we expect for these parameters. It's incomplete
+	 * at this point, as we have yet to add the sizes of the value
+	 * arrays (from the DimensionInfo records).
+	 */
+	expected_size = offsetof(MCVListData,items) +
+					ndims * sizeof(DimensionInfo) +
+					(nitems * itemsize);
+
+	/* check that we have at least the DimensionInfo records */
+	if (VARSIZE_ANY_EXHDR(data) < expected_size)
+		elog(ERROR, "invalid MCV Size %ld (expected %ld)",
+			 VARSIZE_ANY_EXHDR(data), expected_size);
+
+	info = (DimensionInfo*)(tmp);
+	tmp += ndims * sizeof(DimensionInfo);
+
+	/* account for the value arrays */
+	for (i = 0; i < ndims; i++)
+		expected_size += info[i].nbytes;
+
+	if (VARSIZE_ANY_EXHDR(data) != expected_size)
+		elog(ERROR, "invalid MCV Size %ld (expected %ld)",
+			 VARSIZE_ANY_EXHDR(data), expected_size);
+
+	/* looks OK - not corrupted or something */
+
+	/* let's parse the value arrays */
+	values = (Datum**)palloc0(sizeof(Datum*) * ndims);
+
+	/*
+	 * FIXME This uses pointers to the original data array (the types
+	 *       not passed by value), so when someone frees the memory,
+	 *       e.g. by doing something like this:
+	 *
+	 *           bytea * data = ... fetch the data from catalog ...
+	 *           MCVList mcvlist = deserialize_mcv_list(data);
+	 *           pfree(data);
+	 * 
+	 *       then 'mcvlist' references the freed memory. This needs to
+	 *       copy the pieces.
+	 */
+	for (i = 0; i < ndims; i++)
+	{
+		if (info[i].typbyval)
+		{
+			/* passed by value / Datum - simply reuse the array */
+			values[i] = (Datum*)tmp;
+			tmp += info[i].nbytes;
+		}
+		else if (info[i].typlen > 0)
+		{
+			/* passed by reference, but fixed length (name, tid, ...) */
+			values[i] = (Datum*)palloc0(sizeof(Datum) * info[i].nvalues);
+			for (j = 0; j < info[i].nvalues; j++)
+			{
+				/* just point into the array */
+				values[i][j] = PointerGetDatum(tmp);
+				tmp += info[i].typlen;
+			}
+		}
+		else if (info[i].typlen == -1)
+		{
+			/* varlena */
+			values[i] = (Datum*)palloc0(sizeof(Datum) * info[i].nvalues);
+			for (j = 0; j < info[i].nvalues; j++)
+			{
+				/* just point into the array */
+				values[i][j] = PointerGetDatum(tmp);
+				tmp += VARSIZE_ANY(tmp);
+			}
+		}
+		else if (info[i].typlen == -2)
+		{
+			/* cstring */
+			values[i] = (Datum*)palloc0(sizeof(Datum) * info[i].nvalues);
+			for (j = 0; j < info[i].nvalues; j++)
+			{
+				/* just point into the array */
+				values[i][j] = PointerGetDatum(tmp);
+				tmp += (strlen(tmp) + 1); /* don't forget the \0 */
+			}
+		}
+	}
+
+	/* allocate space for the MCV items */
+	mcvlist->items = (MCVItem*)palloc0(sizeof(MCVItem) * nitems);
+
+	for (i = 0; i < nitems; i++)
+	{
+		MCVItem item = (MCVItem)palloc0(sizeof(MCVItemData));
+
+		item->values = (Datum*)palloc0(sizeof(Datum)*ndims);
+		item->isnull = (bool*) palloc0(sizeof(bool) *ndims);
+
+		/* just point to the right place */
+		indexes = ITEM_INDEXES(tmp);
+
+		memcpy(item->isnull, ITEM_NULLS(tmp, ndims), sizeof(bool) * ndims);
+		memcpy(&item->frequency, ITEM_FREQUENCY(tmp, ndims), sizeof(double));
+
+		/* translate the values */
+		for (j = 0; j < ndims; j++)
+			if (! item->isnull[j])
+				item->values[j] = values[j][indexes[j]];
+
+		mcvlist->items[i] = item;
+
+		tmp += ITEM_SIZE(ndims);
+
+		Assert(tmp <= (char*)data + VARSIZE_ANY(data));
+	}
+
+	/* check that we processed all the data */
+	Assert(tmp == (char*)data + VARSIZE_ANY(data));
+
+	return mcvlist;
+}
+
+/*
+ * We need to pass the SortSupport to the comparator, but bsearch()
+ * has no 'context' parameter, so we use a global variable (ugly).
+ */
+static int
+bsearch_comparator(const void * a, const void * b)
+{
+	Assert(ssup_private != NULL);
+	return compare_scalars_simple(a, b, (void*)ssup_private);
+}
diff --git a/src/include/catalog/pg_mv_statistic.h b/src/include/catalog/pg_mv_statistic.h
index 76b7db7..f88e200 100644
--- a/src/include/catalog/pg_mv_statistic.h
+++ b/src/include/catalog/pg_mv_statistic.h
@@ -35,15 +35,21 @@ CATALOG(pg_mv_statistic,3281)
 
 	/* statistics requested to build */
 	bool		deps_enabled;		/* analyze dependencies? */
+	bool		mcv_enabled;		/* build MCV list? */
+
+	/* MCV size */
+	int32		mcv_max_items;		/* max MCV items */
 
 	/* statistics that are available (if requested) */
 	bool		deps_built;			/* dependencies were built */
+	bool		mcv_built;			/* MCV list was built */
 
 	/* variable-length fields start here, but we allow direct access to stakeys */
 	int2vector	stakeys;			/* array of column keys */
 
 #ifdef CATALOG_VARLEN
 	bytea		stadeps;			/* dependencies (serialized) */
+	bytea		stamcv;				/* MCV list (serialized) */
 #endif
 
 } FormData_pg_mv_statistic;
@@ -59,11 +65,15 @@ typedef FormData_pg_mv_statistic *Form_pg_mv_statistic;
  *		compiler constants for pg_attrdef
  * ----------------
  */
-#define Natts_pg_mv_statistic					5
+#define Natts_pg_mv_statistic					9
 #define Anum_pg_mv_statistic_starelid			1
 #define Anum_pg_mv_statistic_deps_enabled		2
-#define Anum_pg_mv_statistic_deps_built			3
-#define Anum_pg_mv_statistic_stakeys			4
-#define Anum_pg_mv_statistic_stadeps			5
+#define Anum_pg_mv_statistic_mcv_enabled		3
+#define Anum_pg_mv_statistic_mcv_max_items		4
+#define Anum_pg_mv_statistic_deps_built			5
+#define Anum_pg_mv_statistic_mcv_built			6
+#define Anum_pg_mv_statistic_stakeys			7
+#define Anum_pg_mv_statistic_stadeps			8
+#define Anum_pg_mv_statistic_stamcv				9
 
 #endif   /* PG_MV_STATISTIC_H */
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 9fb118a..b4e7b4f 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -2687,6 +2687,8 @@ DATA(insert OID = 3284 (  pg_mv_stats_dependencies_info     PGNSP PGUID 12 1 0 0
 DESCR("multivariate stats: functional dependencies info");
 DATA(insert OID = 3285 (  pg_mv_stats_dependencies_show     PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 25 "17" _null_ _null_ _null_ _null_ pg_mv_stats_dependencies_show _null_ _null_ _null_ ));
 DESCR("multivariate stats: functional dependencies show");
+DATA(insert OID = 3283 (  pg_mv_stats_mcvlist_info	PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 25 "17" _null_ _null_ _null_ _null_ pg_mv_stats_mcvlist_info _null_ _null_ _null_ ));
+DESCR("multi-variate statistics: MCV list info");
 
 DATA(insert OID = 1928 (  pg_stat_get_numscans			PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_numscans _null_ _null_ _null_ ));
 DESCR("statistics: number of scans done for table/index");
diff --git a/src/include/utils/mvstats.h b/src/include/utils/mvstats.h
index a074253..e11aefc 100644
--- a/src/include/utils/mvstats.h
+++ b/src/include/utils/mvstats.h
@@ -25,9 +25,11 @@ typedef struct MVStatsData {
 
 	/* statistics requested in ALTER TABLE ... ADD STATISTICS */
 	bool		deps_enabled;	/* analyze functional dependencies */
+	bool		mcv_enabled;	/* analyze MCV lists */
 
 	/* available statistics (computed by ANALYZE) */
 	bool		deps_built;	/* functional dependencies available */
+	bool		mcv_built;	/* MCV list is already available */
 } MVStatsData;
 
 typedef struct MVStatsData *MVStats;
@@ -66,6 +68,47 @@ typedef MVDependenciesData* MVDependencies;
 #define MVSTAT_DEPS_TYPE_BASIC	1			/* basic dependencies type */
 
 /*
+ * Multivariate MCV (most-common value) lists
+ *
+ * A straightforward extension of MCV items - i.e. a list (array) of
+ * combinations of attribute values, together with a frequency and
+ * null flags.
+ */
+typedef struct MCVItemData {
+	double		frequency;	/* frequency of this combination */
+	bool	   *isnull;		/* flags of NULL values (up to 32 columns) */
+	Datum	   *values;		/* variable-length (ndimensions) */
+} MCVItemData;
+
+typedef MCVItemData *MCVItem;
+
+/* multivariate MCV list - essentially an array of MCV items */
+typedef struct MCVListData {
+	uint32		magic;			/* magic constant marker */
+	uint32		type;			/* type of MCV list (BASIC) */
+	uint32		ndimensions;	/* number of dimensions */
+	uint32		nitems;			/* number of MCV items in the array */
+	MCVItem	   *items;			/* array of MCV items */
+} MCVListData;
+
+typedef MCVListData *MCVList;
+
+/* used to flag stats serialized to bytea */
+#define MVSTAT_MCV_MAGIC		0xE1A651C2	/* marks serialized bytea */
+#define MVSTAT_MCV_TYPE_BASIC	1			/* basic MCV list type */
+
+/*
+ * Limits used for mcv_max_items option, i.e. we're always guaranteed
+ * to have space for at least MVSTAT_MCVLIST_MIN_ITEMS, and we cannot
+ * have more than MVSTAT_MCVLIST_MAX_ITEMS items.
+ *
+ * This is just a boundary for the 'max' threshold - the actual list
+ * may of course contain fewer items than MVSTAT_MCVLIST_MIN_ITEMS.
+ */
+#define MVSTAT_MCVLIST_MIN_ITEMS	128		/* min items in MCV list */
+#define MVSTAT_MCVLIST_MAX_ITEMS	8192	/* max items in MCV list */
+
+/*
  * TODO Maybe fetching the histogram/MCV list separately is inefficient?
  *      Consider adding a single `fetch_stats` method, fetching all
  *      stats specified using flags (or something like that).
@@ -74,24 +117,39 @@ MVStats list_mv_stats(Oid relid, int *nstats, bool built_only);
 bytea * fetch_mv_rules(Oid mvoid);
 
 bytea * fetch_mv_dependencies(Oid mvoid);
+bytea * fetch_mv_mcvlist(Oid mvoid);
 
 bytea * serialize_mv_dependencies(MVDependencies dependencies);
+bytea * serialize_mv_mcvlist(MCVList mcvlist, int2vector *attrs,
+							 VacAttrStats **stats);
 
 /* deserialization of stats (serialization is private to analyze) */
 MVDependencies	deserialize_mv_dependencies(bytea * data);
+MCVList			deserialize_mv_mcvlist(bytea * data);
+
+/*
+ * Returns index of the attribute number within the vector (i.e. a
+ * dimension within the stats).
+ */
+int mv_get_index(AttrNumber varattno, int2vector * stakeys);
 
 /* FIXME this probably belongs somewhere else (not to operations stats) */
 extern Datum pg_mv_stats_dependencies_info(PG_FUNCTION_ARGS);
 extern Datum pg_mv_stats_dependencies_show(PG_FUNCTION_ARGS);
+extern Datum pg_mv_stats_mcvlist_info(PG_FUNCTION_ARGS);
 
 MVDependencies
-build_mv_dependencies(int numrows, HeapTuple *rows,
-								  int2vector *attrs,
-								  int natts, VacAttrStats **vacattrstats);
+build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs,
+					  VacAttrStats **stats);
+
+MCVList
+build_mv_mcvlist(int numrows, HeapTuple *rows, int2vector *attrs,
+				 VacAttrStats **stats, int *numrows_filtered);
 
 void build_mv_stats(Relation onerel, int numrows, HeapTuple *rows,
-						   int natts, VacAttrStats **vacattrstats);
+					int natts, VacAttrStats **vacattrstats);
 
-void update_mv_stats(Oid relid, MVDependencies dependencies);
+void update_mv_stats(Oid relid, MVDependencies dependencies, MCVList mcvlist,
+					 int2vector *attrs, VacAttrStats **stats);
 
 #endif
diff --git a/src/test/regress/expected/mv_mcv.out b/src/test/regress/expected/mv_mcv.out
new file mode 100644
index 0000000..fa298ea
--- /dev/null
+++ b/src/test/regress/expected/mv_mcv.out
@@ -0,0 +1,210 @@
+-- data type passed by value
+CREATE TABLE mcv_list (
+    a INT,
+    b INT,
+    c INT
+);
+-- unknown column
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (unknown_column);
+ERROR:  column "unknown_column" referenced in statistics does not exist
+-- single column
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a);
+ERROR:  multivariate stats require 2 or more columns
+-- single column, duplicated
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, a);
+ERROR:  duplicate column name in statistics definition
+-- two columns, one duplicated
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, a, b);
+ERROR:  duplicate column name in statistics definition
+-- unknown option
+ALTER TABLE mcv_list ADD STATISTICS (unknown_option) ON (a, b, c);
+ERROR:  unrecognized STATISTICS option "unknown_option"
+-- missing MCV statistics
+ALTER TABLE mcv_list ADD STATISTICS (dependencies, max_mcv_items 200) ON (a, b, c);
+ERROR:  option 'mcv' is required by other options(s)
+-- invalid mcv_max_items value / too low
+ALTER TABLE mcv_list ADD STATISTICS (mcv, max_mcv_items 10) ON (a, b, c);
+ERROR:  max number of MCV items must be at least 128
+-- invalid mcv_max_items value / too high
+ALTER TABLE mcv_list ADD STATISTICS (mcv, max_mcv_items 10000) ON (a, b, c);
+ERROR:  max number of MCV items is 8192
+-- correct command
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, b, c);
+-- random data
+INSERT INTO mcv_list
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | f         | 
+(1 row)
+
+TRUNCATE mcv_list;
+-- a => b, a => c, b => c
+INSERT INTO mcv_list
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=1000
+(1 row)
+
+TRUNCATE mcv_list;
+-- a => b, a => c
+INSERT INTO mcv_list
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=1000
+(1 row)
+
+TRUNCATE mcv_list;
+-- check explain (expect bitmap index scan, not plain index scan)
+INSERT INTO mcv_list 
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+CREATE INDEX mcv_idx ON mcv_list (a, b);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=100
+(1 row)
+
+EXPLAIN (COSTS off)
+ SELECT * FROM mcv_list WHERE a = 10 AND b = 5;
+                 QUERY PLAN                 
+--------------------------------------------
+ Bitmap Heap Scan on mcv_list
+   Recheck Cond: ((a = 10) AND (b = 5))
+   ->  Bitmap Index Scan on mcv_idx
+         Index Cond: ((a = 10) AND (b = 5))
+(4 rows)
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+DROP TABLE mcv_list;
+-- varlena type (text)
+CREATE TABLE mcv_list (
+    a TEXT,
+    b TEXT,
+    c TEXT
+);
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, b, c);
+-- random data
+INSERT INTO mcv_list
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | f         | 
+(1 row)
+
+TRUNCATE mcv_list;
+-- a => b, a => c, b => c
+INSERT INTO mcv_list
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=1000
+(1 row)
+
+TRUNCATE mcv_list;
+-- a => b, a => c
+INSERT INTO mcv_list
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=1000
+(1 row)
+
+TRUNCATE mcv_list;
+-- check explain (expect bitmap index scan, not plain index scan)
+INSERT INTO mcv_list 
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+CREATE INDEX mcv_idx ON mcv_list (a, b);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=100
+(1 row)
+
+EXPLAIN (COSTS off)
+ SELECT * FROM mcv_list WHERE a = '10' AND b = '5';
+                         QUERY PLAN                         
+------------------------------------------------------------
+ Bitmap Heap Scan on mcv_list
+   Recheck Cond: ((a = '10'::text) AND (b = '5'::text))
+   ->  Bitmap Index Scan on mcv_idx
+         Index Cond: ((a = '10'::text) AND (b = '5'::text))
+(4 rows)
+
+TRUNCATE mcv_list;
+-- check explain (expect bitmap index scan, not plain index scan) with NULLs
+INSERT INTO mcv_list 
+     SELECT
+       (CASE WHEN i/10000 = 0 THEN NULL ELSE i/10000 END),
+       (CASE WHEN i/20000 = 0 THEN NULL ELSE i/20000 END),
+       (CASE WHEN i/40000 = 0 THEN NULL ELSE i/40000 END)
+     FROM generate_series(1,1000000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=100
+(1 row)
+
+EXPLAIN (COSTS off)
+ SELECT * FROM mcv_list WHERE a IS NULL AND b IS NULL;
+                    QUERY PLAN                     
+---------------------------------------------------
+ Bitmap Heap Scan on mcv_list
+   Recheck Cond: ((a IS NULL) AND (b IS NULL))
+   ->  Bitmap Index Scan on mcv_idx
+         Index Cond: ((a IS NULL) AND (b IS NULL))
+(4 rows)
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+DROP TABLE mcv_list;
+-- NULL values (mix of int and text columns)
+CREATE TABLE mcv_list (
+    a INT,
+    b TEXT,
+    c INT,
+    d TEXT
+);
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, b, c, d);
+INSERT INTO mcv_list
+     SELECT
+         mod(i, 100),
+         (CASE WHEN mod(i, 200) = 0 THEN NULL ELSE mod(i,200) END),
+         mod(i, 400),
+         (CASE WHEN mod(i, 300) = 0 THEN NULL ELSE mod(i,600) END)
+     FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=1200
+(1 row)
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+DROP TABLE mcv_list;
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 82c2659..80375b8 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1357,7 +1357,9 @@ pg_mv_stats| SELECT n.nspname AS schemaname,
     c.relname AS tablename,
     s.stakeys AS attnums,
     length(s.stadeps) AS depsbytes,
-    pg_mv_stats_dependencies_info(s.stadeps) AS depsinfo
+    pg_mv_stats_dependencies_info(s.stadeps) AS depsinfo,
+    length(s.stamcv) AS mcvbytes,
+    pg_mv_stats_mcvlist_info(s.stamcv) AS mcvinfo
    FROM ((pg_mv_statistic s
      JOIN pg_class c ON ((c.oid = s.starelid)))
      LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace)));
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index c41762c..78c9b04 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -111,4 +111,4 @@ test: event_trigger
 test: stats
 
 # run tests of multivariate stats
-test: mv_dependencies
+test: mv_dependencies mv_mcv
diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule
index 3845b0f..3f9884f 100644
--- a/src/test/regress/serial_schedule
+++ b/src/test/regress/serial_schedule
@@ -153,3 +153,4 @@ test: xml
 test: event_trigger
 test: stats
 test: mv_dependencies
+test: mv_mcv
diff --git a/src/test/regress/sql/mv_mcv.sql b/src/test/regress/sql/mv_mcv.sql
new file mode 100644
index 0000000..090731e
--- /dev/null
+++ b/src/test/regress/sql/mv_mcv.sql
@@ -0,0 +1,181 @@
+-- data type passed by value
+CREATE TABLE mcv_list (
+    a INT,
+    b INT,
+    c INT
+);
+
+-- unknown column
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (unknown_column);
+
+-- single column
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a);
+
+-- single column, duplicated
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, a);
+
+-- two columns, one duplicated
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, a, b);
+
+-- unknown option
+ALTER TABLE mcv_list ADD STATISTICS (unknown_option) ON (a, b, c);
+
+-- missing MCV statistics
+ALTER TABLE mcv_list ADD STATISTICS (dependencies, max_mcv_items 200) ON (a, b, c);
+
+-- invalid mcv_max_items value / too low
+ALTER TABLE mcv_list ADD STATISTICS (mcv, max_mcv_items 10) ON (a, b, c);
+
+-- invalid mcv_max_items value / too high
+ALTER TABLE mcv_list ADD STATISTICS (mcv, max_mcv_items 10000) ON (a, b, c);
+
+-- correct command
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, b, c);
+
+-- random data
+INSERT INTO mcv_list
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+TRUNCATE mcv_list;
+
+-- a => b, a => c, b => c
+INSERT INTO mcv_list
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+TRUNCATE mcv_list;
+
+-- a => b, a => c
+INSERT INTO mcv_list
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+TRUNCATE mcv_list;
+
+-- check explain (expect bitmap index scan, not plain index scan)
+INSERT INTO mcv_list 
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+CREATE INDEX mcv_idx ON mcv_list (a, b);
+
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+EXPLAIN (COSTS off)
+ SELECT * FROM mcv_list WHERE a = 10 AND b = 5;
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+DROP TABLE mcv_list;
+
+-- varlena type (text)
+CREATE TABLE mcv_list (
+    a TEXT,
+    b TEXT,
+    c TEXT
+);
+
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, b, c);
+
+-- random data
+INSERT INTO mcv_list
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+TRUNCATE mcv_list;
+
+-- a => b, a => c, b => c
+INSERT INTO mcv_list
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+TRUNCATE mcv_list;
+
+-- a => b, a => c
+INSERT INTO mcv_list
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+TRUNCATE mcv_list;
+
+-- check explain (expect bitmap index scan, not plain index scan)
+INSERT INTO mcv_list 
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+CREATE INDEX mcv_idx ON mcv_list (a, b);
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+EXPLAIN (COSTS off)
+ SELECT * FROM mcv_list WHERE a = '10' AND b = '5';
+
+TRUNCATE mcv_list;
+
+-- check explain (expect bitmap index scan, not plain index scan) with NULLs
+INSERT INTO mcv_list 
+     SELECT
+       (CASE WHEN i/10000 = 0 THEN NULL ELSE i/10000 END),
+       (CASE WHEN i/20000 = 0 THEN NULL ELSE i/20000 END),
+       (CASE WHEN i/40000 = 0 THEN NULL ELSE i/40000 END)
+     FROM generate_series(1,1000000) s(i);
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+EXPLAIN (COSTS off)
+ SELECT * FROM mcv_list WHERE a IS NULL AND b IS NULL;
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+DROP TABLE mcv_list;
+
+-- NULL values (mix of int and text columns)
+CREATE TABLE mcv_list (
+    a INT,
+    b TEXT,
+    c INT,
+    d TEXT
+);
+
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, b, c, d);
+
+INSERT INTO mcv_list
+     SELECT
+         mod(i, 100),
+         (CASE WHEN mod(i, 200) = 0 THEN NULL ELSE mod(i,200) END),
+         mod(i, 400),
+         (CASE WHEN mod(i, 300) = 0 THEN NULL ELSE mod(i,600) END)
+     FROM generate_series(1,10000) s(i);
+
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+DROP TABLE mcv_list;
-- 
2.0.5

