>From 13c3d4cbe85bbbe6b9509de15dd08384df1df97f Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tv@fuzzy.cz>
Date: Sun, 11 Jan 2015 20:15:37 +0100
Subject: [PATCH 3/5] multivariate MCV lists

- extends the pg_mv_statistic catalog (add 'mcv' fields)
- building the MCV lists during ANALYZE
- simple estimation while planning the queries

Includes regression tests, mostly equal to regression tests for
functional dependencies.
---
 src/backend/catalog/system_views.sql   |    4 +-
 src/backend/commands/tablecmds.c       |   47 +-
 src/backend/optimizer/path/clausesel.c | 1153 ++++++++++++++++++++++++++++++--
 src/backend/utils/mvstats/Makefile     |    2 +-
 src/backend/utils/mvstats/common.c     |   58 +-
 src/backend/utils/mvstats/common.h     |   11 +-
 src/backend/utils/mvstats/mcv.c        | 1002 +++++++++++++++++++++++++++
 src/include/catalog/pg_mv_statistic.h  |   18 +-
 src/include/catalog/pg_proc.h          |    2 +
 src/include/utils/mvstats.h            |   68 +-
 src/test/regress/expected/mv_mcv.out   |  210 ++++++
 src/test/regress/expected/rules.out    |    4 +-
 src/test/regress/parallel_schedule     |    2 +-
 src/test/regress/serial_schedule       |    1 +
 src/test/regress/sql/mv_mcv.sql        |  181 +++++
 15 files changed, 2662 insertions(+), 101 deletions(-)
 create mode 100644 src/backend/utils/mvstats/mcv.c
 create mode 100644 src/test/regress/expected/mv_mcv.out
 create mode 100644 src/test/regress/sql/mv_mcv.sql

diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index d05a716..4538e63 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -156,7 +156,9 @@ CREATE VIEW pg_mv_stats AS
         C.relname AS tablename,
         S.stakeys AS attnums,
         length(S.stadeps) as depsbytes,
-        pg_mv_stats_dependencies_info(S.stadeps) as depsinfo
+        pg_mv_stats_dependencies_info(S.stadeps) as depsinfo,
+        length(S.stamcv) AS mcvbytes,
+        pg_mv_stats_mcvlist_info(S.stamcv) AS mcvinfo
     FROM (pg_mv_statistic S JOIN pg_class C ON (C.oid = S.starelid))
         LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace);
 
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 965d342..fae0fc7 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -11901,7 +11901,13 @@ static void ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
 	Relation	mvstatrel;
 
 	/* by default build everything */
-	bool 	build_dependencies = false;
+	bool 	build_dependencies = false,
+			build_mcv = false;
+
+	int32 	max_mcv_items = -1;
+
+	/* options required because of other options */
+	bool	require_mcv = false;
 
 	Assert(IsA(def, StatisticsDef));
 
@@ -11956,6 +11962,29 @@ static void ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
 
 		if (strcmp(opt->defname, "dependencies") == 0)
 			build_dependencies = defGetBoolean(opt);
+		else if (strcmp(opt->defname, "mcv") == 0)
+			build_mcv = defGetBoolean(opt);
+		else if (strcmp(opt->defname, "max_mcv_items") == 0)
+		{
+			max_mcv_items = defGetInt32(opt);
+
+			/* this option requires 'mcv' to be enabled */
+			require_mcv = true;
+
+			/* sanity check */
+			if (max_mcv_items < MVSTAT_MCVLIST_MIN_ITEMS)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("max number of MCV items must be at least %d",
+								MVSTAT_MCVLIST_MIN_ITEMS)));
+
+			else if (max_mcv_items > MVSTAT_MCVLIST_MAX_ITEMS)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("max number of MCV items is %d",
+								MVSTAT_MCVLIST_MAX_ITEMS)));
+
+		}
 		else
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
@@ -11964,10 +11993,16 @@ static void ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
 	}
 
 	/* check that at least some statistics were requested */
-	if (! build_dependencies)
+	if (! (build_dependencies || build_mcv))
 		ereport(ERROR,
 				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("no statistics type (dependencies) was requested")));
+				 errmsg("no statistics type (dependencies, mcv) was requested")));
+
+	/* now do some checking of the options */
+	if (require_mcv && (! build_mcv))
+		ereport(ERROR,
+				(errcode(ERRCODE_SYNTAX_ERROR),
+				 errmsg("option 'mcv' is required by other options(s)")));
 
 	/* sort the attnums and build int2vector */
 	qsort(attnums, numcols, sizeof(int16), compare_int16);
@@ -11983,9 +12018,13 @@ static void ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
 	values[Anum_pg_mv_statistic_starelid-1] = ObjectIdGetDatum(RelationGetRelid(rel));
 
 	values[Anum_pg_mv_statistic_stakeys -1] = PointerGetDatum(stakeys);
+
 	values[Anum_pg_mv_statistic_deps_enabled -1] = BoolGetDatum(build_dependencies);
+	values[Anum_pg_mv_statistic_mcv_enabled   -1] = BoolGetDatum(build_mcv);
+	values[Anum_pg_mv_statistic_mcv_max_items    -1] = Int32GetDatum(max_mcv_items);
 
-	nulls[Anum_pg_mv_statistic_stadeps -1] = true;
+	nulls[Anum_pg_mv_statistic_stadeps -1]  = true;
+	nulls[Anum_pg_mv_statistic_stamcv   -1] = true;
 
 	/* insert the tuple into pg_mv_statistic */
 	mvstatrel = heap_open(MvStatisticRelationId, RowExclusiveLock);
diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c
index e742827..d24aedf 100644
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -20,6 +20,7 @@
 #include "optimizer/cost.h"
 #include "optimizer/pathnode.h"
 #include "optimizer/plancat.h"
+#include "optimizer/var.h"
 #include "utils/fmgroids.h"
 #include "utils/lsyscache.h"
 #include "utils/selfuncs.h"
@@ -50,17 +51,46 @@ typedef struct RangeQueryClause
 static void addRangeClause(RangeQueryClause **rqlist, Node *clause,
 			   bool varonleft, bool isLTsel, Selectivity s2);
 
+#define		MV_CLAUSE_TYPE_FDEP		0x01
+#define		MV_CLAUSE_TYPE_MCV		0x02
 
 static bool clause_is_mv_compatible(PlannerInfo *root, Node *clause, Oid varRelid,
-							 Oid *relid, AttrNumber *attnum, SpecialJoinInfo *sjinfo);
+							 Oid *relid, Bitmapset **attnums, SpecialJoinInfo *sjinfo,
+							 int type);
 
 static Bitmapset  *collect_mv_attnums(PlannerInfo *root, List *clauses,
-									  Oid varRelid, Oid *relid, SpecialJoinInfo *sjinfo);
+									  Oid varRelid, Oid *relid, SpecialJoinInfo *sjinfo,
+									  int type);
 
 static List *clauselist_apply_dependencies(PlannerInfo *root, List *clauses,
 								Oid varRelid, int nmvstats, MVStats mvstats,
 								SpecialJoinInfo *sjinfo);
 
+static int choose_mv_statistics(int nmvstats, MVStats mvstats,
+								Bitmapset *attnums);
+static List *clauselist_mv_split(PlannerInfo *root, SpecialJoinInfo *sjinfo,
+								 List *clauses, Oid varRelid,
+								 List **mvclauses, MVStats mvstats, int types);
+
+static Selectivity clauselist_mv_selectivity(PlannerInfo *root,
+						List *clauses, MVStats mvstats);
+static Selectivity clauselist_mv_selectivity_mcvlist(PlannerInfo *root,
+									List *clauses, MVStats mvstats,
+									bool *fullmatch, Selectivity *lowsel);
+
+static int update_match_bitmap_mcvlist(PlannerInfo *root, List *clauses,
+									int2vector *stakeys, MCVList mcvlist,
+									int nmatches, char * matches,
+									Selectivity *lowsel, bool *fullmatch,
+									bool is_or);
+
+/* used for merging bitmaps - AND (min), OR (max) */
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+#define UPDATE_RESULT(m,r,or)	\
+	(m) = (or) ? (MAX(m,r)) : (MIN(m,r))
+
 /****************************************************************************
  *		ROUTINES TO COMPUTE SELECTIVITIES
  ****************************************************************************/
@@ -197,15 +227,19 @@ clauselist_selectivity(PlannerInfo *root,
 	Bitmapset  *mvattnums = NULL;
 
 	/*
-	 * If there's exactly one clause, then no use in trying to match up pairs,
-	 * so just go directly to clause_selectivity().
+	 * If there's exactly one clause, then no use in trying to match up
+	 * pairs, so just go directly to clause_selectivity().
 	 */
 	if (list_length(clauses) == 1)
 		return clause_selectivity(root, (Node *) linitial(clauses),
 								  varRelid, jointype, sjinfo);
 
-	/* collect attributes referenced by mv-compatible clauses */
-	mvattnums = collect_mv_attnums(root, clauses, varRelid, &relid, sjinfo);
+	/*
+	 * Collect attributes referenced by mv-compatible clauses (looking
+	 * for clauses compatible with functional dependencies for now).
+	 */
+	mvattnums = collect_mv_attnums(root, clauses, varRelid, &relid, sjinfo,
+								   MV_CLAUSE_TYPE_FDEP);
 
 	/*
 	 * If there are mv-compatible clauses, referencing at least two
@@ -227,6 +261,49 @@ clauselist_selectivity(PlannerInfo *root,
 	}
 
 	/*
+	 * Recollect attributes from mv-compatible clauses (maybe we've
+	 * removed so many clauses we have a single mv-compatible attnum).
+	 * From now on we're only interested in MCV-compatible clauses.
+	 */
+	mvattnums = collect_mv_attnums(root, clauses, varRelid, &relid, sjinfo,
+								   MV_CLAUSE_TYPE_MCV);
+
+	/*
+	 * If there still are at least two columns, we'll try to select
+	 * a suitable multivariate stats.
+	 */
+	if (bms_num_members(mvattnums) >= 2)
+	{
+		/* fetch info from the catalog (not the serialized stats yet) */
+		mvstats = list_mv_stats(relid, &nmvstats, true);
+
+		/* see choose_mv_statistics() for details */
+		if (nmvstats > 0)
+		{
+			int idx = choose_mv_statistics(nmvstats, mvstats, mvattnums);
+
+			if (idx >= 0)	/* we have a matching stats */
+			{
+				MVStats mvstat = &mvstats[idx];
+
+				/* clauses compatible with multi-variate stats */
+				List	*mvclauses = NIL;
+
+				/* split the clauselist into regular and mv-clauses */
+				clauses = clauselist_mv_split(root, sjinfo, clauses,
+										varRelid, &mvclauses, mvstat,
+										MV_CLAUSE_TYPE_MCV);
+
+				/* we've chosen the histogram to match the clauses */
+				Assert(mvclauses != NIL);
+
+				/* compute the multivariate stats */
+				s1 *= clauselist_mv_selectivity(root, mvclauses, mvstat);
+			}
+		}
+	}
+
+	/*
 	 * Initial scan over clauses.  Anything that doesn't look like a potential
 	 * rangequery clause gets multiplied into s1 and forgotten. Anything that
 	 * does gets inserted into an rqlist entry.
@@ -901,12 +978,198 @@ clause_selectivity(PlannerInfo *root,
 	return s1;
 }
 
+
+/*
+ * Estimate selectivity for the list of MV-compatible clauses, using that
+ * particular histogram.
+ *
+ * When we hit a single bucket, we don't know what portion of it actually
+ * matches the clauses (e.g. equality), and we use 1/2 the bucket by
+ * default. However, the MV histograms are usually less detailed than
+ * the per-column ones, meaning the sum of buckets is often quite high
+ * (thanks to combining a lot of "partially hit" buckets).
+ *
+ * There are several ways to improve this, usually with cases when it
+ * won't really help. Also, the more complex the process, the worse
+ * the failures (i.e. misestimates).
+ *
+ * (1) Use the MV histogram only as a way to combine multiple
+ *     per-column histograms, essentially rewriting
+ *
+ *       P(A & B) = P(A) * P(B|A)
+ *
+ *     where P(B|A) may be computed using a proper "slice" of the
+ *     histogram, by first selecting only buckets where A is true, and
+ *     then using the boundaries to 'restrict' the per-colunm histogram.
+ *
+ *     With more clauses, it gets more complicated, of course
+ *
+ *       P(A & B & C) = P(A & C) * P(B|A & C)
+ *                    = P(A) * P(C|A) * P(B|A & C)
+ *
+ *     and so on.
+ *
+ *     Of course, the question is how well and efficiently we can
+ *     compute the conditional probabilities - whether this approach
+ *     can improve the estimates (instead of amplifying the errors).
+ *
+ *     Also, this does not eliminate the need for histogram on [A,B,C].
+ *
+ * (2) Use multiple smaller (and more accurate) histograms, and combine
+ *     them using a process similar to the above. E.g. by assuming that
+ *     B and C are independent, we can rewrite
+ *
+ *       P(B|A & C) = P(B|A)
+ *
+ *     so we can rewrite the whole formula to
+ *
+ *       P(A & B & C) = P(A) * P(C|A) * P(B|A)
+ *
+ *     and we're OK with two 2D histograms [A,C] and [A,B].
+ *
+ *     It'd be nice to perform some sort of statistical test (Fisher
+ *     or another chi-squared test) to identify independent components
+ *     and automatically separate them into smaller histograms.
+ *
+ * (3) Using the estimated number of distinct values in a bucket to
+ *     decide the selectivity of equality in the bucket (instead of
+ *     blindly using 1/2 of the bucket, we may use 1/ndistinct).
+ *     Of course, if the ndistinct estimate is way off, or when the
+ *     distribution is not uniform (one distict items get much more
+ *     items), this will fail. Also, we currently don't have ndistinct
+ *     estimate available at this moment (but it shouldn't be that
+ *     difficult to compute as ndistinct and ntuples should be available).
+ *
+ * TODO Clamp the selectivity by min of the per-clause selectivities
+ *      (i.e. the selectivity of the most restrictive clause), because
+ *      that's the maximum we can ever get from ANDed list of clauses.
+ *      This may probably prevent issues with hitting too many buckets
+ *      and low precision histograms.
+ *
+ * TODO We may support some additional conditions, most importantly
+ *      those matching multiple columns (e.g. "a = b" or "a < b").
+ *      Ultimately we could track multi-table histograms for join
+ *      cardinality estimation.
+ *
+ * TODO Currently this is only estimating all clauses, or clauses
+ *      matching varRelid (when it's not 0). I'm not sure what's the
+ *      purpose of varRelid, but my assumption is this is used for
+ *      join conditions and such. In that case we can use those clauses
+ *      to restrict the other (i.e. filter the histogram buckets first,
+ *      before estimating the other clauses). This is essentially equal
+ *      to computing P(A|B) where "B" are the clauses not matching the
+ *      varRelid.
+ *
+ * TODO Further thoughts on processing equality clauses - maybe it'd be
+ *      better to look for stats (with MCV) covered by the equality
+ *      clauses, because then we have a chance to find an exact match
+ *      in the MCV list, which is pretty much the best we can do. We may
+ *      also look at the least frequent MCV item, and use it as a upper
+ *      boundary for the selectivity (had there been a more frequent
+ *      item, it'd be in the MCV list).
+ *
+ *      These conditions may then be used as a condition for the other
+ *      selectivities, i.e. we may estimate P(A,B) first, and then
+ *      compute P(C|A,B) from another histogram. This may be useful when
+ *      we can estimate P(A,B) accurately (e.g. because it's a complete
+ *      equality match evaluated on MCV list), and then compute the
+ *      conditional probability P(C|A,B), giving us the requested stats
+ *
+ *          P(A,B,C) = P(A,B) * P(C|A,B)
+ *
+ * TODO There are several options for 'sanity clamping' the estimates.
+ *
+ *      First, if we have selectivities for each condition, then
+ *
+ *          P(A,B) <= MIN(P(A), P(B))
+ *
+ *      Because additional conditions (connected by AND) can only lower
+ *      the probability.
+ *
+ *      So we can do some basic sanity checks using the single-variate
+ *      stats (the ones we have right now).
+ *
+ *      Second, when we have multivariate stats with a MCV list, then
+ *
+ *      (a) if we have a full equality condition (one equality condition
+ *          on each column) and we found a match in the MCV list, this is
+ *          the selectivity (and it's supposed to be exact)
+ *
+ *      (b) if we have a full equality condition and we haven't found a
+ *          match in the MCV list, then the selectivity is below the
+ *          lowest selectivity in the MCV list
+ *
+ *      (c) if we have a equality condition (not full), we can still
+ *          search the MCV for matches and use the sum of probabilities
+ *          as a lower boundary for the histogram (if there are no
+ *          matches in the MCV list, then we have no boundary)
+ *
+ *      Third, if there are multiple multivariate stats for a set of
+ *      clauses, we may compute all of them and then somehow aggregate
+ *      them - e.g. by choosing the minimum, median or average. The
+ *      multi-variate stats are susceptible to overestimation (because
+ *      we take 50% of the bucket for partial matches). Some stats may
+ *      give better estimates than others, but it's very difficult to
+ *      say determine that in advance which one is the best (it depends
+ *      on the number of buckets, number of additional columns not
+ *      referenced in the clauses etc.) so we may compute all and then
+ *      choose a sane aggregation (minimum seems like a good approach).
+ *      Of course, this may result in longer / more expensive estimation
+ *      (CPU-wise), but it may be worth it.
+ *
+ *      There are ways to address this, though. First, it's possible to
+ *      add a GUC choosing whether to do a 'simple' (using a single
+ *      stats expected to give the best estimate) and 'complex' (combining
+ *      the multiple estimates).
+ *
+ *          multivariate_estimates = (simple|full)
+ *
+ *      Also, this might be enabled at a table level, by something like
+ *
+ *          ALTER TABLE ... SET STATISTICS (simple|full)
+ *
+ *      Which would make it possible to use this only for the tables
+ *      where the simple approach does not work.
+ *
+ *      Also, there are ways to optimize this algorithmically. E.g. we
+ *      may try to get an estimate from a matching MCV list first, and
+ *      if we happen to get a "full equality match" we may stop computing
+ *      the estimates from other stats (for this condition) because
+ *      that's probably the best estimate we can really get.
+ *
+ * TODO When applying the clauses to the histogram/MCV list, we can do
+ *      that from the most selective clauses first, because that'll
+ *      eliminate the buckets/items sooner (so we'll be able to skip
+ *      them without inspection, which is more expensive).
+ */
+static Selectivity
+clauselist_mv_selectivity(PlannerInfo *root, List *clauses, MVStats mvstats)
+{
+	bool fullmatch = false;
+
+	/*
+	 * Lowest frequency in the MCV list (may be used as an upper bound
+	 * for full equality conditions that did not match any MCV item).
+	 */
+	Selectivity mcv_low = 0.0;
+
+	/* TODO Evaluate simple 1D selectivities, use the smallest one as
+	 *      an upper bound, product as lower bound, and sort the
+	 *      clauses in ascending order by selectivity (to optimize the
+	 *      MCV/histogram evaluation).
+	 */
+
+	/* Evaluate the MCV selectivity */
+	return clauselist_mv_selectivity_mcvlist(root, clauses, mvstats,
+										   &fullmatch, &mcv_low);
+}
+
 /*
  * Collect attributes from mv-compatible clauses.
  */
 static Bitmapset *
 collect_mv_attnums(PlannerInfo *root, List *clauses, Oid varRelid,
-				   Oid *relid, SpecialJoinInfo *sjinfo)
+				   Oid *relid, SpecialJoinInfo *sjinfo, int types)
 {
 	Bitmapset  *attnums = NULL;
 	ListCell   *l;
@@ -922,12 +1185,11 @@ collect_mv_attnums(PlannerInfo *root, List *clauses, Oid varRelid,
 	 */
 	foreach (l, clauses)
 	{
-		AttrNumber attnum;
 		Node	   *clause = (Node *) lfirst(l);
 
-		/* ignore the result for now - we only need the info */
-		if (clause_is_mv_compatible(root, clause, varRelid, relid, &attnum, sjinfo))
-			attnums = bms_add_member(attnums, attnum);
+		/* ignore the result here - we only need the attnums */
+		clause_is_mv_compatible(root, clause, varRelid, relid, &attnums,
+								sjinfo, types);
 	}
 
 	/*
@@ -946,6 +1208,180 @@ collect_mv_attnums(PlannerInfo *root, List *clauses, Oid varRelid,
 }
 
 /*
+ * We're looking for statistics matching at least 2 attributes,
+ * referenced in the clauses compatible with multivariate statistics.
+ * The current selection criteria is very simple - we choose the
+ * statistics referencing the most attributes.
+ *
+ * If there are multiple statistics referencing the same number of
+ * columns (from the clauses), the one with less source columns
+ * (as listed in the ADD STATISTICS when creating the statistics) wins.
+ * Other wise the first one wins.
+ *
+ * This is a very simple criteria, and has several weaknesses:
+ *
+ * (a) does not consider the accuracy of the statistics
+ *
+ *     If there are two histograms built on the same set of columns,
+ *     but one has 100 buckets and the other one has 1000 buckets (thus
+ *     likely providing better estimates), this is not currently
+ *     considered.
+ *
+ * (b) does not consider the type of statistics
+ *
+ *     If there are three statistics - one containing just a MCV list,
+ *     another one with just a histogram and a third one with both,
+ *     this is not considered.
+ *
+ * (c) does not consider the number of clauses
+ *
+ *     As explained, only the number of referenced attributes counts,
+ *     so if there are multiple clauses on a single attribute, this
+ *     still counts as a single attribute.
+ *
+ * (d) does not consider type of condition
+ *
+ *     Some clauses may work better with some statistics - for example
+ *     equality clauses probably work better with MCV lists than with
+ *     histograms. But IS [NOT] NULL conditions may often work better
+ *     with histograms (thanks to NULL-buckets).
+ *
+ * So for example with five WHERE conditions
+ *
+ *     WHERE (a = 1) AND (b = 1) AND (c = 1) AND (d = 1) AND (e = 1)
+ *
+ * and statistics on (a,b), (a,b,e) and (a,b,c,d), the last one will be
+ * selected as it references the most columns.
+ *
+ * Once we have selected the multivariate statistics, we split the list
+ * of clauses into two parts - conditions that are compatible with the
+ * selected stats, and conditions are estimated using simple statistics.
+ *
+ * From the example above, conditions
+ *
+ *     (a = 1) AND (b = 1) AND (c = 1) AND (d = 1)
+ *
+ * will be estimated using the multivariate statistics (a,b,c,d) while
+ * the last condition (e = 1) will get estimated using the regular ones.
+ *
+ * There are various alternative selection criteria (e.g. counting
+ * conditions instead of just referenced attributes), but eventually
+ * the best option should be to combine multiple statistics. But that's
+ * much harder to do correctly.
+ *
+ * TODO Select multiple statistics and combine them when computing
+ *      the estimate.
+ *
+ * TODO This will probably have to consider compatibility of clauses,
+ *      because 'dependencies' will probably work only with equality
+ *      clauses.
+ */
+static int
+choose_mv_statistics(int nmvstats, MVStats mvstats, Bitmapset *attnums)
+{
+	int i, j;
+
+	int choice = -1;
+	int current_matches = 1;					/* goal #1: maximize */
+	int current_dims = (MVSTATS_MAX_DIMENSIONS+1);	/* goal #2: minimize */
+
+	/*
+	 * Walk through the statistics (simple array with nmvstats elements)
+	 * and for each one count the referenced attributes (encoded in
+	 * the 'attnums' bitmap).
+	 */
+	for (i = 0; i < nmvstats; i++)
+	{
+		/* columns matching this statistics */
+		int matches = 0;
+
+		int2vector * attrs = mvstats[i].stakeys;
+		int	numattrs = mvstats[i].stakeys->dim1;
+
+		/* count columns covered by the histogram */
+		for (j = 0; j < numattrs; j++)
+			if (bms_is_member(attrs->values[j], attnums))
+				matches++;
+
+		/*
+		 * Use this statistics when it improves the number of matches or
+		 * when it matches the same number of attributes but is smaller.
+		 */
+		if ((matches > current_matches) ||
+			((matches == current_matches) && (current_dims > numattrs)))
+		{
+			choice = i;
+			current_matches = matches;
+			current_dims = numattrs;
+		}
+	}
+
+	return choice;
+}
+
+
+/*
+ * This splits the clauses list into two parts - one containing clauses
+ * that will be evaluated using the chosen statistics, and the remaining
+ * clauses (either non-mvcompatible, or not related to the histogram).
+ */
+static List *
+clauselist_mv_split(PlannerInfo *root, SpecialJoinInfo *sjinfo,
+					List *clauses, Oid varRelid, List **mvclauses,
+					MVStats mvstats, int types)
+{
+	int i;
+	ListCell *l;
+	List	 *non_mvclauses = NIL;
+
+	/* FIXME is there a better way to get info on int2vector? */
+	int2vector * attrs = mvstats->stakeys;
+	int	numattrs = mvstats->stakeys->dim1;
+
+	Bitmapset *mvattnums = NULL;
+
+	/* build bitmap of attributes covered by the stats, so we can
+	 * do bms_is_subset later */
+	for (i = 0; i < numattrs; i++)
+		mvattnums = bms_add_member(mvattnums, attrs->values[i]);
+
+	/* erase the list of mv-compatible clauses */
+	*mvclauses = NIL;
+
+	foreach (l, clauses)
+	{
+		bool		match = false;	/* by default not mv-compatible */
+		Bitmapset	*attnums = NULL;
+		Node	   *clause = (Node *) lfirst(l);
+
+		if (clause_is_mv_compatible(root, clause, varRelid, NULL,
+									&attnums, sjinfo, types))
+		{
+			/* are all the attributes part of the selected stats? */
+			if (bms_is_subset(attnums, mvattnums))
+				match = true;
+		}
+
+		/*
+		 * The clause matches the selected stats, so put it to the list
+		 * of mv-compatible clauses. Otherwise, keep it in the list of
+		 * 'regular' clauses (that may be selected later).
+		 */
+		if (match)
+			*mvclauses = lappend(*mvclauses, clause);
+		else
+			non_mvclauses = lappend(non_mvclauses, clause);
+	}
+
+	/*
+	 * Perform regular estimation using the clauses incompatible
+	 * with the chosen histogram (or MV stats in general).
+	 */
+	return non_mvclauses;
+
+}
+
+/*
  * Determines whether the clause is compatible with multivariate stats,
  * and if it is, returns some additional information - varno (index
  * into simple_rte_array) and a bitmap of attributes. This is then
@@ -964,96 +1400,205 @@ collect_mv_attnums(PlannerInfo *root, List *clauses, Oid varRelid,
  */
 static bool
 clause_is_mv_compatible(PlannerInfo *root, Node *clause, Oid varRelid,
-						Oid *relid, AttrNumber *attnum, SpecialJoinInfo *sjinfo)
+						Oid *relid, Bitmapset **attnums, SpecialJoinInfo *sjinfo,
+						int types)
 {
+	Relids clause_relids;
+	Relids left_relids;
+	Relids right_relids;
 
 	if (IsA(clause, RestrictInfo))
 	{
 		RestrictInfo *rinfo = (RestrictInfo *) clause;
 
-		/* Pseudoconstants are not really interesting here. */
-		if (rinfo->pseudoconstant)
+		if (! IsA(clause, RestrictInfo))
+		{
+			elog(WARNING, "expected RestrictInfo, got type %d", clause->type);
 			return false;
+		}
 
-		/* no support for OR clauses at this point */
-		if (rinfo->orclause)
+		/* Pseudoconstants are not really interesting here. */
+		if (rinfo->pseudoconstant)
 			return false;
 
 		/* get the actual clause from the RestrictInfo (it's not an OR clause) */
 		clause = (Node*)rinfo->clause;
 
-		/* only simple opclauses are compatible with multivariate stats */
-		if (! is_opclause(clause))
-			return false;
-
 		/* we don't support join conditions at this moment */
 		if (treat_as_join_clause(clause, rinfo, varRelid, sjinfo))
 			return false;
 
+		clause_relids = rinfo->clause_relids;
+		left_relids = rinfo->left_relids;
+		right_relids = rinfo->right_relids;
+	}
+	else if (is_opclause(clause) && list_length(((OpExpr *) clause)->args) == 2)
+	{
+		left_relids = pull_varnos(get_leftop((Expr*)clause));
+		right_relids = pull_varnos(get_rightop((Expr*)clause));
+
+		clause_relids = bms_union(left_relids,
+								  right_relids);
+	}
+	else
+	{
+		/* Not a binary opclause, so mark left/right relid sets as empty */
+		left_relids = NULL;
+		right_relids = NULL;
+		/* and get the total relid set the hard way */
+		clause_relids = pull_varnos((Node *) clause);
+	}
+
+	/*
+	 * Only simple opclauses and IS NULL tests are compatible with
+	 * multivariate stats at this point.
+	 */
+	if ((is_opclause(clause))
+		&& (list_length(((OpExpr *) clause)->args) == 2))
+	{
+		OpExpr	   *expr = (OpExpr *) clause;
+		bool		varonleft = true;
+		bool		ok;
+
 		/* is it 'variable op constant' ? */
-		if (list_length(((OpExpr *) clause)->args) == 2)
-		{
-			OpExpr	   *expr = (OpExpr *) clause;
-			bool		varonleft = true;
-			bool		ok;
 
-			ok = (bms_membership(rinfo->clause_relids) == BMS_SINGLETON) &&
-				(is_pseudo_constant_clause_relids(lsecond(expr->args),
-												rinfo->right_relids) ||
-				(varonleft = false,
-				is_pseudo_constant_clause_relids(linitial(expr->args),
-												rinfo->left_relids)));
+		ok = (bms_membership(clause_relids) == BMS_SINGLETON) &&
+			(is_pseudo_constant_clause_relids(lsecond(expr->args),
+											  right_relids) ||
+			(varonleft = false,
+			is_pseudo_constant_clause_relids(linitial(expr->args),
+											 left_relids)));
 
-			if (ok)
-			{
-				RangeTblEntry * rte;
-				Var * var = (varonleft) ? linitial(expr->args) : lsecond(expr->args);
+		if (ok)
+		{
+			RangeTblEntry * rte;
+			Var * var = (varonleft) ? linitial(expr->args) : lsecond(expr->args);
 
-				/*
-				 * Simple variables only - otherwise the planner_rt_fetch seems to fail
-				 * (return NULL).
-				 *
-				 * TODO Maybe use examine_variable() would fix that?
-				 */
-				if (! (IsA(var, Var) && (varRelid == 0 || varRelid == var->varno)))
-					return false;
+			/*
+			 * Simple variables only - otherwise the planner_rt_fetch seems to fail
+			 * (return NULL).
+			 *
+			 * TODO Maybe use examine_variable() would fix that?
+			 */
+			if (! (IsA(var, Var) && (varRelid == 0 || varRelid == var->varno)))
+				return false;
 
-				/*
-				 * Only consider this variable if (varRelid == 0) or when the varno
-				 * matches varRelid (see explanation at clause_selectivity).
-				 *
-				 * FIXME I suspect this may not be really necessary. The (varRelid == 0)
-				 *       part seems to be enforced by treat_as_join_clause().
-				 */
-				if (! ((varRelid == 0) || (varRelid == var->varno)))
-					return false;
+			/*
+			 * Only consider this variable if (varRelid == 0) or when the varno
+			 * matches varRelid (see explanation at clause_selectivity).
+			 *
+			 * FIXME I suspect this may not be really necessary. The (varRelid == 0)
+			 *       part seems to be enforced by treat_as_join_clause().
+			 */
+			if (! ((varRelid == 0) || (varRelid == var->varno)))
+				return false;
 
-				/* Also skip special varno values, and system attributes ... */
-				if ((IS_SPECIAL_VARNO(var->varno)) || (! AttrNumberIsForUserDefinedAttr(var->varattno)))
-					return false;
+			/* Also skip special varno values, and system attributes ... */
+			if ((IS_SPECIAL_VARNO(var->varno)) || (! AttrNumberIsForUserDefinedAttr(var->varattno)))
+				return false;
 
-				/* Lookup info about the base relation (we need to pass the OID out) */
+			/* Lookup info about the base relation (we need to pass the OID out) */
+			if (relid != NULL)
+			{
 				rte = planner_rt_fetch(var->varno, root);
 				*relid = rte->relid;
-
-				/*
-				 * If it's not a "<" or ">" or "=" operator, just ignore the
-				 * clause. Otherwise note the relid and attnum for the variable.
-				 * This uses the function for estimating selectivity, ont the
-				 * operator directly (a bit awkward, but well ...).
-				 */
-				switch (get_oprrest(expr->opno))
-					{
-						case F_EQSEL:
-							*attnum = var->varattno;
-							return true;
-					}
 			}
+
+			/*
+			 * If it's not a "<" or ">" or "=" operator, just ignore the
+			 * clause. Otherwise note the relid and attnum for the variable.
+			 * This uses the function for estimating selectivity, ont the
+			 * operator directly (a bit awkward, but well ...).
+			 */
+			switch (get_oprrest(expr->opno))
+				{
+					case F_SCALARLTSEL:
+					case F_SCALARGTSEL:
+						/* not compatible with functional dependencies */
+						if (types & MV_CLAUSE_TYPE_MCV)
+						{
+							*attnums = bms_add_member(*attnums, var->varattno);
+							return (types & MV_CLAUSE_TYPE_MCV);
+						}
+						return false;
+
+					case F_EQSEL:
+						*attnums = bms_add_member(*attnums, var->varattno);
+						return true;
+				}
 		}
 	}
+	else if (IsA(clause, NullTest)
+			 && IsA(((NullTest*)clause)->arg, Var))
+	{
+		RangeTblEntry * rte;
+		Var * var = (Var*)((NullTest*)clause)->arg;
 
-	return false;
+		/*
+		 * Simple variables only - otherwise the planner_rt_fetch seems to fail
+		 * (return NULL).
+		 *
+		 * TODO Maybe use examine_variable() would fix that?
+		 */
+		if (! (IsA(var, Var) && (varRelid == 0 || varRelid == var->varno)))
+			return false;
 
+		/*
+		 * Only consider this variable if (varRelid == 0) or when the varno
+		 * matches varRelid (see explanation at clause_selectivity).
+		 *
+		 * FIXME I suspect this may not be really necessary. The (varRelid == 0)
+		 *       part seems to be enforced by treat_as_join_clause().
+		 */
+		if (! ((varRelid == 0) || (varRelid == var->varno)))
+			return false;
+
+		/* Also skip special varno values, and system attributes ... */
+		if ((IS_SPECIAL_VARNO(var->varno)) || (! AttrNumberIsForUserDefinedAttr(var->varattno)))
+			return false;
+
+		/* Lookup info about the base relation (we need to pass the OID out) */
+		if (relid != NULL)
+		{
+			rte = planner_rt_fetch(var->varno, root);
+			*relid = rte->relid;
+		}
+
+		*attnums = bms_add_member(*attnums, var->varattno);
+
+		return true;
+	}
+	else if (or_clause(clause) || and_clause(clause))
+	{
+		/*
+		 * AND/OR-clauses are supported if all sub-clauses are supported
+		 *
+		 * TODO We might support mixed case, where some of the clauses
+		 *      are supported and some are not, and treat all supported
+		 *      subclauses as a single clause, compute it's selectivity
+		 *      using mv stats, and compute the total selectivity using
+		 *      the current algorithm.
+		 *
+		 * TODO For RestrictInfo above an OR-clause, we might use the
+		 *      orclause with nested RestrictInfo - we won't have to
+		 *      call pull_varnos() for each clause, saving time. 
+		 */
+		Bitmapset *tmp = NULL;
+		ListCell *l;
+		foreach (l, ((BoolExpr*)clause)->args)
+		{
+			if (! clause_is_mv_compatible(root, (Node*)lfirst(l),
+						varRelid, relid, &tmp, sjinfo, types))
+				return false;
+		}
+
+		/* add the attnums from the OR-clause to the set of attnums */
+		*attnums = bms_join(*attnums, tmp);
+
+		return true;
+	}
+
+	return false;
 }
 
 /*
@@ -1115,6 +1660,13 @@ clause_is_mv_compatible(PlannerInfo *root, Node *clause, Oid varRelid,
  *
  * TODO Merge this docs to dependencies.c, as it's saying mostly the
  *      same things as the comments there.
+ * 
+ * TODO Currently this is applied only to the top-level clauses, but
+ *      maybe we could apply it to lists at subtrees too, e.g. to the
+ *      two AND-clauses in
+ *
+ *          (x=1 AND y=2) OR (z=3 AND q=10)
+ *
  */
 static List *
 clauselist_apply_dependencies(PlannerInfo *root, List *clauses, Oid varRelid,
@@ -1195,17 +1747,27 @@ clauselist_apply_dependencies(PlannerInfo *root, List *clauses, Oid varRelid,
 	 */
 	foreach (lc, clauses)
 	{
-		AttrNumber attnum;
+		Bitmapset *attnums = NULL;
 		Node	   *clause = (Node *) lfirst(lc);
-		if (! clause_is_mv_compatible(root, clause, varRelid, &relid, &attnum, sjinfo))
+		if (! clause_is_mv_compatible(root, clause, varRelid, &relid, &attnums,
+									  sjinfo, MV_CLAUSE_TYPE_FDEP))
+			reduced_clauses = lappend(reduced_clauses, clause);
+		else if (bms_num_members(attnums) > 1)
+			/* FIXME This may happen thanks to OR-clauses, which should
+			 *       really be handled differently for functional
+			 *       dependencies.
+			 */
 			reduced_clauses = lappend(reduced_clauses, clause);
 		else
 		{
+			/* functional dependencies support only [Var = Const] */
+			Assert(bms_num_members(attnums) == 1);
 			mvclauses[nmvclauses] = clause;
-			mvattnums[nmvclauses] = attnum;
+			mvattnums[nmvclauses] = bms_singleton_member(attnums);
 			nmvclauses++;
 
-			clause_attnums = bms_add_member(clause_attnums, attnum);
+			clause_attnums = bms_add_member(clause_attnums,
+											bms_singleton_member(attnums));
 		}
 	}
 
@@ -1430,3 +1992,446 @@ clauselist_apply_dependencies(PlannerInfo *root, List *clauses, Oid varRelid,
 
 	return reduced_clauses;
 }
+
+/*
+ * Estimate selectivity of clauses using a MCV list.
+ *
+ * If there's no MCV list for the stats, the function returns 0.0.
+ *
+ * While computing the estimate, the function checks whether all the
+ * columns were matched with an equality condition. If that's the case,
+ * we can skip processing the histogram, as there can be no rows in
+ * it with the same values - all the rows matching the condition are
+ * represented by the MCV item. This can only happen with equality
+ * on all the attributes.
+ *
+ * The algorithm works like this:
+ *
+ *   1) mark all items as 'match'
+ *   2) walk through all the clauses
+ *   3) for a particular clause, walk through all the items
+ *   4) skip items that are already 'no match'
+ *   5) check clause for items that still match
+ *   6) sum frequencies for items to get selectivity
+ *
+ * The function also returns the frequency of the least frequent item
+ * on the MCV list, which may be useful for clamping estimate from the
+ * histogram (all items not present in the MCV list are less frequent).
+ * This however seems useful only for cases with conditions on all
+ * attributes.
+ *
+ * TODO This only handles AND-ed clauses, but it might work for OR-ed
+ *      lists too - it just needs to reverse the logic a bit. I.e. start
+ *      with 'no match' for all items, and mark the items as a match
+ *      as the clauses are processed (and skip items that are 'match').
+ */
+static Selectivity
+clauselist_mv_selectivity_mcvlist(PlannerInfo *root, List *clauses,
+								  MVStats mvstats, bool *fullmatch,
+								  Selectivity *lowsel)
+{
+	int i;
+	Selectivity s = 0.0;
+	MCVList mcvlist = NULL;
+	int	nmatches = 0;
+
+	/* match/mismatch bitmap for each MCV item */
+	char * matches = NULL;
+
+	Assert(clauses != NIL);
+	Assert(list_length(clauses) >= 2);
+
+	/* there's no MCV list built yet */
+	if (! mvstats->mcv_built)
+		return 0.0;
+
+	mcvlist = deserialize_mv_mcvlist(fetch_mv_mcvlist(mvstats->mvoid));
+
+	Assert(mcvlist != NULL);
+	Assert(mcvlist->nitems > 0);
+
+	/* by default all the MCV items match the clauses fully */
+	matches = palloc0(sizeof(char) * mcvlist->nitems);
+	memset(matches, MVSTATS_MATCH_FULL, sizeof(char)*mcvlist->nitems);
+
+	/* number of matching MCV items */
+	nmatches = mcvlist->nitems;
+
+	nmatches = update_match_bitmap_mcvlist(root, clauses,
+										   mvstats->stakeys, mcvlist,
+										   nmatches, matches,
+										   lowsel, fullmatch, false);
+
+	/* sum frequencies for all the matching MCV items */
+	for (i = 0; i < mcvlist->nitems; i++)
+	{
+		if (matches[i] != MVSTATS_MATCH_NONE)
+			s += mcvlist->items[i]->frequency;
+	}
+
+	pfree(matches);
+	pfree(mcvlist);
+
+	return s;
+}
+
+/*
+ * Evaluate clauses using the MCV list, and update the match bitmap.
+ *
+ * The bitmap may be already partially set, so this is really a way to
+ * combine results of several clause lists - either when computing
+ * conditional probability P(A|B) or a combination of AND/OR clauses.
+ *
+ * TODO This works with 'bitmap' where each bit is represented as a char,
+ *      which is slightly wasteful. Instead, we could use a regular
+ *      bitmap, reducing the size to ~1/8. Another thing is merging the
+ *      bitmaps using & and |, which might be faster than min/max.
+ */
+static int
+update_match_bitmap_mcvlist(PlannerInfo *root, List *clauses,
+						   int2vector *stakeys, MCVList mcvlist,
+						   int nmatches, char * matches,
+						   Selectivity *lowsel, bool *fullmatch,
+						   bool is_or)
+{
+	int i;
+	ListCell * l;
+
+	Bitmapset *eqmatches = NULL;	/* attributes with equality matches */
+
+	/* The bitmap may be partially built. */
+	Assert(nmatches >= 0);
+	Assert(nmatches <= mcvlist->nitems);
+	Assert(clauses != NIL);
+	Assert(list_length(clauses) >= 1);
+	Assert(mcvlist != NULL);
+	Assert(mcvlist->nitems > 0);
+
+	/* No possible matches (only works for AND-ded clauses) */
+	if (((nmatches == 0) && (! is_or)) ||
+		((nmatches == mcvlist->nitems) && is_or))
+		return nmatches;
+
+	/* frequency of the lowest MCV item */
+	*lowsel = 1.0;
+
+	/*
+	 * Loop through the list of clauses, and for each of them evaluate
+	 * all the MCV items not yet eliminated by the preceding clauses.
+	 *
+	 * FIXME This would probably deserve a refactoring, I guess. Unify
+	 *       the two loops and put the checks inside, or something like
+	 *       that.
+	 */
+	foreach (l, clauses)
+	{
+		Node * clause = (Node*)lfirst(l);
+
+		/* if it's a RestrictInfo, then extract the clause */
+		if (IsA(clause, RestrictInfo))
+			clause = (Node*)((RestrictInfo*)clause)->clause;
+
+		/* if there are no remaining matches possible, we can stop */
+		if (((nmatches == 0) && (! is_or)) ||
+			((nmatches == mcvlist->nitems) && is_or))
+				break;
+
+		/* it's either OpClause, or NullTest */
+		if (is_opclause(clause))
+		{
+			OpExpr * expr = (OpExpr*)clause;
+			bool		varonleft = true;
+			bool		ok;
+
+			/* operator */
+			FmgrInfo	opproc;
+
+			fmgr_info(get_opcode(expr->opno), &opproc);
+
+			ok = (NumRelids(clause) == 1) &&
+				 (is_pseudo_constant_clause(lsecond(expr->args)) ||
+				 (varonleft = false,
+				  is_pseudo_constant_clause(linitial(expr->args))));
+
+			if (ok)
+			{
+
+				FmgrInfo	ltproc, gtproc;
+				Var * var = (varonleft) ? linitial(expr->args) : lsecond(expr->args);
+				Const * cst = (varonleft) ? lsecond(expr->args) : linitial(expr->args);
+				bool isgt = (! varonleft);
+
+				/*
+				 * TODO Fetch only when really needed (probably for equality only)
+				 * TODO Technically either lt/gt is sufficient.
+				 *
+				 * FIXME The code in analyze.c creates histograms only for types
+				 *       with enough ordering (by calling get_sort_group_operators).
+				 *       Is this the same assumption, i.e. are we certain that we
+				 *       get the ltproc/gtproc every time we ask? Or are there types
+				 *       where get_sort_group_operators returns ltopr and here we
+				 *       get nothing?
+				 */
+				TypeCacheEntry *typecache
+					= lookup_type_cache(var->vartype,
+										TYPECACHE_EQ_OPR | TYPECACHE_LT_OPR | TYPECACHE_GT_OPR);
+
+				/* FIXME proper matching attribute to dimension */
+				int idx = mv_get_index(var->varattno, stakeys);
+
+				fmgr_info(get_opcode(typecache->lt_opr), &ltproc);
+				fmgr_info(get_opcode(typecache->gt_opr), &gtproc);
+
+				/*
+				 * Walk through the MCV items and evaluate the current clause. We can
+				 * skip items that were already ruled out, and terminate if there are
+				 * no remaining MCV items that might possibly match.
+				 */
+				for (i = 0; i < mcvlist->nitems; i++)
+				{
+					bool mismatch = false;
+					MCVItem item = mcvlist->items[i];
+
+					/*
+					 * find the lowest selectivity in the MCV
+					 * FIXME Maybe not the best place do do this (in for all clauses).
+					 */
+					if (item->frequency < *lowsel)
+						*lowsel = item->frequency;
+
+					/*
+					 * If there are no more matches (AND) or no remaining unmatched
+					 * items (OR), we can stop processing this clause.
+					 */
+					if (((nmatches == 0) && (! is_or)) ||
+						((nmatches == mcvlist->nitems) && is_or))
+						break;
+
+					/*
+					 * For AND-lists, we can also mark NULL items as 'no match' (and
+					 * then skip them). For OR-lists this is not possible.
+					 */
+					if ((! is_or) && item->isnull[idx])
+						matches[i] = MVSTATS_MATCH_NONE;
+
+					/* skip MCV items that were already ruled out */
+					if ((! is_or) && (matches[i] == MVSTATS_MATCH_NONE))
+						continue;
+					else if (is_or && (matches[i] == MVSTATS_MATCH_FULL))
+						continue;
+
+					/* TODO consider bsearch here (list is sorted by values)
+					 * TODO handle other operators too (LT, GT)
+					 * TODO identify "full match" when the clauses fully
+					 *      match the whole MCV list (so that checking the
+					 *      histogram is not needed)
+					 */
+					if (get_oprrest(expr->opno) == F_EQSEL)
+					{
+						/*
+						 * We don't care about isgt in equality, because it does not
+						 * matter whether it's (var = const) or (const = var).
+						 */
+						bool match = DatumGetBool(FunctionCall2Coll(&opproc,
+															 DEFAULT_COLLATION_OID,
+															 cst->constvalue,
+															 item->values[idx]));
+
+						if (match)
+							eqmatches = bms_add_member(eqmatches, idx);
+
+						mismatch = (! match);
+					}
+					else if (get_oprrest(expr->opno) == F_SCALARLTSEL)	/* column < constant */
+					{
+
+						if (! isgt)	/* (var < const) */
+						{
+							/*
+							 * First check whether the constant is below the lower boundary (in that
+							 * case we can skip the bucket, because there's no overlap).
+							 */
+							mismatch = DatumGetBool(FunctionCall2Coll(&opproc,
+																 DEFAULT_COLLATION_OID,
+																 cst->constvalue,
+																 item->values[idx]));
+
+						} /* (get_oprrest(expr->opno) == F_SCALARLTSEL) */
+						else	/* (const < var) */
+						{
+							/*
+							 * First check whether the constant is above the upper boundary (in that
+							 * case we can skip the bucket, because there's no overlap).
+							 */
+							mismatch = DatumGetBool(FunctionCall2Coll(&opproc,
+																 DEFAULT_COLLATION_OID,
+																 item->values[idx],
+																 cst->constvalue));
+						}
+					}
+					else if (get_oprrest(expr->opno) == F_SCALARGTSEL)	/* column > constant */
+					{
+
+						if (! isgt)	/* (var > const) */
+						{
+							/*
+							 * First check whether the constant is above the upper boundary (in that
+							 * case we can skip the bucket, because there's no overlap).
+							 */
+							mismatch = DatumGetBool(FunctionCall2Coll(&opproc,
+																 DEFAULT_COLLATION_OID,
+																 cst->constvalue,
+																 item->values[idx]));
+						}
+						else /* (const > var) */
+						{
+							/*
+							 * First check whether the constant is below the lower boundary (in
+							 * that case we can skip the bucket, because there's no overlap).
+							 */
+							mismatch = DatumGetBool(FunctionCall2Coll(&opproc,
+																 DEFAULT_COLLATION_OID,
+																 item->values[idx],
+																 cst->constvalue));
+						}
+
+					} /* (get_oprrest(expr->opno) == F_SCALARGTSEL) */
+
+					/* XXX The conditions on matches[i] are not needed, as we
+					 *     skip MCV items that can't become true/false, depending
+					 *     on the current flag. See beginning of the loop over
+					 *     MCV items.
+					 */
+
+					if ((is_or) && (matches[i] == MVSTATS_MATCH_NONE) && (! mismatch))
+					{
+						/* OR - was MATCH_NONE, but will be MATCH_FULL */
+						matches[i] = MVSTATS_MATCH_FULL;
+						++nmatches;
+						continue;
+					}
+					else if ((! is_or) && (matches[i] == MVSTATS_MATCH_FULL) && mismatch)
+					{
+						/* AND - was MATC_FULL, but will be MATCH_NONE */
+						matches[i] = MVSTATS_MATCH_NONE;
+						--nmatches;
+						continue;
+					}
+
+				}
+			}
+		}
+		else if (IsA(clause, NullTest))
+		{
+			NullTest * expr = (NullTest*)clause;
+			Var * var = (Var*)(expr->arg);
+
+			/* FIXME proper matching attribute to dimension */
+			int idx = mv_get_index(var->varattno, stakeys);
+
+			/*
+			 * Walk through the MCV items and evaluate the current clause. We can
+			 * skip items that were already ruled out, and terminate if there are
+			 * no remaining MCV items that might possibly match.
+			 */
+			for (i = 0; i < mcvlist->nitems; i++)
+			{
+				MCVItem item = mcvlist->items[i];
+
+				/*
+				 * find the lowest selectivity in the MCV
+				 * FIXME Maybe not the best place do do this (in for all clauses).
+				 */
+				if (item->frequency < *lowsel)
+					*lowsel = item->frequency;
+
+				/* if there are no more matches, we can stop processing this clause */
+				if (nmatches == 0)
+					break;
+
+				/* skip MCV items that were already ruled out */
+				if (matches[i] == MVSTATS_MATCH_NONE)
+					continue;
+
+				/* if the clause mismatches the MCV item, set it as MATCH_NONE */
+				if (((expr->nulltesttype == IS_NULL) && (! mcvlist->items[i]->isnull[idx])) ||
+					((expr->nulltesttype == IS_NOT_NULL) && (mcvlist->items[i]->isnull[idx])))
+				{
+						matches[i] = MVSTATS_MATCH_NONE;
+						--nmatches;
+				}
+			}
+		}
+		else if (or_clause(clause) || and_clause(clause))
+		{
+			/* AND/OR clause, with all clauses compatible with the selected MV stat */
+
+			int			i;
+			BoolExpr   *orclause  = ((BoolExpr*)clause);
+			List	   *orclauses = orclause->args;
+
+			/* match/mismatch bitmap for each MCV item */
+			int	or_nmatches = 0;
+			char * or_matches = NULL;
+
+			Assert(orclauses != NIL);
+			Assert(list_length(orclauses) >= 2);
+
+			/* number of matching MCV items */
+			or_nmatches = mcvlist->nitems;
+
+			/* by default none of the MCV items matches the clauses */
+			or_matches = palloc0(sizeof(char) * or_nmatches);
+
+			if (or_clause(clause))
+			{
+				/* OR clauses assume nothing matches, initially */
+				memset(or_matches, MVSTATS_MATCH_NONE, sizeof(char)*or_nmatches);
+				or_nmatches = 0;
+			}
+			else
+			{
+				/* AND clauses assume nothing matches, initially */
+				memset(or_matches, MVSTATS_MATCH_FULL, sizeof(char)*or_nmatches);
+			}
+
+			/* build the match bitmap for the OR-clauses */
+			or_nmatches = update_match_bitmap_mcvlist(root, orclauses,
+									   stakeys, mcvlist,
+									   or_nmatches, or_matches,
+									   lowsel, fullmatch, or_clause(clause));
+
+			/* merge the bitmap into the existing one*/
+			for (i = 0; i < mcvlist->nitems; i++)
+			{
+				/*
+				 * To AND-merge the bitmaps, a MIN() semantics is used.
+				 * For OR-merge, use MAX().
+				 *
+				 * FIXME this does not decrease the number of matches
+				 */
+				UPDATE_RESULT(matches[i], or_matches[i], is_or);
+			}
+
+			pfree(or_matches);
+
+		}
+		else
+		{
+			elog(ERROR, "unknown clause type: %d", clause->type);
+		}
+	}
+
+	/*
+	 * If all the columns were matched by equality, it's a full match.
+	 * In this case there can be just a single MCV item, matching the
+	 * clause (if there were two, both would match the other one).
+	 */
+	*fullmatch = (bms_num_members(eqmatches) == mcvlist->ndimensions);
+
+	/* free the allocated pieces */
+	if (eqmatches)
+		pfree(eqmatches);
+
+	return nmatches;
+}
diff --git a/src/backend/utils/mvstats/Makefile b/src/backend/utils/mvstats/Makefile
index 099f1ed..3c0aff4 100644
--- a/src/backend/utils/mvstats/Makefile
+++ b/src/backend/utils/mvstats/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mvstats
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = common.o dependencies.o
+OBJS = common.o mcv.o dependencies.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mvstats/common.c b/src/backend/utils/mvstats/common.c
index d44b95a..bd952c6 100644
--- a/src/backend/utils/mvstats/common.c
+++ b/src/backend/utils/mvstats/common.c
@@ -17,8 +17,8 @@
 #include "common.h"
 
 static VacAttrStats ** lookup_var_attr_stats(int2vector *attrs,
-									  int natts, VacAttrStats **vacattrstats);
-
+											 int natts,
+											 VacAttrStats **vacattrstats);
 
 /*
  * Compute requested multivariate stats, using the rows sampled for the
@@ -44,6 +44,8 @@ build_mv_stats(Relation onerel, int numrows, HeapTuple *rows,
 	for (i = 0; i < nmvstats; i++)
 	{
 		MVDependencies	deps  = NULL;
+		MCVList		mcvlist   = NULL;
+		int numrows_filtered  = 0;
 
 		/* int2 vector of attnums the stats should be computed on */
 		int2vector * attrs = mvstats[i].stakeys;
@@ -60,8 +62,12 @@ build_mv_stats(Relation onerel, int numrows, HeapTuple *rows,
 		if (mvstats->deps_enabled)
 			deps = build_mv_dependencies(numrows, rows, attrs, stats);
 
+		/* build the MCV list */
+		if (mvstats->mcv_enabled)
+			mcvlist = build_mv_mcvlist(numrows, rows, attrs, stats, &numrows_filtered);
+
 		/* store the histogram / MCV list in the catalog */
-		update_mv_stats(mvstats[i].mvoid, deps);
+		update_mv_stats(mvstats[i].mvoid, deps, mcvlist, attrs, stats);
 	}
 }
 
@@ -143,7 +149,7 @@ list_mv_stats(Oid relid, int *nstats, bool built_only)
 		 * Skip statistics that were not computed yet (if only stats
 		 * that were already built were requested)
 		 */
-		if (built_only && (! stats->deps_built))
+		if (built_only && (! (stats->mcv_built || stats->deps_built)))
 			continue;
 
 		/* double the array size if needed */
@@ -156,7 +162,9 @@ list_mv_stats(Oid relid, int *nstats, bool built_only)
 		result[*nstats].mvoid = HeapTupleGetOid(htup);
 		result[*nstats].stakeys = buildint2vector(stats->stakeys.values, stats->stakeys.dim1);
 		result[*nstats].deps_enabled = stats->deps_enabled;
+		result[*nstats].mcv_enabled = stats->mcv_enabled;
 		result[*nstats].deps_built = stats->deps_built;
+		result[*nstats].mcv_built = stats->mcv_built;
 		*nstats += 1;
 	}
 
@@ -171,7 +179,9 @@ list_mv_stats(Oid relid, int *nstats, bool built_only)
 }
 
 void
-update_mv_stats(Oid mvoid, MVDependencies dependencies)
+update_mv_stats(Oid mvoid,
+				MVDependencies dependencies, MCVList mcvlist,
+				int2vector *attrs, VacAttrStats **stats)
 {
 	HeapTuple	stup,
 				oldtup;
@@ -196,15 +206,26 @@ update_mv_stats(Oid mvoid, MVDependencies dependencies)
 			= PointerGetDatum(serialize_mv_dependencies(dependencies));
 	}
 
+	if (mcvlist != NULL)
+	{
+		bytea * data = serialize_mv_mcvlist(mcvlist, attrs, stats);
+		nulls[Anum_pg_mv_statistic_stamcv -1]    = (data == NULL);
+		values[Anum_pg_mv_statistic_stamcv  - 1] = PointerGetDatum(data);
+	}
+
 	/* always replace the value (either by bytea or NULL) */
 	replaces[Anum_pg_mv_statistic_stadeps -1] = true;
+	replaces[Anum_pg_mv_statistic_stamcv -1] = true;
 
 	/* always change the availability flags */
 	nulls[Anum_pg_mv_statistic_deps_built -1] = false;
+	nulls[Anum_pg_mv_statistic_mcv_built -1] = false;
 
 	replaces[Anum_pg_mv_statistic_deps_built-1] = true;
+	replaces[Anum_pg_mv_statistic_mcv_built  -1] = true;
 
 	values[Anum_pg_mv_statistic_deps_built-1] = BoolGetDatum(dependencies != NULL);
+	values[Anum_pg_mv_statistic_mcv_built  -1] = BoolGetDatum(mcvlist != NULL);
 
 	/* Is there already a pg_mv_statistic tuple for this attribute? */
 	oldtup = SearchSysCache1(MVSTATOID,
@@ -232,6 +253,21 @@ update_mv_stats(Oid mvoid, MVDependencies dependencies)
 	heap_close(sd, RowExclusiveLock);
 }
 
+
+int
+mv_get_index(AttrNumber varattno, int2vector * stakeys)
+{
+	int i, idx = 0;
+	for (i = 0; i < stakeys->dim1; i++)
+	{
+		if (stakeys->values[i] < varattno)
+			idx += 1;
+		else
+			break;
+	}
+	return idx;
+}
+
 /* multi-variate stats comparator */
 
 /*
@@ -242,11 +278,15 @@ update_mv_stats(Oid mvoid, MVDependencies dependencies)
 int
 compare_scalars_simple(const void *a, const void *b, void *arg)
 {
-	Datum		da = *(Datum*)a;
-	Datum		db = *(Datum*)b;
-	SortSupport ssup= (SortSupport) arg;
+	return compare_datums_simple(*(Datum*)a,
+								 *(Datum*)b,
+								 (SortSupport)arg);
+}
 
-	return ApplySortComparator(da, false, db, false, ssup);
+int
+compare_datums_simple(Datum a, Datum b, SortSupport ssup)
+{
+	return ApplySortComparator(a, false, b, false, ssup);
 }
 
 /*
diff --git a/src/backend/utils/mvstats/common.h b/src/backend/utils/mvstats/common.h
index 6d5465b..f4309f7 100644
--- a/src/backend/utils/mvstats/common.h
+++ b/src/backend/utils/mvstats/common.h
@@ -46,7 +46,15 @@ typedef struct
 	Datum		value;			/* a data value */
 	int			tupno;			/* position index for tuple it came from */
 } ScalarItem;
- 
+
+/* (de)serialization info */
+typedef struct DimensionInfo {
+	int		nvalues;	/* number of deduplicated values */
+	int		nbytes;		/* number of bytes (serialized) */
+	int		typlen;		/* pg_type.typlen */
+	bool	typbyval;	/* pg_type.typbyval */
+} DimensionInfo;
+
 /* multi-sort */
 typedef struct MultiSortSupportData {
 	int				ndims;		/* number of dimensions supported by the */
@@ -71,5 +79,6 @@ int multi_sort_compare_dim(int dim, const SortItem *a,
 						   const SortItem *b, MultiSortSupport mss);
 
 /* comparators, used when constructing multivariate stats */
+int compare_datums_simple(Datum a, Datum b, SortSupport ssup);
 int compare_scalars_simple(const void *a, const void *b, void *arg);
 int compare_scalars_partition(const void *a, const void *b, void *arg);
diff --git a/src/backend/utils/mvstats/mcv.c b/src/backend/utils/mvstats/mcv.c
new file mode 100644
index 0000000..4466cee
--- /dev/null
+++ b/src/backend/utils/mvstats/mcv.c
@@ -0,0 +1,1002 @@
+/*-------------------------------------------------------------------------
+ *
+ * mcv.c
+ *	  POSTGRES multivariate MCV lists
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mvstats/mcv.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "common.h"
+
+/*
+ * Multivariate MCVs (most-common values lists) are a straightforward
+ * extension of regular MCV list by tracking combinations of values for
+ * several attributes (columns), including NULL flags, and frequency
+ * of the combination.
+ *
+ * For columns small number of distinct values, this works quite well
+ * and may represent the distribution pretty exactly. For columns with
+ * large number of distinct values (e.g. stored as FLOAT), this does
+ * not work that well.
+ *
+ * If we can represent the distribution as a MCV list, we can estimate
+ * some clauses (e.g. equality clauses) much accurately than using
+ * histograms for example.
+ *
+ * Discrete distributions are also easier to combine into a larger
+ * distribution (but this is not yet implemented).
+ *
+ *
+ * TODO For types that don't reasonably support ordering (either because
+ *      the type does not support that or when the user adds some option
+ *      to the ADD STATISTICS command - e.g. UNSORTED_STATS), building
+ *      the histogram may be pointless and inefficient. This is esp.
+ *      true for varlena types that may be quite large and a large MCV
+ *      list may be a better choice, because it makes equality estimates
+ *      more accurate. Due to the unsorted nature, range queries on those
+ *      attributes are rather useless anyway.
+ *
+ *      Another thing is that by restricting to MCV list and equality
+ *      conditions, we can use hash values instead of long varlena values.
+ *      The equality estimation will be very accurate.
+ *
+ *      This however complicates matching the columns to available
+ *      statistics, as it will require matching clauses (not columns) to
+ *      stats. And it may get quite complex - e.g. what if there are
+ *      multiple clauses, each compatible with different stats subset?
+ *
+ *
+ * Selectivity estimation
+ * ----------------------
+ * The estimation, implemented in clauselist_mv_selectivity_mcvlist(),
+ * is quite simple in principle - walk through the MCV items and sum
+ * frequencies of all the items that match all the clauses.
+ *
+ * The current implementation uses MCV lists to estimates those types
+ * of clauses (think of WHERE conditions):
+ *
+ *  (a) equality clauses    WHERE (a = 1) AND (b = 2)
+ *
+ *  (b) inequality clauses  WHERE (a < 1) AND (b >= 2)
+ *
+ * It's possible to add more clauses, for example:
+ *
+ *  (a) NULL clauses        WHERE (a IS NULL) AND (b IS NOT NULL)
+ *
+ *  (b) multi-var clauses   WHERE (a > b)
+ *
+ * and so on. These are tasks for the future, not yet implemented.
+ *
+ *
+ * Estimating equality clauses
+ * ---------------------------
+ * When computing selectivity estimate for equality clauses
+ *
+ *      (a = 1) AND (b = 2)
+ *
+ * we can do this estimate pretty exactly assuming that two conditions
+ * are met:
+ *
+ *      (1) there's an equality condition on each attribute
+ *
+ *      (2) we find a matching item in the MCV list
+ *
+ * In that case we know the MCV item represents all the tuples matching
+ * the clauses, and the selectivity estimate is complete. This is what
+ * we call 'full match'.
+ *
+ * When only (1) holds, but there's no matching MCV item, we don't know
+ * whether there are no such rows or just are not very frequent. We can
+ * however use the frequency of the least frequent MCV item as an upper
+ * bound for the selectivity.
+ *
+ * If the equality conditions match only a subset of the attributes
+ * the MCV list is built on (i.e. we can't get a full match - we may get
+ * multiple MCV items matching the clauses, but even if we get a single
+ * match there may be items that did not get into the MCV list. But in
+ * this case we can still use the frequency of the last MCV item to clam
+ * the 'additional' selectivity not accounted for by the matching items.
+ *
+ * If there's no histogram, because the MCV list approximates the
+ * distribution accurately (not because the histogram was disabled),
+ * it does not really matter whether there are equality conditions on
+ * all the columns - we can do pretty accurate estimation using the MCV.
+ *
+ * TODO For a combination of equality conditions (not full-match case)
+ *      we probably can clamp the selectivity by the minimum of
+ *      selectivities for each condition. For example if we know the
+ *      number of distinct values for each column, we can use 1/ndistinct
+ *      as a per-column estimate. Or rather 1/ndistinct + selectivity
+ *      derived from the MCV list.
+ *
+ *      If we know the estimate of number of combinations of the columns
+ *      (i.e. ndistinct(A,B)), we may estimate the average frequency of
+ *      items in the remaining 10% as [10% / ndistinct(A,B)].
+ *
+ *
+ * Bounding estimates
+ * ------------------
+ * In general the MCV lists may not provide estimates as accurate as
+ * for the full-match equality case, but may provide some useful
+ * lower/upper boundaries for the estimation error.
+ *
+ * With equality clauses we can do a few more tricks to narrow this
+ * error range (see the previous section and TODO), but with inequality
+ * clauses (or generally non-equality clauses), it's rather dificult.
+ * There's nothing like a 'full match' - we have to consider both the
+ * MCV items and the remaining part every time. We can't use the minimum
+ * selectivity of MCV items, as the clauses may match multiple items.
+ *
+ * For example with a MCV list on columns (A, B), covering 90% of the
+ * table (computed while building the MCV list), about ~10% of the table
+ * is not represented by the MCV list. So even if the conditions match
+ * all the remaining rows (not represented by the MCV items), we can't
+ * get selectivity higher than those 10%. We may use 1/2 the remaining
+ * selectivity as an estimate (minimizing average error).
+ *
+ * TODO Most of these ideas (error limiting) are not yet implemented.
+ *
+ *
+ * General TODO
+ * ------------
+ *
+ * FIXME Use max_mcv_items from ALTER TABLE ADD STATISTICS command.
+ *
+ * TODO Add support for IS [NOT] NULL clauses, and clauses referencing
+ *      multiple columns (a < b).
+ *
+ * TODO It's possible to build a special case of MCV list, storing not
+ *      the actual values but only 32/64-bit hash. This is only useful
+ *      for estimating equality clauses and for large varlena types,
+ *      which are very impractical for plain MCV list because of size.
+ *      But for those data types we really want just the equality
+ *      clauses, so it's actually a good solution.
+ *
+ * TODO Currently there's no logic to consider building only a MCV list
+ *      (and not building the histogram at all), except for doing this
+ *      decision manually in ADD STATISTICS.
+ */
+
+/*
+ * Each serialized item needs to store (in this order):
+ *
+ * - indexes              (ndim * sizeof(int32))
+ * - null flags           (ndim * sizeof(bool))
+ * - frequency            (sizeof(double))
+ *
+ * So in total:
+ *
+ *   ndim * (sizeof(int32) + sizeof(bool)) + sizeof(double)
+ */
+#define ITEM_SIZE(ndims)	\
+	(ndims * (sizeof(int32) + sizeof(bool)) + sizeof(double))
+
+/* pointers into a flat serialized item of ITEM_SIZE(n) bytes */
+#define ITEM_INDEXES(item)			((int32*)item)
+#define ITEM_NULLS(item,ndims)		((bool*)(ITEM_INDEXES(item) + ndims))
+#define ITEM_FREQUENCY(item,ndims)	((double*)(ITEM_NULLS(item,ndims) + ndims))
+
+/*
+ * Builds MCV list from sample rows, and removes rows represented by
+ * the MCV list from the sample (the number of remaining sample rows is
+ * returned by the numrows_filtered parameter).
+ *
+ * The method is quite simple - in short it does about these steps:
+ *
+ *       (1) sort the data (default collation, '<' for the data type)
+ *
+ *       (2) count distinct groups, decide how many to keep
+ *
+ *       (3) build the MCV list using the threshold determined in (2)
+ *
+ *       (4) remove rows represented by the MCV from the sample
+ *
+ * For more details, see the comments in the code.
+ *
+ * FIXME Single-dimensional MCV is sorted by frequency (descending). We
+ *       should do that too, because when walking through the list we
+ *       want to check the most frequent items first.
+ *
+ * TODO We're using Datum (8B), even for data types (e.g. int4 or
+ *      float4). Maybe we could save some space here, but the bytea
+ *      compression should handle it just fine.
+ *
+ * TODO This probably should not use the ndistinct directly (as computed
+ *      from the table, but rather estimate the number of distinct
+ *      values in the table), no?
+ */
+MCVList
+build_mv_mcvlist(int numrows, HeapTuple *rows, int2vector *attrs,
+					  VacAttrStats **stats, int *numrows_filtered)
+{
+	int i, j;
+	int numattrs = attrs->dim1;
+	int ndistinct = 0;
+	int mcv_threshold = 0;
+	int count = 0;
+	int nitems = 0;
+
+	MCVList	mcvlist = NULL;
+
+	/* Sort by multiple columns (using array of SortSupport) */
+	MultiSortSupport mss = multi_sort_init(numattrs);
+
+	/*
+	 * Preallocate space for all the items as a single chunk, and point
+	 * the items to the appropriate parts of the array.
+	 */
+	SortItem   *items  = (SortItem*)palloc0(numrows * sizeof(SortItem));
+	Datum	   *values = (Datum*)palloc0(sizeof(Datum) * numrows * numattrs);
+	bool	   *isnull = (bool*)palloc0(sizeof(bool) * numrows * numattrs);
+
+	/* keep all the rows by default (as if there was no MCV list) */
+	*numrows_filtered = numrows;
+
+	for (i = 0; i < numrows; i++)
+	{
+		items[i].values = &values[i * numattrs];
+		items[i].isnull = &isnull[i * numattrs];
+	}
+
+	/* load the values/null flags from sample rows */
+	for (j = 0; j < numrows; j++)
+		for (i = 0; i < numattrs; i++)
+			items[j].values[i] = heap_getattr(rows[j], attrs->values[i],
+								stats[i]->tupDesc, &items[j].isnull[i]);
+
+	/* prepare the sort functions for all the attributes */
+	for (i = 0; i < numattrs; i++)
+		multi_sort_add_dimension(mss, i, i, stats);
+
+	/* do the sort, using the multi-sort */
+	qsort_arg((void *) items, numrows, sizeof(SortItem),
+				multi_sort_compare, mss);
+
+	/*
+	 * Count the number of distinct groups - just walk through the
+	 * sorted list and count the number of key changes. We use this to
+	 * determine the threshold (125% of the average frequency).
+	 */
+	ndistinct = 1;
+	for (i = 1; i < numrows; i++)
+		if (multi_sort_compare(&items[i], &items[i-1], mss) != 0)
+			ndistinct += 1;
+
+	/*
+	 * Determine how many groups actually exceed the threshold, and then
+	 * walk the array again and collect them into an array. We'll always
+	 * require at least 4 rows per group.
+	 *
+	 * But if we can fit all the distinct values in the MCV list (i.e.
+	 * if there are less distinct groups than MVSTAT_MCVLIST_MAX_ITEMS),
+	 * we'll require only 2 rows per group.
+	 *
+	 * TODO For now the threshold is the same as in the single-column
+	 *      case (average + 25%), but maybe that's worth revisiting
+	 *      for the multivariate case.
+	 *
+	 * TODO We can do this only if we believe we got all the distinct
+	 *      values of the table.
+	 *
+	 * FIXME This should really reference mcv_max_items (from catalog)
+	 *       instead of the constant MVSTAT_MCVLIST_MAX_ITEMS.
+	 */
+	mcv_threshold = 1.25 * numrows / ndistinct;
+	mcv_threshold = (mcv_threshold < 4) ? 4  : mcv_threshold;
+
+	if (ndistinct <= MVSTAT_MCVLIST_MAX_ITEMS)
+		mcv_threshold = 2;
+
+	/*
+	 * Walk through the sorted data again, and see how many groups
+	 * reach the mcv_threshold (and become an item in the MCV list).
+	 */
+	count = 1;
+	for (i = 1; i <= numrows; i++)
+	{
+		/* last row or new group, so check if we exceed  mcv_threshold */
+		if ((i == numrows) || (multi_sort_compare(&items[i], &items[i-1], mss) != 0))
+		{
+			/* group hits the threshold, count the group as MCV item */
+			if (count >= mcv_threshold)
+				nitems += 1;
+
+			count = 1;
+		}
+		else	/* within group, so increase the number of items */
+			count += 1;
+	}
+
+	/* we know the number of MCV list items, so let's build the list */
+	if (nitems > 0)
+	{
+		/* allocate the MCV list structure, set parameters we know */
+		mcvlist = (MCVList)palloc0(sizeof(MCVListData));
+
+		mcvlist->magic = MVSTAT_MCV_MAGIC;
+		mcvlist->type = MVSTAT_MCV_TYPE_BASIC;
+		mcvlist->ndimensions = numattrs;
+		mcvlist->nitems = nitems;
+
+		/*
+		 * Preallocate Datum/isnull arrays (not as a single chunk, as
+		 * we'll pass this outside this method and thus it needs to be
+		 * easy to pfree() the data (and we wouldn't know where the
+		 * arrays start).
+		 *
+		 * TODO Maybe the reasoning that we can't allocate a single
+		 *      piece because we're passing it out is bogus? Who'd
+		 *      free a single item of the MCV list, anyway?
+		 *
+		 * TODO Maybe with a proper encoding (stuffing all the values
+		 *      into a list-level array, this will be untrue)?
+		 */
+		mcvlist->items = (MCVItem*)palloc0(sizeof(MCVItem)*nitems);
+
+		for (i = 0; i < nitems; i++)
+		{
+			mcvlist->items[i] = (MCVItem)palloc0(sizeof(MCVItemData));
+			mcvlist->items[i]->values = (Datum*)palloc0(sizeof(Datum)*numattrs);
+			mcvlist->items[i]->isnull = (bool*)palloc0(sizeof(bool)*numattrs);
+		}
+
+		/*
+		 * Repeat the same loop as above, but this time copy the data
+		 * into the MCV list (for items exceeding the threshold).
+		 *
+		 * TODO Maybe we could simply remember indexes of the last item
+		 *      in each group (from the previous loop)?
+		 */
+		count = 1;
+		nitems = 0;
+		for (i = 1; i <= numrows; i++)
+		{
+			/* last row or a new group */
+			if ((i == numrows) || (multi_sort_compare(&items[i], &items[i-1], mss) != 0))
+			{
+				/* count the MCV item if exceeding the threshold (and copy into the array) */
+				if (count >= mcv_threshold)
+				{
+					/* just pointer to the proper place in the list */
+					MCVItem item = mcvlist->items[nitems];
+
+					/* copy values from the _previous_ group (last item of) */
+					memcpy(item->values, items[(i-1)].values, sizeof(Datum) * numattrs);
+					memcpy(item->isnull, items[(i-1)].isnull, sizeof(bool)  * numattrs);
+
+
+					/* and finally the group frequency */
+					item->frequency = (double)count / numrows;
+
+					/* next item */
+					nitems += 1;
+				}
+
+				count = 1;
+			}
+			else	/* same group, just increase the number of items */
+				count += 1;
+		}
+
+		/* make sure the loops are consistent */
+		Assert(nitems == mcvlist->nitems);
+
+		/*
+		 * Remove the rows matching the MCV list (i.e. keep only rows
+		 * that are not represented by the MCV list).
+		 *
+		 * FIXME This implementation is rather naive, effectively O(N^2).
+		 *       As the MCV list grows, the check will take longer and
+		 *       longer. And as the number of sampled rows increases (by
+		 *       increasing statistics target), it will take longer and
+		 *       longer. One option is to sort the MCV items first and
+		 *       then perform a binary search.
+		 *
+		 *       A better option would be keeping the ID of the row in
+		 *       the sort item, and then just walk through the items and
+		 *       mark rows to remove (in a bitmap of the same size).
+		 *       There's not space for that in SortItem at this moment,
+		 *       but it's trivial to add 'private' pointer, or just
+		 *       using another structure with extra field (starting with
+		 *       SortItem, so that the comparators etc. still work).
+		 *
+		 *       Another option is to use the sorted array of items
+		 *       (because that's how we sorted the source data), and
+		 *       simply do a bsearch() into it. If we find a matching
+		 *       item, the row belongs to the MCV list.
+		 */
+		if (nitems == ndistinct) /* all rows are covered by MCV items */
+			*numrows_filtered = 0;
+		else /* (nitems < ndistinct) && (nitems > 0) */
+		{
+			int nfiltered = 0;
+			HeapTuple *rows_filtered = (HeapTuple*)palloc0(sizeof(HeapTuple) * numrows);
+
+			/* used for the searches */
+			SortItem item, mcvitem;;
+
+			item.values = (Datum*)palloc0(numattrs * sizeof(Datum));
+			item.isnull = (bool*)palloc0(numattrs * sizeof(bool));
+
+			/*
+			 * FIXME we don't need to allocate this, we can reference
+			 *       the MCV item directly ...
+			 */
+			mcvitem.values = (Datum*)palloc0(numattrs * sizeof(Datum));
+			mcvitem.isnull = (bool*)palloc0(numattrs * sizeof(bool));
+
+			/* walk through the tuples, compare the values to MCV items */
+			for (i = 0; i < numrows; i++)
+			{
+				bool	match = false;
+
+				/* collect the key values from the row */
+				for (j = 0; j < numattrs; j++)
+					item.values[j] = heap_getattr(rows[i], attrs->values[j],
+										stats[j]->tupDesc, &item.isnull[j]);
+
+				/* scan through the MCV list for matches */
+				for (j = 0; j < mcvlist->nitems; j++)
+				{
+					/*
+					 * TODO Create a SortItem/MCVItem comparator so that
+					 *      we don't need to do memcpy() like crazy.
+					 */
+					memcpy(mcvitem.values, mcvlist->items[j]->values,
+							numattrs * sizeof(Datum));
+					memcpy(mcvitem.isnull, mcvlist->items[j]->isnull,
+							numattrs * sizeof(bool));
+
+					if (multi_sort_compare(&item, &mcvitem, mss) == 0)
+					{
+						match = true;
+						break;
+					}
+				}
+
+				/* if no match in the MCV list, copy the row into the filtered ones */
+				if (! match)
+					memcpy(&rows_filtered[nfiltered++], &rows[i], sizeof(HeapTuple));
+			}
+
+			/* replace the rows and remember how many rows we kept */
+			memcpy(rows, rows_filtered, sizeof(HeapTuple) * nfiltered);
+			*numrows_filtered = nfiltered;
+
+			/* free all the data used here */
+			pfree(rows_filtered);
+			pfree(item.values);
+			pfree(item.isnull);
+			pfree(mcvitem.values);
+			pfree(mcvitem.isnull);
+		}
+	}
+
+	pfree(values);
+	pfree(items);
+	pfree(isnull);
+
+	return mcvlist;
+}
+
+
+/* fetch the MCV list (as a bytea) from the pg_mv_statistic catalog */
+bytea *
+fetch_mv_mcvlist(Oid mvoid)
+{
+	Relation	indrel;
+	SysScanDesc indscan;
+	ScanKeyData skey;
+	HeapTuple	htup;
+	bytea	   *mcvlist = NULL;
+
+	/* Prepare to scan pg_mv_statistic for entries having indrelid = this rel. */
+	ScanKeyInit(&skey,
+				ObjectIdAttributeNumber,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(mvoid));
+
+	indrel = heap_open(MvStatisticRelationId, AccessShareLock);
+	indscan = systable_beginscan(indrel, MvStatisticOidIndexId, true,
+								 NULL, 1, &skey);
+
+	while (HeapTupleIsValid(htup = systable_getnext(indscan)))
+	{
+		bool isnull = false;
+		Datum tmp  = SysCacheGetAttr(MVSTATOID, htup,
+								   Anum_pg_mv_statistic_stamcv, &isnull);
+
+		Assert(!isnull);
+
+		mcvlist = DatumGetByteaP(tmp);
+
+		break;
+	}
+
+	systable_endscan(indscan);
+
+	heap_close(indrel, AccessShareLock);
+
+	/* TODO maybe save the list into relcache, as in RelationGetIndexList
+	 *      (which was used as an inspiration of this one)?. */
+
+	return mcvlist;
+}
+
+/* print some basic info about the MCV list
+ *
+ * TODO Add info about what part of the table this covers.
+ */
+Datum
+pg_mv_stats_mcvlist_info(PG_FUNCTION_ARGS)
+{
+	bytea	   *data = PG_GETARG_BYTEA_P(0);
+	char	   *result;
+
+	MCVList	mcvlist = deserialize_mv_mcvlist(data);
+
+	result = palloc0(128);
+	snprintf(result, 128, "nitems=%d", mcvlist->nitems);
+
+	pfree(mcvlist);
+
+	PG_RETURN_TEXT_P(cstring_to_text(result));
+}
+
+/* used to pass context into bsearch() */
+static SortSupport ssup_private = NULL;
+
+static int bsearch_comparator(const void * a, const void * b);
+
+/*
+ * Serialize MCV list into a bytea value. The basic algorithm is simple:
+ *
+ * (1) perform deduplication for each attribute (separately)
+ *     (a) collect all (non-NULL) attribute values from all MCV items
+ *     (b) sort the data (using 'lt' from VacAttrStats)
+ *     (c) remove duplicate values from the array
+ *
+ * (2) serialize the arrays into a bytea value
+ *
+ * (3) process all MCV list items
+ *     (a) replace values with indexes into the arrays
+ *
+ * Each attribute has to be processed separately, because we're mixing
+ * different datatypes, and we don't know what equality means for them.
+ * We're also mixing pass-by-value and pass-by-ref types, and so on.
+ *
+ * We'll use 32-bit values for the indexes in step (3), although we
+ * could probably use just 16 bits as we don't allow more than 8k
+ * items in the MCV list max_mcv_items (well, we might increase this to
+ * 32k and still fit into signed 16-bits). But let's be lazy and rely
+ * on the varlena compression to kick in. If most bytes will be 0x00
+ * so it should work nicely.
+ *
+ * FIXME This probably leaks memory, or at least uses it inefficiently
+ *       (many small palloc() calls instead of a large one).
+ *
+ * TODO Consider using 16-bit values for the indexes in step (3).
+ *
+ * TODO Consider packing boolean flags (NULL) for each item into 'char'
+ *      or a longer type (instead of using an array of bool items).
+ */
+bytea *
+serialize_mv_mcvlist(MCVList mcvlist, int2vector *attrs,
+					 VacAttrStats **stats)
+{
+	int	i, j;
+	int	ndims = mcvlist->ndimensions;
+	int	itemsize = ITEM_SIZE(ndims);
+
+	Size	total_length = 0;
+
+	char   *item = palloc0(itemsize);
+
+	/* serialized items (indexes into arrays, etc.) */
+	bytea  *output;
+	char   *data = NULL;
+
+	/* values per dimension (and number of non-NULL values) */
+	Datum **values = (Datum**)palloc0(sizeof(Datum*) * ndims);
+	int	   *counts = (int*)palloc0(sizeof(int) * ndims);
+
+	/* info about dimensions (for deserialize) */
+	DimensionInfo * info
+				= (DimensionInfo *)palloc0(sizeof(DimensionInfo)*ndims);
+
+	/* sort support data */
+	SortSupport	ssup = (SortSupport)palloc0(sizeof(SortSupportData)*ndims);
+
+	/* collect and deduplicate values for each dimension */
+	for (i = 0; i < ndims; i++)
+	{
+		int count;
+		StdAnalyzeData *tmp = (StdAnalyzeData *)stats[i]->extra_data;
+
+		/* keep important info about the data type */
+		info[i].typlen   = stats[i]->attrtype->typlen;
+		info[i].typbyval = stats[i]->attrtype->typbyval;
+
+		/* allocate space for all values, including NULLs (won't use them) */
+		values[i] = (Datum*)palloc0(sizeof(Datum) * mcvlist->nitems);
+
+		for (j = 0; j < mcvlist->nitems; j++)
+		{
+			if (! mcvlist->items[j]->isnull[i])	/* skip NULL values */
+			{
+				values[i][counts[i]] = mcvlist->items[j]->values[i];
+				counts[i] += 1;
+			}
+		}
+
+		/* there are just NULL values in this dimension */
+		if (counts[i] == 0)
+			continue;
+
+		/* sort and deduplicate */
+		ssup[i].ssup_cxt = CurrentMemoryContext;
+		ssup[i].ssup_collation = DEFAULT_COLLATION_OID;
+		ssup[i].ssup_nulls_first = false;
+
+		PrepareSortSupportFromOrderingOp(tmp->ltopr, &ssup[i]);
+
+		qsort_arg(values[i], counts[i], sizeof(Datum),
+										compare_scalars_simple, &ssup[i]);
+
+		/*
+		 * Walk through the array and eliminate duplicitate values, but
+		 * keep the ordering (so that we can do bsearch later). We know
+		 * there's at least 1 item, so we can skip the first element.
+		 */
+		count = 1;	/* number of deduplicated items */
+		for (j = 1; j < counts[i]; j++)
+		{
+			/* if it's different from the previous value, we need to keep it */
+			if (compare_datums_simple(values[i][j-1], values[i][j], &ssup[i]) != 0)
+			{
+				/* XXX: not needed if (count == j) */
+				values[i][count] = values[i][j];
+				count += 1;
+			}
+		}
+
+		/* keep info about the deduplicated count */
+		info[i].nvalues = count;
+
+		/* compute size of the serialized data */
+		if (info[i].typbyval)
+			/*
+			 * passed by value, so just Datum array (int4, int8, ...)
+			 *
+			 * TODO Might save a few bytes here, by storing just typlen
+			 *      bytes instead of whole Datum (8B) on 64-bits.
+			 */
+			info[i].nbytes = info[i].nvalues * sizeof(Datum);
+		else if (info[i].typlen > 0)
+			/* pased by reference, but fixed length (name, tid, ...) */
+			info[i].nbytes = info[i].nvalues * info[i].typlen;
+		else if (info[i].typlen == -1)
+			/* varlena, so just use VARSIZE_ANY */
+			for (j = 0; j < info[i].nvalues; j++)
+				info[i].nbytes += VARSIZE_ANY(values[i][j]);
+		else if (info[i].typlen == -2)
+			/* cstring, so simply strlen */
+			for (j = 0; j < info[i].nvalues; j++)
+				info[i].nbytes += strlen(DatumGetPointer(values[i][j]));
+		else
+			elog(ERROR, "unknown data type typbyval=%d typlen=%d",
+				info[i].typbyval, info[i].typlen);
+	}
+
+	/*
+	 * Now we finally know how much space we'll need for the serialized
+	 * MCV list, as it contains these fields:
+	 *
+	 * - length (4B) for varlena
+	 * - magic (4B)
+	 * - type (4B)
+	 * - ndimensions (4B)
+	 * - nitems (4B)
+	 * - info (ndim * sizeof(DimensionInfo)
+	 * - arrays of values for each dimension
+	 * - serialized items (nitems * itemsize)
+	 *
+	 * So the 'header' size is 20B + ndim * sizeof(DimensionInfo) and
+	 * then we'll place the data.
+	 */
+	total_length = (sizeof(int32) + offsetof(MCVListData, items)
+					+ ndims * sizeof(DimensionInfo)
+					+ mcvlist->nitems * itemsize);
+
+	for (i = 0; i < ndims; i++)
+		total_length += info[i].nbytes;
+
+	/* enforce arbitrary limit of 1MB */
+	if (total_length > 1024 * 1024)
+		elog(ERROR, "serialized MCV exceeds 1MB (%ld)", total_length);
+
+	/* allocate space for the serialized MCV list, set header fields */
+	output = (bytea*)palloc0(total_length);
+	SET_VARSIZE(output, total_length);
+
+	/* we'll use 'ptr' to keep track of the place to write data */
+	data = VARDATA(output);
+
+	memcpy(data, mcvlist, offsetof(MCVListData, items));
+	data += offsetof(MCVListData, items);
+
+	memcpy(data, info, sizeof(DimensionInfo) * ndims);
+	data += sizeof(DimensionInfo) * ndims;
+
+	/* value array for each dimension */
+	for (i = 0; i < ndims; i++)
+	{
+#ifdef USE_ASSERT_CHECKING
+		char *tmp = data;
+#endif
+		for (j = 0; j < info[i].nvalues; j++)
+		{
+			if (info[i].typbyval)
+			{
+				/* passed by value / Datum */
+				memcpy(data, &values[i][j], sizeof(Datum));
+				data += sizeof(Datum);
+			}
+			else if (info[i].typlen > 0)
+			{
+				/* pased by reference, but fixed length (name, tid, ...) */
+				memcpy(data, &values[i][j], info[i].typlen);
+				data += info[i].typlen;
+			}
+			else if (info[i].typlen == -1)
+			{
+				/* varlena */
+				memcpy(data, DatumGetPointer(values[i][j]),
+							VARSIZE_ANY(values[i][j]));
+				data += VARSIZE_ANY(values[i][j]);
+			}
+			else if (info[i].typlen == -2)
+			{
+				/* cstring (don't forget the \0 terminator!) */
+				memcpy(data, DatumGetPointer(values[i][j]),
+							strlen(DatumGetPointer(values[i][j])) + 1);
+				data += strlen(DatumGetPointer(values[i][j])) + 1;
+			}
+		}
+		Assert((data - tmp) == info[i].nbytes);
+	}
+
+	/* and finally, the MCV items */
+	for (i = 0; i < mcvlist->nitems; i++)
+	{
+		/* don't write beyond the allocated space */
+		Assert(data <= (char*)output + total_length - itemsize);
+
+		/* reset the values for each item */
+		memset(item, 0, itemsize);
+
+		for (j = 0; j < ndims; j++)
+		{
+			/* do the lookup only for non-NULL values */
+			if (! mcvlist->items[i]->isnull[j])
+			{
+				Datum * v = NULL;
+				ssup_private = &ssup[j];
+
+				v = (Datum*)bsearch(&mcvlist->items[i]->values[j],
+								values[j], info[j].nvalues, sizeof(Datum),
+								bsearch_comparator);
+
+				if (v == NULL)
+					elog(ERROR, "value for dim %d not found in array", j);
+
+				/* compute index within the array */
+				ITEM_INDEXES(item)[j] = (v - values[j]);
+
+				/* check the index is within expected bounds */
+				Assert(ITEM_INDEXES(item)[j] >= 0);
+				Assert(ITEM_INDEXES(item)[j] < info[j].nvalues);
+			}
+		}
+
+		/* copy NULL and frequency flags into the item */
+		memcpy(ITEM_NULLS(item, ndims),
+				mcvlist->items[i]->isnull, sizeof(bool) * ndims);
+		memcpy(ITEM_FREQUENCY(item, ndims),
+				&mcvlist->items[i]->frequency, sizeof(double));
+
+		/* copy the item into the array */
+		memcpy(data, item, itemsize);
+
+		data += itemsize;
+	}
+
+	/* at this point we expect to match the total_length exactly */
+	Assert((data - (char*)output) == total_length);
+
+	return output;
+}
+
+/* inverse to serialize_mv_mcvlist() - see the comment there */
+MCVList deserialize_mv_mcvlist(bytea * data)
+{
+	int		i, j;
+	Size	expected_size;
+	MCVList mcvlist;
+	char   *tmp;
+
+	int		ndims, nitems, itemsize;
+	DimensionInfo *info = NULL;
+
+	int32  *indexes = NULL;
+	Datum **values = NULL;
+
+	if (data == NULL)
+		return NULL;
+
+	if (VARSIZE_ANY_EXHDR(data) < offsetof(MCVListData,items))
+		elog(ERROR, "invalid MCV Size %ld (expected at least %ld)",
+			 VARSIZE_ANY_EXHDR(data), offsetof(MCVListData,items));
+
+	/* read the MCV list header */
+	mcvlist = (MCVList)palloc0(sizeof(MCVListData));
+
+	/* initialize pointer to the data part (skip the varlena header) */
+	tmp = VARDATA(data);
+
+	/* get the header and perform basic sanity checks */
+	memcpy(mcvlist, tmp, offsetof(MCVListData,items));
+	tmp += offsetof(MCVListData,items);
+
+	if (mcvlist->magic != MVSTAT_MCV_MAGIC)
+		elog(ERROR, "invalid MCV magic %d (expected %dd)",
+			 mcvlist->magic, MVSTAT_MCV_MAGIC);
+
+	if (mcvlist->type != MVSTAT_MCV_TYPE_BASIC)
+		elog(ERROR, "invalid MCV type %d (expected %dd)",
+			 mcvlist->type, MVSTAT_MCV_TYPE_BASIC);
+
+	nitems = mcvlist->nitems;
+	ndims = mcvlist->ndimensions;
+	itemsize = ITEM_SIZE(ndims);
+
+	Assert(nitems > 0);
+	Assert((ndims >= 2) && (ndims <= MVSTATS_MAX_DIMENSIONS));
+
+	/*
+	 * What size do we expect with those parameters (it's incomplete,
+	 * as we yet have to count the array sizes (from DimensionInfo
+	 * records).
+	 */
+	expected_size = offsetof(MCVListData,items) +
+					ndims * sizeof(DimensionInfo) +
+					(nitems * itemsize);
+
+	/* check that we have at least the DimensionInfo records */
+	if (VARSIZE_ANY_EXHDR(data) < expected_size)
+		elog(ERROR, "invalid MCV Size %ld (expected %ld)",
+			 VARSIZE_ANY_EXHDR(data), expected_size);
+
+	info = (DimensionInfo*)(tmp);
+	tmp += ndims * sizeof(DimensionInfo);
+
+	/* account for the value arrays */
+	for (i = 0; i < ndims; i++)
+		expected_size += info[i].nbytes;
+
+	if (VARSIZE_ANY_EXHDR(data) != expected_size)
+		elog(ERROR, "invalid MCV Size %ld (expected %ld)",
+			 VARSIZE_ANY_EXHDR(data), expected_size);
+
+	/* looks OK - not corrupted or something */
+
+	/* let's parse the value arrays */
+	values = (Datum**)palloc0(sizeof(Datum*) * ndims);
+
+	/*
+	 * FIXME This uses pointers to the original data array (the types
+	 *       not passed by value), so when someone frees the memory,
+	 *       e.g. by doing something like this:
+	 *
+	 *           bytea * data = ... fetch the data from catalog ...
+	 *           MCVList mcvlist = deserialize_mcv_list(data);
+	 *           pfree(data);
+	 *
+	 *       then 'mcvlist' references the freed memory. This needs to
+	 *       copy the pieces.
+	 */
+	for (i = 0; i < ndims; i++)
+	{
+		if (info[i].typbyval)
+		{
+			/* passed by value / Datum - simply reuse the array */
+			values[i] = (Datum*)tmp;
+			tmp += info[i].nbytes;
+		}
+		else if (info[i].typlen > 0)
+		{
+			/* pased by reference, but fixed length (name, tid, ...) */
+			values[i] = (Datum*)palloc0(sizeof(Datum) * info[i].nvalues);
+			for (j = 0; j < info[i].nvalues; j++)
+			{
+				/* just point into the array */
+				values[i][j] = PointerGetDatum(tmp);
+				tmp += info[i].typlen;
+			}
+		}
+		else if (info[i].typlen == -1)
+		{
+			/* varlena */
+			values[i] = (Datum*)palloc0(sizeof(Datum) * info[i].nvalues);
+			for (j = 0; j < info[i].nvalues; j++)
+			{
+				/* just point into the array */
+				values[i][j] = PointerGetDatum(tmp);
+				tmp += VARSIZE_ANY(tmp);
+			}
+		}
+		else if (info[i].typlen == -2)
+		{
+			/* cstring */
+			values[i] = (Datum*)palloc0(sizeof(Datum) * info[i].nvalues);
+			for (j = 0; j < info[i].nvalues; j++)
+			{
+				/* just point into the array */
+				values[i][j] = PointerGetDatum(tmp);
+				tmp += (strlen(tmp) + 1); /* don't forget the \0 */
+			}
+		}
+	}
+
+	/* allocate space for the MCV items */
+	mcvlist->items = (MCVItem*)palloc0(sizeof(MCVItem) * nitems);
+
+	for (i = 0; i < nitems; i++)
+	{
+		MCVItem item = (MCVItem)palloc0(sizeof(MCVItemData));
+
+		item->values = (Datum*)palloc0(sizeof(Datum)*ndims);
+		item->isnull = (bool*) palloc0(sizeof(bool) *ndims);
+
+		/* just point to the right place */
+		indexes = ITEM_INDEXES(tmp);
+
+		memcpy(item->isnull, ITEM_NULLS(tmp, ndims), sizeof(bool) * ndims);
+		memcpy(&item->frequency, ITEM_FREQUENCY(tmp, ndims), sizeof(double));
+
+		/* translate the values */
+		for (j = 0; j < ndims; j++)
+			if (! item->isnull[j])
+				item->values[j] = values[j][indexes[j]];
+
+		mcvlist->items[i] = item;
+
+		tmp += ITEM_SIZE(ndims);
+
+		Assert(tmp <= (char*)data + VARSIZE_ANY(data));
+	}
+
+	/* check that we processed all the data */
+	Assert(tmp == (char*)data + VARSIZE_ANY(data));
+
+	return mcvlist;
+}
+
+/*
+ * We need to pass the SortSupport to the comparator, but bsearch()
+ * has no 'context' parameter, so we use a global variable (ugly).
+ */
+static int
+bsearch_comparator(const void * a, const void * b)
+{
+	Assert(ssup_private != NULL);
+	return compare_scalars_simple(a, b, (void*)ssup_private);
+}
diff --git a/src/include/catalog/pg_mv_statistic.h b/src/include/catalog/pg_mv_statistic.h
index 81ec23b..c6e7d74 100644
--- a/src/include/catalog/pg_mv_statistic.h
+++ b/src/include/catalog/pg_mv_statistic.h
@@ -35,15 +35,21 @@ CATALOG(pg_mv_statistic,3381)
 
 	/* statistics requested to build */
 	bool		deps_enabled;		/* analyze dependencies? */
+	bool		mcv_enabled;		/* build MCV list? */
+
+	/* MCV size */
+	int32		mcv_max_items;		/* max MCV items */
 
 	/* statistics that are available (if requested) */
 	bool		deps_built;			/* dependencies were built */
+	bool		mcv_built;			/* MCV list was built */
 
 	/* variable-length fields start here, but we allow direct access to stakeys */
 	int2vector	stakeys;			/* array of column keys */
 
 #ifdef CATALOG_VARLEN
 	bytea		stadeps;			/* dependencies (serialized) */
+	bytea		stamcv;				/* MCV list (serialized) */
 #endif
 
 } FormData_pg_mv_statistic;
@@ -59,11 +65,15 @@ typedef FormData_pg_mv_statistic *Form_pg_mv_statistic;
  *		compiler constants for pg_attrdef
  * ----------------
  */
-#define Natts_pg_mv_statistic					5
+#define Natts_pg_mv_statistic					9
 #define Anum_pg_mv_statistic_starelid			1
 #define Anum_pg_mv_statistic_deps_enabled		2
-#define Anum_pg_mv_statistic_deps_built			3
-#define Anum_pg_mv_statistic_stakeys			4
-#define Anum_pg_mv_statistic_stadeps			5
+#define Anum_pg_mv_statistic_mcv_enabled		3
+#define Anum_pg_mv_statistic_mcv_max_items		4
+#define Anum_pg_mv_statistic_deps_built			5
+#define Anum_pg_mv_statistic_mcv_built			6
+#define Anum_pg_mv_statistic_stakeys			7
+#define Anum_pg_mv_statistic_stadeps			8
+#define Anum_pg_mv_statistic_stamcv				9
 
 #endif   /* PG_MV_STATISTIC_H */
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 2916f11..b2aa815 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -2716,6 +2716,8 @@ DATA(insert OID = 3377 (  pg_mv_stats_dependencies_info     PGNSP PGUID 12 1 0 0
 DESCR("multivariate stats: functional dependencies info");
 DATA(insert OID = 3378 (  pg_mv_stats_dependencies_show     PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 25 "17" _null_ _null_ _null_ _null_ pg_mv_stats_dependencies_show _null_ _null_ _null_ ));
 DESCR("multivariate stats: functional dependencies show");
+DATA(insert OID = 3376 (  pg_mv_stats_mcvlist_info	PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 25 "17" _null_ _null_ _null_ _null_ pg_mv_stats_mcvlist_info _null_ _null_ _null_ ));
+DESCR("multi-variate statistics: MCV list info");
 
 DATA(insert OID = 1928 (  pg_stat_get_numscans			PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_numscans _null_ _null_ _null_ ));
 DESCR("statistics: number of scans done for table/index");
diff --git a/src/include/utils/mvstats.h b/src/include/utils/mvstats.h
index ec6764b..6ff29d6 100644
--- a/src/include/utils/mvstats.h
+++ b/src/include/utils/mvstats.h
@@ -25,9 +25,11 @@ typedef struct MVStatsData {
 
 	/* statistics requested in ALTER TABLE ... ADD STATISTICS */
 	bool		deps_enabled;	/* analyze functional dependencies */
+	bool		mcv_enabled;	/* analyze MCV lists */
 
 	/* available statistics (computed by ANALYZE) */
 	bool		deps_built;	/* functional dependencies available */
+	bool		mcv_built;	/* MCV list is already available */
 } MVStatsData;
 
 typedef struct MVStatsData *MVStats;
@@ -66,6 +68,47 @@ typedef MVDependenciesData* MVDependencies;
 #define MVSTAT_DEPS_TYPE_BASIC	1			/* basic dependencies type */
 
 /*
+ * Multivariate MCV (most-common value) lists
+ *
+ * A straight-forward extension of MCV items - i.e. a list (array) of
+ * combinations of attribute values, together with a frequency and
+ * null flags.
+ */
+typedef struct MCVItemData {
+	double		frequency;	/* frequency of this combination */
+	bool	   *isnull;		/* lags of NULL values (up to 32 columns) */
+	Datum	   *values;		/* variable-length (ndimensions) */
+} MCVItemData;
+
+typedef MCVItemData *MCVItem;
+
+/* multivariate MCV list - essentally an array of MCV items */
+typedef struct MCVListData {
+	uint32		magic;			/* magic constant marker */
+	uint32		type;			/* type of MCV list (BASIC) */
+	uint32		ndimensions;	/* number of dimensions */
+	uint32		nitems;			/* number of MCV items in the array */
+	MCVItem	   *items;			/* array of MCV items */
+} MCVListData;
+
+typedef MCVListData *MCVList;
+
+/* used to flag stats serialized to bytea */
+#define MVSTAT_MCV_MAGIC		0xE1A651C2	/* marks serialized bytea */
+#define MVSTAT_MCV_TYPE_BASIC	1			/* basic MCV list type */
+
+/*
+ * Limits used for mcv_max_items option, i.e. we're always guaranteed
+ * to have space for at least MVSTAT_MCVLIST_MIN_ITEMS, and we cannot
+ * have more than MVSTAT_MCVLIST_MAX_ITEMS items.
+ *
+ * This is just a boundary for the 'max' threshold - the actual list
+ * may of course contain less items than MVSTAT_MCVLIST_MIN_ITEMS.
+ */
+#define MVSTAT_MCVLIST_MIN_ITEMS	128		/* min items in MCV list */
+#define MVSTAT_MCVLIST_MAX_ITEMS	8192	/* max items in MCV list */
+
+/*
  * TODO Maybe fetching the histogram/MCV list separately is inefficient?
  *      Consider adding a single `fetch_stats` method, fetching all
  *      stats specified using flags (or something like that).
@@ -74,24 +117,39 @@ MVStats list_mv_stats(Oid relid, int *nstats, bool built_only);
 bytea * fetch_mv_rules(Oid mvoid);
 
 bytea * fetch_mv_dependencies(Oid mvoid);
+bytea * fetch_mv_mcvlist(Oid mvoid);
 
 bytea * serialize_mv_dependencies(MVDependencies dependencies);
+bytea * serialize_mv_mcvlist(MCVList mcvlist, int2vector *attrs,
+							 VacAttrStats **stats);
 
 /* deserialization of stats (serialization is private to analyze) */
 MVDependencies	deserialize_mv_dependencies(bytea * data);
+MCVList			deserialize_mv_mcvlist(bytea * data);
+
+/*
+ * Returns index of the attribute number within the vector (i.e. a
+ * dimension within the stats).
+ */
+int mv_get_index(AttrNumber varattno, int2vector * stakeys);
 
 /* FIXME this probably belongs somewhere else (not to operations stats) */
 extern Datum pg_mv_stats_dependencies_info(PG_FUNCTION_ARGS);
 extern Datum pg_mv_stats_dependencies_show(PG_FUNCTION_ARGS);
+extern Datum pg_mv_stats_mcvlist_info(PG_FUNCTION_ARGS);
 
 MVDependencies
-build_mv_dependencies(int numrows, HeapTuple *rows,
-								  int2vector *attrs,
-								  VacAttrStats **stats);
+build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs,
+					  VacAttrStats **stats);
+
+MCVList
+build_mv_mcvlist(int numrows, HeapTuple *rows, int2vector *attrs,
+				 VacAttrStats **stats, int *numrows_filtered);
 
 void build_mv_stats(Relation onerel, int numrows, HeapTuple *rows,
-						   int natts, VacAttrStats **vacattrstats);
+					int natts, VacAttrStats **vacattrstats);
 
-void update_mv_stats(Oid relid, MVDependencies dependencies);
+void update_mv_stats(Oid relid, MVDependencies dependencies, MCVList mcvlist,
+					 int2vector *attrs, VacAttrStats **stats);
 
 #endif
diff --git a/src/test/regress/expected/mv_mcv.out b/src/test/regress/expected/mv_mcv.out
new file mode 100644
index 0000000..595cfbf
--- /dev/null
+++ b/src/test/regress/expected/mv_mcv.out
@@ -0,0 +1,210 @@
+-- data type passed by value
+CREATE TABLE mcv_list (
+    a INT,
+    b INT,
+    c INT
+);
+-- unknown column
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (unknown_column);
+ERROR:  column "unknown_column" referenced in statistics does not exist
+-- single column
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a);
+ERROR:  multivariate stats require 2 or more columns
+-- single column, duplicated
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, a);
+ERROR:  duplicate column name in statistics definition
+-- two columns, one duplicated
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, a, b);
+ERROR:  duplicate column name in statistics definition
+-- unknown option
+ALTER TABLE mcv_list ADD STATISTICS (unknown_option) ON (a, b, c);
+ERROR:  unrecognized STATISTICS option "unknown_option"
+-- missing MCV statistics
+ALTER TABLE mcv_list ADD STATISTICS (dependencies, max_mcv_items 200) ON (a, b, c);
+ERROR:  option 'mcv' is required by other options(s)
+-- invalid mcv_max_items value / too low
+ALTER TABLE mcv_list ADD STATISTICS (mcv, max_mcv_items 10) ON (a, b, c);
+ERROR:  max number of MCV items must be at least 128
+-- invalid mcv_max_items value / too high
+ALTER TABLE mcv_list ADD STATISTICS (mcv, max_mcv_items 10000) ON (a, b, c);
+ERROR:  max number of MCV items is 8192
+-- correct command
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, b, c);
+-- random data
+INSERT INTO mcv_list
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | f         | 
+(1 row)
+
+TRUNCATE mcv_list;
+-- a => b, a => c, b => c
+INSERT INTO mcv_list
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=1000
+(1 row)
+
+TRUNCATE mcv_list;
+-- a => b, a => c
+INSERT INTO mcv_list
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=1000
+(1 row)
+
+TRUNCATE mcv_list;
+-- check explain (expect bitmap index scan, not plain index scan)
+INSERT INTO mcv_list
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+CREATE INDEX mcv_idx ON mcv_list (a, b);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=100
+(1 row)
+
+EXPLAIN (COSTS off)
+ SELECT * FROM mcv_list WHERE a = 10 AND b = 5;
+                 QUERY PLAN                 
+--------------------------------------------
+ Bitmap Heap Scan on mcv_list
+   Recheck Cond: ((a = 10) AND (b = 5))
+   ->  Bitmap Index Scan on mcv_idx
+         Index Cond: ((a = 10) AND (b = 5))
+(4 rows)
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+DROP TABLE mcv_list;
+-- varlena type (text)
+CREATE TABLE mcv_list (
+    a TEXT,
+    b TEXT,
+    c TEXT
+);
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, b, c);
+-- random data
+INSERT INTO mcv_list
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | f         | 
+(1 row)
+
+TRUNCATE mcv_list;
+-- a => b, a => c, b => c
+INSERT INTO mcv_list
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=1000
+(1 row)
+
+TRUNCATE mcv_list;
+-- a => b, a => c
+INSERT INTO mcv_list
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=1000
+(1 row)
+
+TRUNCATE mcv_list;
+-- check explain (expect bitmap index scan, not plain index scan)
+INSERT INTO mcv_list
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+CREATE INDEX mcv_idx ON mcv_list (a, b);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=100
+(1 row)
+
+EXPLAIN (COSTS off)
+ SELECT * FROM mcv_list WHERE a = '10' AND b = '5';
+                         QUERY PLAN                         
+------------------------------------------------------------
+ Bitmap Heap Scan on mcv_list
+   Recheck Cond: ((a = '10'::text) AND (b = '5'::text))
+   ->  Bitmap Index Scan on mcv_idx
+         Index Cond: ((a = '10'::text) AND (b = '5'::text))
+(4 rows)
+
+TRUNCATE mcv_list;
+-- check explain (expect bitmap index scan, not plain index scan) with NULLs
+INSERT INTO mcv_list
+     SELECT
+       (CASE WHEN i/10000 = 0 THEN NULL ELSE i/10000 END),
+       (CASE WHEN i/20000 = 0 THEN NULL ELSE i/20000 END),
+       (CASE WHEN i/40000 = 0 THEN NULL ELSE i/40000 END)
+     FROM generate_series(1,1000000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=100
+(1 row)
+
+EXPLAIN (COSTS off)
+ SELECT * FROM mcv_list WHERE a IS NULL AND b IS NULL;
+                    QUERY PLAN                     
+---------------------------------------------------
+ Bitmap Heap Scan on mcv_list
+   Recheck Cond: ((a IS NULL) AND (b IS NULL))
+   ->  Bitmap Index Scan on mcv_idx
+         Index Cond: ((a IS NULL) AND (b IS NULL))
+(4 rows)
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+DROP TABLE mcv_list;
+-- NULL values (mix of int and text columns)
+CREATE TABLE mcv_list (
+    a INT,
+    b TEXT,
+    c INT,
+    d TEXT
+);
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, b, c, d);
+INSERT INTO mcv_list
+     SELECT
+         mod(i, 100),
+         (CASE WHEN mod(i, 200) = 0 THEN NULL ELSE mod(i,200) END),
+         mod(i, 400),
+         (CASE WHEN mod(i, 300) = 0 THEN NULL ELSE mod(i,600) END)
+     FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+ mcv_enabled | mcv_built | pg_mv_stats_mcvlist_info 
+-------------+-----------+--------------------------
+ t           | t         | nitems=1200
+(1 row)
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+DROP TABLE mcv_list;
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index f0117ca..6d9ab2f 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1357,7 +1357,9 @@ pg_mv_stats| SELECT n.nspname AS schemaname,
     c.relname AS tablename,
     s.stakeys AS attnums,
     length(s.stadeps) AS depsbytes,
-    pg_mv_stats_dependencies_info(s.stadeps) AS depsinfo
+    pg_mv_stats_dependencies_info(s.stadeps) AS depsinfo,
+    length(s.stamcv) AS mcvbytes,
+    pg_mv_stats_mcvlist_info(s.stamcv) AS mcvinfo
    FROM ((pg_mv_statistic s
      JOIN pg_class c ON ((c.oid = s.starelid)))
      LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace)));
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 00c6ddf..63727a4 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -111,4 +111,4 @@ test: event_trigger
 test: stats
 
 # run tests of multivariate stats
-test: mv_dependencies
+test: mv_dependencies mv_mcv
diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule
index b818be9..5b07b3b 100644
--- a/src/test/regress/serial_schedule
+++ b/src/test/regress/serial_schedule
@@ -154,3 +154,4 @@ test: xml
 test: event_trigger
 test: stats
 test: mv_dependencies
+test: mv_mcv
diff --git a/src/test/regress/sql/mv_mcv.sql b/src/test/regress/sql/mv_mcv.sql
new file mode 100644
index 0000000..410b52d
--- /dev/null
+++ b/src/test/regress/sql/mv_mcv.sql
@@ -0,0 +1,181 @@
+-- data type passed by value
+CREATE TABLE mcv_list (
+    a INT,
+    b INT,
+    c INT
+);
+
+-- unknown column
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (unknown_column);
+
+-- single column
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a);
+
+-- single column, duplicated
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, a);
+
+-- two columns, one duplicated
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, a, b);
+
+-- unknown option
+ALTER TABLE mcv_list ADD STATISTICS (unknown_option) ON (a, b, c);
+
+-- missing MCV statistics
+ALTER TABLE mcv_list ADD STATISTICS (dependencies, max_mcv_items 200) ON (a, b, c);
+
+-- invalid mcv_max_items value / too low
+ALTER TABLE mcv_list ADD STATISTICS (mcv, max_mcv_items 10) ON (a, b, c);
+
+-- invalid mcv_max_items value / too high
+ALTER TABLE mcv_list ADD STATISTICS (mcv, max_mcv_items 10000) ON (a, b, c);
+
+-- correct command
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, b, c);
+
+-- random data
+INSERT INTO mcv_list
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+TRUNCATE mcv_list;
+
+-- a => b, a => c, b => c
+INSERT INTO mcv_list
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+TRUNCATE mcv_list;
+
+-- a => b, a => c
+INSERT INTO mcv_list
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+TRUNCATE mcv_list;
+
+-- check explain (expect bitmap index scan, not plain index scan)
+INSERT INTO mcv_list
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+CREATE INDEX mcv_idx ON mcv_list (a, b);
+
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+EXPLAIN (COSTS off)
+ SELECT * FROM mcv_list WHERE a = 10 AND b = 5;
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+DROP TABLE mcv_list;
+
+-- varlena type (text)
+CREATE TABLE mcv_list (
+    a TEXT,
+    b TEXT,
+    c TEXT
+);
+
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, b, c);
+
+-- random data
+INSERT INTO mcv_list
+     SELECT mod(i, 111), mod(i, 123), mod(i, 23) FROM generate_series(1,10000) s(i);
+
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+TRUNCATE mcv_list;
+
+-- a => b, a => c, b => c
+INSERT INTO mcv_list
+     SELECT i/10, i/100, i/200 FROM generate_series(1,10000) s(i);
+
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+TRUNCATE mcv_list;
+
+-- a => b, a => c
+INSERT INTO mcv_list
+     SELECT i/10, i/150, i/200 FROM generate_series(1,10000) s(i);
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+TRUNCATE mcv_list;
+
+-- check explain (expect bitmap index scan, not plain index scan)
+INSERT INTO mcv_list
+     SELECT i/10000, i/20000, i/40000 FROM generate_series(1,1000000) s(i);
+CREATE INDEX mcv_idx ON mcv_list (a, b);
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+EXPLAIN (COSTS off)
+ SELECT * FROM mcv_list WHERE a = '10' AND b = '5';
+
+TRUNCATE mcv_list;
+
+-- check explain (expect bitmap index scan, not plain index scan) with NULLs
+INSERT INTO mcv_list
+     SELECT
+       (CASE WHEN i/10000 = 0 THEN NULL ELSE i/10000 END),
+       (CASE WHEN i/20000 = 0 THEN NULL ELSE i/20000 END),
+       (CASE WHEN i/40000 = 0 THEN NULL ELSE i/40000 END)
+     FROM generate_series(1,1000000) s(i);
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+EXPLAIN (COSTS off)
+ SELECT * FROM mcv_list WHERE a IS NULL AND b IS NULL;
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+DROP TABLE mcv_list;
+
+-- NULL values (mix of int and text columns)
+CREATE TABLE mcv_list (
+    a INT,
+    b TEXT,
+    c INT,
+    d TEXT
+);
+
+ALTER TABLE mcv_list ADD STATISTICS (mcv) ON (a, b, c, d);
+
+INSERT INTO mcv_list
+     SELECT
+         mod(i, 100),
+         (CASE WHEN mod(i, 200) = 0 THEN NULL ELSE mod(i,200) END),
+         mod(i, 400),
+         (CASE WHEN mod(i, 300) = 0 THEN NULL ELSE mod(i,600) END)
+     FROM generate_series(1,10000) s(i);
+
+ANALYZE mcv_list;
+
+SELECT mcv_enabled, mcv_built, pg_mv_stats_mcvlist_info(stamcv)
+  FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+
+DELETE FROM pg_mv_statistic WHERE starelid = 'mcv_list'::regclass;
+DROP TABLE mcv_list;
-- 
2.0.5

