From c66c9cd2d5ec3c3433e6c9a8b3477b274468442a Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Thu, 3 Aug 2017 21:55:10 +0200
Subject: [PATCH 1/3] Multivariate MCV list statistics

---
 doc/src/sgml/catalogs.sgml                       |   10 +
 doc/src/sgml/planstats.sgml                      |  139 ++
 doc/src/sgml/ref/create_statistics.sgml          |   32 +-
 src/backend/commands/statscmds.c                 |  107 +-
 src/backend/optimizer/path/clausesel.c           |   10 +
 src/backend/optimizer/util/plancat.c             |   12 +
 src/backend/statistics/Makefile                  |    2 +-
 src/backend/statistics/README.mcv                |  137 ++
 src/backend/statistics/dependencies.c            |   74 +-
 src/backend/statistics/extended_stats.c          |  215 ++-
 src/backend/statistics/mcv.c                     | 1809 ++++++++++++++++++++++
 src/backend/utils/adt/ruleutils.c                |   24 +-
 src/bin/psql/describe.c                          |    9 +-
 src/include/catalog/pg_cast.h                    |    5 +
 src/include/catalog/pg_proc.h                    |   12 +
 src/include/catalog/pg_statistic_ext.h           |    5 +-
 src/include/catalog/pg_type.h                    |    4 +
 src/include/statistics/extended_stats_internal.h |   34 +-
 src/include/statistics/statistics.h              |   47 +
 src/test/regress/expected/opr_sanity.out         |    3 +-
 src/test/regress/expected/stats_ext.out          |  219 ++-
 src/test/regress/expected/type_sanity.out        |    3 +-
 src/test/regress/sql/stats_ext.sql               |  121 ++
 23 files changed, 2957 insertions(+), 76 deletions(-)
 create mode 100644 src/backend/statistics/README.mcv
 create mode 100644 src/backend/statistics/mcv.c

diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index ef7054c..e07fe46 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -6468,6 +6468,16 @@ SCRAM-SHA-256$<replaceable>&lt;iteration count&gt;</>:<replaceable>&lt;salt&gt;<
       </entry>
      </row>
 
+     <row>
+      <entry><structfield>stxmcv</structfield></entry>
+      <entry><type>pg_mcv_list</type></entry>
+      <entry></entry>
+      <entry>
+       MCV (most-common values) list statistics, serialized as
+       <structname>pg_mcv_list</> type.
+      </entry>
+     </row>
+
     </tbody>
    </tgroup>
   </table>
diff --git a/doc/src/sgml/planstats.sgml b/doc/src/sgml/planstats.sgml
index 838fcda..1e81d94 100644
--- a/doc/src/sgml/planstats.sgml
+++ b/doc/src/sgml/planstats.sgml
@@ -585,6 +585,145 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a, b;
    </para>
 
   </sect2>
+
+  <sect2 id="mcv-lists">
+   <title>MCV lists</title>
+
+   <para>
+    As explained in the previous section, functional dependencies are a very
+    cheap and efficient type of statistics, but they have limitations due to
+    their global nature (they only track dependencies between columns, not
+    between the values stored in the columns).
+   </para>
+
+   <para>
+    This section introduces multivariate most-common values (<acronym>MCV</>)
+    lists, a direct generalization of the statistics introduced in
+    <xref linkend="row-estimation-examples">, that is not subject to this
+    limitation. It is however more expensive, both in terms of storage and
+    planning time.
+   </para>
+
+   <para>
+    Let's look at the example query from the previous section again, creating
+    a multivariate <acronym>MCV</> list on the columns (after dropping the
+    functional dependencies, to make sure the planner uses the newly created
+    <acronym>MCV</> list when computing the estimates).
+
+<programlisting>
+CREATE STATISTICS stts2 (mcv) ON a, b FROM t;
+ANALYZE t;
+EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 1;
+                                           QUERY PLAN
+-------------------------------------------------------------------------------------------------
+ Seq Scan on t  (cost=0.00..195.00 rows=100 width=8) (actual time=0.036..3.011 rows=100 loops=1)
+   Filter: ((a = 1) AND (b = 1))
+   Rows Removed by Filter: 9900
+ Planning time: 0.188 ms
+ Execution time: 3.229 ms
+(5 rows)
+</programlisting>
+
+    The estimate is as accurate as with the functional dependencies, mostly
+    thanks to the table being fairly small and having a simple distribution
+    with low number of distinct values. Before looking at the second query,
+    which was not handled by functional dependencies this well, let's inspect
+    the <acronym>MCV</> list a bit.
+   </para>
+
+   <para>
+    First, let's list statistics defined on a table using <command>\d</>
+    in <application>psql</>:
+
+<programlisting>
+\d t
+       Table "public.t"
+ Column |  Type   | Modifiers
+--------+---------+-----------
+ a      | integer |
+ b      | integer |
+Statistics objects:
+    "public"."stts2" (mcv) ON a, b FROM t
+</programlisting>
+
+   </para>
+
+   <para>
+    Inspecting the contents of the MCV list is possible using
+    <function>pg_mcv_list_items</> set-returning function.
+
+<programlisting>
+SELECT * FROM pg_mcv_list_items((SELECT oid FROM pg_statistic_ext WHERE stxname = 'stts2'));
+ index | values  | nulls | frequency
+-------+---------+-------+-----------
+     0 | {0,0}   | {f,f} |      0.01
+     1 | {1,1}   | {f,f} |      0.01
+     2 | {2,2}   | {f,f} |      0.01
+...
+    49 | {49,49} | {f,f} |      0.01
+    50 | {50,0}  | {f,f} |      0.01
+...
+    97 | {97,47} | {f,f} |      0.01
+    98 | {98,48} | {f,f} |      0.01
+    99 | {99,49} | {f,f} |      0.01
+(100 rows)
+</programlisting>
+
+    This confirms there are 100 distinct combinations of values in the two
+    columns, and all of them are equally likely (1% frequency for each).
+    Had there been any null values in either of the columns, this would be
+    identified in the <structfield>nulls</> column.
+   </para>
+
+   <para>
+    When estimating the selectivity, the planner applies all the conditions
+    on items in the <acronym>MCV</> list, and then sums the frequencies
+    of the matching ones. See <function>mcv_clauselist_selectivity</>
+    in <filename>mcv.c</> for details.
+   </para>
+
+   <para>
+    Compared to functional dependencies, <acronym>MCV</> lists have two major
+    advantages. Firstly, the list stores actual values, making it possible to
+    detect "incompatible" combinations.
+
+<programlisting>
+EXPLAIN ANALYZE SELECT * FROM t WHERE a = 1 AND b = 10;
+                                         QUERY PLAN
+---------------------------------------------------------------------------------------------
+ Seq Scan on t  (cost=0.00..195.00 rows=1 width=8) (actual time=2.823..2.823 rows=0 loops=1)
+   Filter: ((a = 1) AND (b = 10))
+   Rows Removed by Filter: 10000
+ Planning time: 0.268 ms
+ Execution time: 2.866 ms
+(5 rows)
+</programlisting>
+
+    Secondly, <acronym>MCV</> lists also handle a wide range of clause types,
+    not just equality clauses like functional dependencies. See for example
+    the range query presented earlier:
+
+<programlisting>
+EXPLAIN ANALYZE SELECT * FROM t WHERE a <= 49 AND b > 49;
+                                         QUERY PLAN
+---------------------------------------------------------------------------------------------
+ Seq Scan on t  (cost=0.00..195.00 rows=1 width=8) (actual time=3.349..3.349 rows=0 loops=1)
+   Filter: ((a <= 49) AND (b > 49))
+   Rows Removed by Filter: 10000
+ Planning time: 0.163 ms
+ Execution time: 3.389 ms
+(5 rows)
+</programlisting>
+
+   </para>
+
+   <para>
+    For additional information about multivariate MCV lists, see
+    <filename>src/backend/statistics/README.mcv</>.
+   </para>
+
+  </sect2>
+
  </sect1>
 
  <sect1 id="planner-stats-security">
diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml
index deda21f..52851da 100644
--- a/doc/src/sgml/ref/create_statistics.sgml
+++ b/doc/src/sgml/ref/create_statistics.sgml
@@ -81,9 +81,10 @@ CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="PARAMETER">statistics_na
      <para>
       A statistic type to be computed in this statistics object.
       Currently supported types are
-      <literal>ndistinct</literal>, which enables n-distinct statistics, and
-      <literal>dependencies</literal>, which enables functional
-      dependency statistics.
+      <literal>ndistinct</literal>, which enables n-distinct statistics,
+      <literal>dependencies</literal>, which enables functional dependency
+      statistics, and <literal>mcv</literal>, which enables most-common
+      values lists.
       If this clause is omitted, all supported statistic types are
       included in the statistics object.
       For more information, see <xref linkend="planner-stats-extended">
@@ -164,6 +165,31 @@ EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
    conditions are redundant and does not underestimate the rowcount.
   </para>
 
+  <para>
+   Create table <structname>t2</> with two perfectly correlated columns
+   (containing identical data), and a MCV list on those columns:
+
+<programlisting>
+CREATE TABLE t2 (
+    a   int,
+    b   int
+);
+
+INSERT INTO t2 SELECT mod(i,100), mod(i,100)
+                 FROM generate_series(1,1000000) s(i);
+
+CREATE STATISTICS s2 (mcv) ON a, b FROM t2;
+
+ANALYZE t2;
+
+-- valid combination (found in MCV)
+EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 1);
+
+-- invalid combination (not found in MCV)
+EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 2);
+</programlisting>
+  </para>
+
  </refsect1>
 
  <refsect1>
diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c
index 4765055..0bcea4b 100644
--- a/src/backend/commands/statscmds.c
+++ b/src/backend/commands/statscmds.c
@@ -64,11 +64,12 @@ CreateStatistics(CreateStatsStmt *stmt)
 	Oid			relid;
 	ObjectAddress parentobject,
 				myself;
-	Datum		types[2];		/* one for each possible type of statistic */
+	Datum		types[3];		/* one for each possible type of statistic */
 	int			ntypes;
 	ArrayType  *stxkind;
 	bool		build_ndistinct;
 	bool		build_dependencies;
+	bool		build_mcv;
 	bool		requested_type = false;
 	int			i;
 	ListCell   *cell;
@@ -246,6 +247,7 @@ CreateStatistics(CreateStatsStmt *stmt)
 	 */
 	build_ndistinct = false;
 	build_dependencies = false;
+	build_mcv = false;
 	foreach(cell, stmt->stat_types)
 	{
 		char	   *type = strVal((Value *) lfirst(cell));
@@ -260,6 +262,11 @@ CreateStatistics(CreateStatsStmt *stmt)
 			build_dependencies = true;
 			requested_type = true;
 		}
+		else if (strcmp(type, "mcv") == 0)
+		{
+			build_mcv = true;
+			requested_type = true;
+		}
 		else
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
@@ -271,6 +278,7 @@ CreateStatistics(CreateStatsStmt *stmt)
 	{
 		build_ndistinct = true;
 		build_dependencies = true;
+		build_mcv = true;
 	}
 
 	/* construct the char array of enabled statistic types */
@@ -279,6 +287,8 @@ CreateStatistics(CreateStatsStmt *stmt)
 		types[ntypes++] = CharGetDatum(STATS_EXT_NDISTINCT);
 	if (build_dependencies)
 		types[ntypes++] = CharGetDatum(STATS_EXT_DEPENDENCIES);
+	if (build_mcv)
+		types[ntypes++] = CharGetDatum(STATS_EXT_MCV);
 	Assert(ntypes > 0 && ntypes <= lengthof(types));
 	stxkind = construct_array(types, ntypes, CHAROID, 1, true, 'c');
 
@@ -297,6 +307,7 @@ CreateStatistics(CreateStatsStmt *stmt)
 	/* no statistics built yet */
 	nulls[Anum_pg_statistic_ext_stxndistinct - 1] = true;
 	nulls[Anum_pg_statistic_ext_stxdependencies - 1] = true;
+	nulls[Anum_pg_statistic_ext_stxmcv - 1] = true;
 
 	/* insert it into pg_statistic_ext */
 	statrel = heap_open(StatisticExtRelationId, RowExclusiveLock);
@@ -387,21 +398,95 @@ RemoveStatisticsById(Oid statsOid)
  * null until the next ANALYZE.  (Note that the type change hasn't actually
  * happened yet, so one option that's *not* on the table is to recompute
  * immediately.)
+ *
+ * For both ndistinct and functional-dependencies stats, the on-disk
+ * representation is independent of the source column data types, and it is
+ * plausible to assume that the old statistic values will still be good for
+ * the new column contents.  (Obviously, if the ALTER COLUMN TYPE has a USING
+ * expression that substantially alters the semantic meaning of the column
+ * values, this assumption could fail.  But that seems like a corner case
+ * that doesn't justify zapping the stats in common cases.)
+ *
+ * For MCV lists that's not the case, as those statistics store the datums
+ * internally. In this case we simply reset the statistics value to NULL.
  */
 void
 UpdateStatisticsForTypeChange(Oid statsOid, Oid relationOid, int attnum,
 							  Oid oldColumnType, Oid newColumnType)
 {
+	Form_pg_statistic_ext staForm;
+	HeapTuple	stup,
+				oldtup;
+	int			i;
+
+	/* Do we need to reset anything? */
+	bool		attribute_referenced;
+	bool		reset_stats = false;
+
+	Relation	rel;
+
+	Datum		values[Natts_pg_statistic_ext];
+	bool		nulls[Natts_pg_statistic_ext];
+	bool		replaces[Natts_pg_statistic_ext];
+
+	oldtup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statsOid));
+	if (!oldtup)
+		elog(ERROR, "cache lookup failed for statistics object %u", statsOid);
+	staForm = (Form_pg_statistic_ext) GETSTRUCT(oldtup);
+
+	/*
+	 * If the modified attribute is not referenced by this statistic, we
+	 * can simply leave the statistics alone.
+	 */
+	attribute_referenced = false;
+	for (i = 0; i < staForm->stxkeys.dim1; i++)
+		if (attnum == staForm->stxkeys.values[i])
+			attribute_referenced = true;
+
 	/*
-	 * Currently, we don't actually need to do anything here.  For both
-	 * ndistinct and functional-dependencies stats, the on-disk representation
-	 * is independent of the source column data types, and it is plausible to
-	 * assume that the old statistic values will still be good for the new
-	 * column contents.  (Obviously, if the ALTER COLUMN TYPE has a USING
-	 * expression that substantially alters the semantic meaning of the column
-	 * values, this assumption could fail.  But that seems like a corner case
-	 * that doesn't justify zapping the stats in common cases.)
-	 *
-	 * Future types of extended stats will likely require us to work harder.
+	 * MCV lists store the actual datum values, so they do need to be
+	 * reset whenever the type of a referenced attribute changes.
 	 */
+	if (statext_is_kind_built(oldtup, STATS_EXT_MCV))
+		reset_stats = true;
+
+	/*
+	 * If the attribute is not referenced by this statistics object, or if
+	 * none of the built statistics types needs a reset, we can leave the
+	 * statistics as they are and bail out early.
+	 */
+	if (!attribute_referenced || !reset_stats)
+	{
+		ReleaseSysCache(oldtup);
+		return;
+	}
+
+	/*
+	 * OK, we need to reset some statistics. So let's build the new tuple,
+	 * replacing the affected statistics types with NULL.
+	 */
+	memset(nulls, 1, Natts_pg_statistic_ext * sizeof(bool));
+	memset(replaces, 0, Natts_pg_statistic_ext * sizeof(bool));
+	memset(values, 0, Natts_pg_statistic_ext * sizeof(Datum));
+
+	if (statext_is_kind_built(oldtup, STATS_EXT_MCV))
+	{
+		replaces[Anum_pg_statistic_ext_stxmcv - 1] = true;
+		nulls[Anum_pg_statistic_ext_stxmcv - 1] = true;
+	}
+
+	rel = heap_open(StatisticExtRelationId, RowExclusiveLock);
+
+	/* replace the old tuple */
+	stup = heap_modify_tuple(oldtup,
+							 RelationGetDescr(rel),
+							 values,
+							 nulls,
+							 replaces);
+
+	ReleaseSysCache(oldtup);
+	CatalogTupleUpdate(rel, &stup->t_self, stup);
+
+	heap_freetuple(stup);
+
+	heap_close(rel, RowExclusiveLock);
 }
diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c
index 9d34025..28a9321 100644
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -125,6 +125,16 @@ clauselist_selectivity(PlannerInfo *root,
 	if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL)
 	{
 		/*
+		 * Perform selectivity estimations on any clauses applicable by
+		 * mcv_clauselist_selectivity.  'estimatedclauses' will be filled with
+		 * the 0-based list positions of clauses used that way, so that we can
+		 * ignore them below.
+		 */
+		s1 *= mcv_clauselist_selectivity(root, clauses, varRelid,
+										 jointype, sjinfo, rel,
+										 &estimatedclauses);
+
+		/*
 		 * Perform selectivity estimations on any clauses found applicable by
 		 * dependencies_clauselist_selectivity.  'estimatedclauses' will be
 		 * filled with the 0-based list positions of clauses used that way, so
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index dc0b0b0..ab2c8c2 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -1321,6 +1321,18 @@ get_relation_statistics(RelOptInfo *rel, Relation relation)
 			stainfos = lcons(info, stainfos);
 		}
 
+		if (statext_is_kind_built(htup, STATS_EXT_MCV))
+		{
+			StatisticExtInfo *info = makeNode(StatisticExtInfo);
+
+			info->statOid = statOid;
+			info->rel = rel;
+			info->kind = STATS_EXT_MCV;
+			info->keys = bms_copy(keys);
+
+			stainfos = lcons(info, stainfos);
+		}
+
 		ReleaseSysCache(htup);
 		bms_free(keys);
 	}
diff --git a/src/backend/statistics/Makefile b/src/backend/statistics/Makefile
index 3404e45..d281526 100644
--- a/src/backend/statistics/Makefile
+++ b/src/backend/statistics/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/statistics
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = extended_stats.o dependencies.o mvdistinct.o
+OBJS = extended_stats.o dependencies.o mcv.o mvdistinct.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/statistics/README.mcv b/src/backend/statistics/README.mcv
new file mode 100644
index 0000000..22c2b87
--- /dev/null
+++ b/src/backend/statistics/README.mcv
@@ -0,0 +1,137 @@
+MCV lists
+=========
+
+Multivariate MCV (most-common values) lists are a straightforward extension of
+regular MCV list, tracking most frequent combinations of values for a group of
+attributes.
+
+This works particularly well for columns with a small number of distinct values,
+as the list may include all the combinations and approximate the distribution
+very accurately.
+
+For columns with a large number of distinct values (e.g. those with continuous
+domains), the list will only track the most frequent combinations. If the
+distribution is mostly uniform (all combinations about equally frequent), the
+MCV list will be empty.
+
+Estimates of some clauses (e.g. equality) based on MCV lists are more accurate
+than when using histograms.
+
+Also, MCV lists don't necessarily require sorting of the values (the fact that
+we use sorting when building them is an implementation detail), but even more
+importantly the ordering is not built into the approximation (while histograms
+are built on ordering). So MCV lists work well even for attributes where the
+ordering of the data type is disconnected from the meaning of the data. For
+example we know how to sort strings, but it's unlikely to make much sense for
+city names (or other label-like attributes).
+
+
+Selectivity estimation
+----------------------
+
+The estimation, implemented in mcv_clauselist_selectivity(), is quite
+simple in principle - we need to identify MCV items matching all the clauses
+and sum frequencies of all those items.
+
+Currently MCV lists support estimation of the following clause types:
+
+    (a) equality clauses    WHERE (a = 1) AND (b = 2)
+    (b) inequality clauses  WHERE (a < 1) AND (b >= 2)
+    (c) NULL clauses        WHERE (a IS NULL) AND (b IS NOT NULL)
+    (d) OR clauses          WHERE (a < 1) OR (b >= 2)
+
+It's possible to add support for additional clauses, for example:
+
+    (e) multi-var clauses   WHERE (a > b)
+
+and possibly others. These are tasks for the future, not yet implemented.
+
+
+Estimating equality clauses
+---------------------------
+
+When computing selectivity estimate for equality clauses
+
+    (a = 1) AND (b = 2)
+
+we can do this estimate pretty exactly assuming that two conditions are met:
+
+    (1) there's an equality condition on all attributes of the statistic
+
+    (2) we find a matching item in the MCV list
+
+In this case we know the MCV item represents all tuples matching the clauses,
+and the selectivity estimate is complete (i.e. we don't need to perform
+estimation using the histogram). This is what we call 'full match'.
+
+When only (1) holds, but there's no matching MCV item, we don't know whether
+there are no such rows or whether they are just not very frequent. We can
+however use the frequency of the least frequent MCV item as an upper bound.
+
+For a combination of equality conditions (not full-match case) we can clamp the
+selectivity by the minimum of selectivities for each condition. For example if
+we know the number of distinct values for each column, we can use 1/ndistinct
+as a per-column estimate. Or rather 1/ndistinct + selectivity derived from the
+MCV list.
+
+We should also probably only use the 'residual ndistinct' by excluding the items
+included in the MCV list (and also residual frequency):
+
+     f = (1.0 - sum(MCV frequencies)) / (ndistinct - ndistinct(MCV list))
+
+but it's worth pointing out the ndistinct values are multi-variate for the
+columns referenced by the equality conditions.
+
+Note: Only the "full match" limit is currently implemented.
+
+
+Hashed MCV (not yet implemented)
+--------------------------------
+
+Regular MCV lists have to include actual values for each item, so if those items
+are large the list may be quite large. This is especially true for multi-variate
+MCV lists, although the current implementation partially mitigates this by
+performing de-duplication of the values before storing them on disk.
+
+It's possible to only store hashes (32-bit values) instead of the actual values,
+significantly reducing the space requirements. Obviously, this would only make
+the MCV lists useful for estimating equality conditions (assuming the 32-bit
+hashes make the collisions rare enough).
+
+This might also complicate matching the columns to available stats.
+
+
+TODO Consider implementing hashed MCV list, storing just 32-bit hashes instead
+     of the actual values. This type of MCV list will be useful only for
+     estimating equality clauses, and will reduce space requirements for large
+     varlena types (in such cases we usually only want equality anyway).
+
+TODO Currently there's no logic to consider building only a MCV list (and not
+     building the histogram at all), except for doing this decision manually in
+     ADD STATISTICS.
+
+
+Inspecting the MCV list
+-----------------------
+
+Inspecting the regular (per-attribute) MCV lists is trivial, as it's enough
+to select the columns from pg_stats - the data is encoded as anyarrays, so we
+simply get the text representation of the arrays.
+
+With multivariate MCV lists it's not that simple due to the possible mix of
+data types. It might be possible to produce similar array-like representation,
+but that'd unnecessarily complicate further processing and analysis of the MCV
+list. Instead, there's a SRF function providing values, frequencies etc.
+
+    SELECT * FROM pg_mcv_list_items();
+
+It has a single input parameter:
+
+    oid   - OID of the statistics object (pg_statistic_ext.oid)
+
+and produces a table with these columns:
+
+    - item ID (0...nitems-1)
+    - values (string array)
+    - nulls only (boolean array)
+    - frequency (double precision)
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c
index 2e7c0ad..27e096f 100644
--- a/src/backend/statistics/dependencies.c
+++ b/src/backend/statistics/dependencies.c
@@ -201,14 +201,11 @@ static double
 dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
 				  VacAttrStats **stats, Bitmapset *attrs)
 {
-	int			i,
-				j;
-	int			nvalues = numrows * k;
+	int			i;
 	MultiSortSupport mss;
 	SortItem   *items;
-	Datum	   *values;
-	bool	   *isnull;
 	int		   *attnums;
+	int		   *attnums_dep;
 
 	/* counters valid within a group */
 	int			group_size = 0;
@@ -223,26 +220,16 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
 	/* sort info for all attributes columns */
 	mss = multi_sort_init(k);
 
-	/* data for the sort */
-	items = (SortItem *) palloc(numrows * sizeof(SortItem));
-	values = (Datum *) palloc(sizeof(Datum) * nvalues);
-	isnull = (bool *) palloc(sizeof(bool) * nvalues);
-
-	/* fix the pointers to values/isnull */
-	for (i = 0; i < numrows; i++)
-	{
-		items[i].values = &values[i * k];
-		items[i].isnull = &isnull[i * k];
-	}
-
 	/*
-	 * Transform the bms into an array, to make accessing i-th member easier.
+	 * Transform the bms into an array, to make accessing i-th member easier,
+	 * and then construct a filtered version with only attnums referenced
+	 * by the dependency we validate.
 	 */
-	attnums = (int *) palloc(sizeof(int) * bms_num_members(attrs));
-	i = 0;
-	j = -1;
-	while ((j = bms_next_member(attrs, j)) >= 0)
-		attnums[i++] = j;
+	attnums = build_attnums(attrs);
+
+	attnums_dep = (int *)palloc(k * sizeof(int));
+	for (i = 0; i < k; i++)
+		attnums_dep[i] = attnums[dependency[i]];
 
 	/*
 	 * Verify the dependency (a,b,...)->z, using a rather simple algorithm:
@@ -254,7 +241,7 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
 	 * (c) for each group count different values in the last column
 	 */
 
-	/* prepare the sort function for the first dimension, and SortItem array */
+	/* prepare the sort function for the dimensions */
 	for (i = 0; i < k; i++)
 	{
 		VacAttrStats *colstat = stats[dependency[i]];
@@ -267,19 +254,16 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
 
 		/* prepare the sort function for this dimension */
 		multi_sort_add_dimension(mss, i, type->lt_opr);
-
-		/* accumulate all the data for both columns into an array and sort it */
-		for (j = 0; j < numrows; j++)
-		{
-			items[j].values[i] =
-				heap_getattr(rows[j], attnums[dependency[i]],
-							 stats[i]->tupDesc, &items[j].isnull[i]);
-		}
 	}
 
-	/* sort the items so that we can detect the groups */
-	qsort_arg((void *) items, numrows, sizeof(SortItem),
-			  multi_sort_compare, mss);
+	/*
+	 * build an array of SortItem(s) sorted using the multi-sort support
+	 *
+	 * XXX This relies on all stats entries pointing to the same tuple
+	 * descriptor. Not sure if that might not be the case.
+	 */
+	items = build_sorted_items(numrows, rows, stats[0]->tupDesc,
+							   mss, k, attnums_dep);
 
 	/*
 	 * Walk through the sorted array, split it into rows according to the
@@ -322,9 +306,9 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
 	}
 
 	pfree(items);
-	pfree(values);
-	pfree(isnull);
 	pfree(mss);
+	pfree(attnums);
+	pfree(attnums_dep);
 
 	/* Compute the 'degree of validity' as (supporting/total). */
 	return (n_supporting_rows * 1.0 / numrows);
@@ -351,7 +335,6 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
 						   VacAttrStats **stats)
 {
 	int			i,
-				j,
 				k;
 	int			numattrs;
 	int		   *attnums;
@@ -364,11 +347,7 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
 	/*
 	 * Transform the bms into an array, to make accessing i-th member easier.
 	 */
-	attnums = palloc(sizeof(int) * bms_num_members(attrs));
-	i = 0;
-	j = -1;
-	while ((j = bms_next_member(attrs, j)) >= 0)
-		attnums[i++] = j;
+	attnums = build_attnums(attrs);
 
 	Assert(numattrs >= 2);
 
@@ -938,6 +917,9 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
 	 * the attnums for each clause in a list which we'll reference later so we
 	 * don't need to repeat the same work again. We'll also keep track of all
 	 * attnums seen.
+	 *
+	 * We also skip clauses that we already estimated using different types of
+	 * statistics (we treat them as incompatible).
 	 */
 	listidx = 0;
 	foreach(l, clauses)
@@ -945,7 +927,8 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
 		Node	   *clause = (Node *) lfirst(l);
 		AttrNumber	attnum;
 
-		if (dependency_is_compatible_clause(clause, rel->relid, &attnum))
+		if ((dependency_is_compatible_clause(clause, rel->relid, &attnum)) &&
+			(!bms_is_member(listidx, *estimatedclauses)))
 		{
 			list_attnums[listidx] = attnum;
 			clauses_attnums = bms_add_member(clauses_attnums, attnum);
@@ -1015,8 +998,7 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
 			/*
 			 * Skip incompatible clauses, and ones we've already estimated on.
 			 */
-			if (list_attnums[listidx] == InvalidAttrNumber ||
-				bms_is_member(listidx, *estimatedclauses))
+			if (list_attnums[listidx] == InvalidAttrNumber)
 				continue;
 
 			/*
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index db4987b..ee64214 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -53,7 +53,7 @@ static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
 					  int nvacatts, VacAttrStats **vacatts);
 static void statext_store(Relation pg_stext, Oid relid,
 			  MVNDistinct *ndistinct, MVDependencies *dependencies,
-			  VacAttrStats **stats);
+			  MCVList *mcvlist, VacAttrStats **stats);
 
 
 /*
@@ -86,6 +86,7 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
 		StatExtEntry *stat = (StatExtEntry *) lfirst(lc);
 		MVNDistinct *ndistinct = NULL;
 		MVDependencies *dependencies = NULL;
+		MCVList	   *mcv = NULL;
 		VacAttrStats **stats;
 		ListCell   *lc2;
 
@@ -122,10 +123,12 @@ BuildRelationExtStatistics(Relation onerel, double totalrows,
 			else if (t == STATS_EXT_DEPENDENCIES)
 				dependencies = statext_dependencies_build(numrows, rows,
 														  stat->columns, stats);
+			else if (t == STATS_EXT_MCV)
+				mcv = statext_mcv_build(numrows, rows, stat->columns, stats);
 		}
 
 		/* store the statistics in the catalog */
-		statext_store(pg_stext, stat->statOid, ndistinct, dependencies, stats);
+		statext_store(pg_stext, stat->statOid, ndistinct, dependencies, mcv, stats);
 	}
 
 	heap_close(pg_stext, RowExclusiveLock);
@@ -153,6 +156,10 @@ statext_is_kind_built(HeapTuple htup, char type)
 			attnum = Anum_pg_statistic_ext_stxdependencies;
 			break;
 
+		case STATS_EXT_MCV:
+			attnum = Anum_pg_statistic_ext_stxmcv;
+			break;
+
 		default:
 			elog(ERROR, "unexpected statistics type requested: %d", type);
 	}
@@ -217,7 +224,8 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid)
 		for (i = 0; i < ARR_DIMS(arr)[0]; i++)
 		{
 			Assert((enabled[i] == STATS_EXT_NDISTINCT) ||
-				   (enabled[i] == STATS_EXT_DEPENDENCIES));
+				   (enabled[i] == STATS_EXT_DEPENDENCIES) ||
+				   (enabled[i] == STATS_EXT_MCV));
 			entry->types = lappend_int(entry->types, (int) enabled[i]);
 		}
 
@@ -286,13 +294,59 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs,
 }
 
 /*
+ * Find attnums of MV stats using the mvoid.
+ */
+int2vector *
+find_ext_attnums(Oid mvoid, Oid *relid)
+{
+	ArrayType  *arr;
+	Datum       adatum;
+	bool        isnull;
+	HeapTuple   htup;
+	int2vector *keys;
+
+	/* Look up the statistics object in the syscache by its OID. */
+	htup = SearchSysCache1(STATEXTOID,
+						   ObjectIdGetDatum(mvoid));
+
+	/* XXX syscache contains OIDs of deleted stats (not invalidated) */
+	if (!HeapTupleIsValid(htup))
+		return NULL;
+
+	/* starelid */
+	adatum = SysCacheGetAttr(STATEXTOID, htup,
+							 Anum_pg_statistic_ext_stxrelid, &isnull);
+	Assert(!isnull);
+
+	*relid = DatumGetObjectId(adatum);
+
+	/* stakeys */
+	adatum = SysCacheGetAttr(STATEXTOID, htup,
+							 Anum_pg_statistic_ext_stxkeys, &isnull);
+	Assert(!isnull);
+
+	arr = DatumGetArrayTypeP(adatum);
+
+	keys = buildint2vector((int16 *) ARR_DATA_PTR(arr),
+						   ARR_DIMS(arr)[0]);
+	ReleaseSysCache(htup);
+
+	/*
+	 * TODO maybe save the list into relcache, as in RelationGetIndexList
+	 * (which was used as an inspiration of this one)?.
+	 */
+
+	return keys;
+}
+
+/*
  * statext_store
  *	Serializes the statistics and stores them into the pg_statistic_ext tuple.
  */
 static void
 statext_store(Relation pg_stext, Oid statOid,
 			  MVNDistinct *ndistinct, MVDependencies *dependencies,
-			  VacAttrStats **stats)
+			  MCVList *mcv, VacAttrStats **stats)
 {
 	HeapTuple	stup,
 				oldtup;
@@ -323,9 +377,18 @@ statext_store(Relation pg_stext, Oid statOid,
 		values[Anum_pg_statistic_ext_stxdependencies - 1] = PointerGetDatum(data);
 	}
 
+	if (mcv != NULL)
+	{
+		bytea	   *data = statext_mcv_serialize(mcv, stats);
+
+		nulls[Anum_pg_statistic_ext_stxmcv - 1] = (data == NULL);
+		values[Anum_pg_statistic_ext_stxmcv - 1] = PointerGetDatum(data);
+	}
+
 	/* always replace the value (either by bytea or NULL) */
 	replaces[Anum_pg_statistic_ext_stxndistinct - 1] = true;
 	replaces[Anum_pg_statistic_ext_stxdependencies - 1] = true;
+	replaces[Anum_pg_statistic_ext_stxmcv - 1] = true;
 
 	/* there should already be a pg_statistic_ext tuple */
 	oldtup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
@@ -432,6 +495,137 @@ multi_sort_compare_dims(int start, int end,
 	return 0;
 }
 
+int
+compare_scalars_simple(const void *a, const void *b, void *arg)
+{
+	return compare_datums_simple(*(Datum *) a,
+								 *(Datum *) b,
+								 (SortSupport) arg);
+}
+
+int
+compare_datums_simple(Datum a, Datum b, SortSupport ssup)
+{
+	return ApplySortComparator(a, false, b, false, ssup);
+}
+
+/* simple counterpart to qsort_arg */
+void *
+bsearch_arg(const void *key, const void *base, size_t nmemb, size_t size,
+			int (*compar) (const void *, const void *, void *),
+			void *arg)
+{
+	size_t		l,
+				u,
+				idx;
+	const void *p;
+	int			comparison;
+
+	l = 0;
+	u = nmemb;
+	while (l < u)
+	{
+		idx = (l + u) / 2;
+		p = (void *) (((const char *) base) + (idx * size));
+		comparison = (*compar) (key, p, arg);
+
+		if (comparison < 0)
+			u = idx;
+		else if (comparison > 0)
+			l = idx + 1;
+		else
+			return (void *) p;
+	}
+
+	return NULL;
+}
+
+int *
+build_attnums(Bitmapset *attrs)
+{
+	int		i,
+			j;
+	int		numattrs = bms_num_members(attrs);
+	int	   *attnums;
+
+	/* build attnums from the bitmapset */
+	attnums = (int*)palloc(sizeof(int) * numattrs);
+	i = 0;
+	j = -1;
+	while ((j = bms_next_member(attrs, j)) >= 0)
+		attnums[i++] = j;
+
+	return attnums;
+}
+
+/* build_sorted_items
+ * 	build sorted array of SortItem with values from rows
+ *
+ * XXX All the memory is allocated in a single chunk, so that the caller
+ * can simply pfree the return value to release all of it.
+ */
+SortItem *
+build_sorted_items(int numrows, HeapTuple *rows, TupleDesc tdesc,
+				   MultiSortSupport mss, int numattrs, int *attnums)
+{
+	int			i,
+				j,
+				len;
+	int			nvalues = numrows * numattrs;
+
+	/*
+	 * We won't allocate the arrays for each item independently, but in one
+	 * large chunk and then just set the pointers. This allows the caller
+	 * to simply pfree the return value to release all the memory.
+	 */
+	SortItem   *items;
+	Datum	   *values;
+	bool	   *isnull;
+	char	   *ptr;
+
+	/* Compute the total amount of memory we need (both items and values). */
+	len = numrows * sizeof(SortItem) + nvalues * (sizeof(Datum) + sizeof(bool));
+
+	/* Allocate the memory and split it into the pieces. */
+	ptr = palloc0(len);
+
+	/* items to sort */
+	items = (SortItem *) ptr;
+	ptr += numrows * sizeof(SortItem);
+
+	/* values and null flags */
+	values = (Datum *) ptr;
+	ptr += nvalues * sizeof(Datum);
+
+	isnull = (bool *) ptr;
+	ptr += nvalues * sizeof(bool);
+
+	/* make sure we consumed the whole buffer exactly */
+	Assert((ptr - (char *) items) == len);
+
+	/* fix the pointers to Datum and bool arrays */
+	for (i = 0; i < numrows; i++)
+	{
+		items[i].values = &values[i * numattrs];
+		items[i].isnull = &isnull[i * numattrs];
+
+		/* load the values/null flags from sample rows */
+		for (j = 0; j < numattrs; j++)
+		{
+			items[i].values[j] = heap_getattr(rows[i],
+											  attnums[j], /* attnum */
+											  tdesc,
+											  &items[i].isnull[j]);		/* isnull */
+		}
+	}
+
+	/* do the sort, using the multi-sort */
+	qsort_arg((void *) items, numrows, sizeof(SortItem),
+			  multi_sort_compare, mss);
+
+	return items;
+}
+
 /*
  * has_stats_of_kind
  *		Check whether the list contains statistic of a given kind
@@ -512,3 +706,16 @@ choose_best_statistics(List *stats, Bitmapset *attnums, char requiredkind)
 
 	return best_match;
 }
+
+int
+bms_member_index(Bitmapset *keys, AttrNumber varattno)
+{
+	int	i, j;
+
+	i = -1;
+	j = 0;
+	while (((i = bms_next_member(keys, i)) >= 0) && (i < varattno))
+		j += 1;
+
+	return j;
+}
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
new file mode 100644
index 0000000..391ddcb
--- /dev/null
+++ b/src/backend/statistics/mcv.c
@@ -0,0 +1,1809 @@
+/*-------------------------------------------------------------------------
+ *
+ * mcv.c
+ *	  POSTGRES multivariate MCV lists
+ *
+ *
+ * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/statistics/mcv.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "catalog/pg_collation.h"
+#include "catalog/pg_statistic_ext.h"
+#include "fmgr.h"
+#include "funcapi.h"
+#include "optimizer/clauses.h"
+#include "statistics/extended_stats_internal.h"
+#include "statistics/statistics.h"
+#include "utils/builtins.h"
+#include "utils/bytea.h"
+#include "utils/fmgroids.h"
+#include "utils/fmgrprotos.h"
+#include "utils/lsyscache.h"
+#include "utils/syscache.h"
+#include "utils/typcache.h"
+
+/*
+ * Computes size of a serialized MCV item, depending on the number of
+ * dimensions (columns) the statistic is defined on. The datum values are
+ * stored in a separate array (deduplicated, to minimize the size), and
+ * so the serialized items only store uint16 indexes into that array.
+ *
+ * Each serialized item stores (in this order):
+ *
+ * - indexes to values	  (ndim * sizeof(uint16))
+ * - null flags			  (ndim * sizeof(bool))
+ * - frequency			  (sizeof(double))
+ *
+ * So in total each MCV item requires this many bytes:
+ *
+ *	 ndim * (sizeof(uint16) + sizeof(bool)) + sizeof(double)
+ */
+#define ITEM_SIZE(ndims)	\
+	(ndims * (sizeof(uint16) + sizeof(bool)) + sizeof(double))
+
+/*
+ * Macros for convenient access to parts of a serialized MCV item.
+ */
+#define ITEM_INDEXES(item)			((uint16*)item)
+#define ITEM_NULLS(item,ndims)		((bool*)(ITEM_INDEXES(item) + ndims))
+#define ITEM_FREQUENCY(item,ndims)	((double*)(ITEM_NULLS(item,ndims) + ndims))
+
+
+static MultiSortSupport build_mss(VacAttrStats **stats, Bitmapset *attrs);
+
+static SortItem *build_distinct_groups(int numrows, SortItem *items,
+					  MultiSortSupport mss, int *ndistinct);
+
+static int count_distinct_groups(int numrows, SortItem *items,
+					  MultiSortSupport mss);
+
+static bool mcv_is_compatible_clause(Node *clause, Index relid,
+					  Bitmapset **attnums);
+
+/*
+ * Builds MCV list from the set of sampled rows.
+ *
+ * The algorithm is quite simple:
+ *
+ *	   (1) sort the data (default collation, '<' for the data type)
+ *
+ *	   (2) count distinct groups, decide how many to keep
+ *
+ *	   (3) build the MCV list using the threshold determined in (2)
+ *
+ *	   (4) remove rows represented by the MCV from the sample
+ *
+ * FIXME: Single-dimensional MCV is sorted by frequency (descending). We
+ * should do that too, because when walking through the list we want to
+ * check the most frequent items first.
+ *
+ * TODO: We're using Datum (8B), even for data types (e.g. int4 or float4).
+ * Maybe we could save some space here, but the bytea compression should
+ * handle it just fine.
+ *
+ * TODO: This probably should not use the ndistinct directly (as computed from
+ * the table, but rather estimate the number of distinct values in the
+ * table), no?
+ */
+MCVList *
+statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
+				 VacAttrStats **stats)
+{
+	int			i;
+	int			numattrs = bms_num_members(attrs);
+	int			ndistinct = 0;
+	int			mcv_threshold = 0;
+	int			nitems = 0;
+
+	int		   *attnums = build_attnums(attrs);
+
+	MCVList	   *mcvlist = NULL;
+
+	/* comparator for all the columns */
+	MultiSortSupport mss = build_mss(stats, attrs);
+
+	/* sort the rows */
+	SortItem   *items = build_sorted_items(numrows, rows, stats[0]->tupDesc,
+										   mss, numattrs, attnums);
+
+	/* transform the sorted rows into groups (sorted by frequency) */
+	SortItem   *groups = build_distinct_groups(numrows, items, mss, &ndistinct);
+
+	/*
+	 * Determine the minimum size of a group to be eligible for MCV list, and
+	 * check how many groups actually pass that threshold. We use 1.25x the
+	 * average group size, just like for regular per-column statistics.
+	 *
+	 * XXX We also use a minimum number of 4 rows for mcv_threshold, not sure
+	 * if that's what per-column statistics do too?
+	 *
+	 * But if we can fit all the distinct values in the MCV list (i.e. if
+	 * there are fewer distinct groups than STATS_MCVLIST_MAX_ITEMS), we'll
+	 * require only 2 rows per group.
+	 *
+	 * XXX Maybe this part (requiring 2 rows per group) is not very reliable?
+	 * Perhaps we should instead estimate the number of groups the way we
+	 * estimate ndistinct (after all, that's what MCV items are), and base our
+	 * decision on that?
+	 */
+	mcv_threshold = 1.25 * numrows / ndistinct;
+	mcv_threshold = (mcv_threshold < 4) ? 4 : mcv_threshold;
+
+	if (ndistinct <= STATS_MCVLIST_MAX_ITEMS)
+		mcv_threshold = 2;
+
+	/* Walk through the groups and stop once we fall below the threshold. */
+	nitems = 0;
+	for (i = 0; i < ndistinct; i++)
+	{
+		if (groups[i].count < mcv_threshold)
+			break;
+
+		nitems++;
+	}
+
+	/*
+	 * At this point we know the number of items for the MCV list. There might
+	 * be none (for uniform distribution with many groups), and in that case
+	 * there will be no MCV list. Otherwise construct the MCV list.
+	 */
+	if (nitems > 0)
+	{
+		/*
+		 * Allocate the MCV list structure, set the global parameters.
+		 */
+		mcvlist = (MCVList *) palloc0(sizeof(MCVList));
+
+		mcvlist->magic = STATS_MCV_MAGIC;
+		mcvlist->type = STATS_MCV_TYPE_BASIC;
+		mcvlist->ndimensions = numattrs;
+		mcvlist->nitems = nitems;
+
+		/*
+		 * Preallocate Datum/isnull arrays (not as a single chunk, as we will
+		 * pass the result outside and thus it needs to be easy to pfree().
+		 *
+		 * XXX On second thought, we're the only ones dealing with MCV lists,
+		 * so we might allocate everything as a single chunk without any risk.
+		 * Not sure it's worth it, though.
+		 */
+		mcvlist->items = (MCVItem **) palloc0(sizeof(MCVItem *) * nitems);
+
+		for (i = 0; i < nitems; i++)
+		{
+			mcvlist->items[i] = (MCVItem *) palloc(sizeof(MCVItem));
+			mcvlist->items[i]->values = (Datum *) palloc(sizeof(Datum) * numattrs);
+			mcvlist->items[i]->isnull = (bool *) palloc(sizeof(bool) * numattrs);
+		}
+
+		/* Copy the first chunk of groups into the result. */
+		for (i = 0; i < nitems; i++)
+		{
+			/* just pointer to the proper place in the list */
+			MCVItem	   *item = mcvlist->items[i];
+
+			/* copy values from the _previous_ group (last item of) */
+			memcpy(item->values, groups[i].values, sizeof(Datum) * numattrs);
+			memcpy(item->isnull, groups[i].isnull, sizeof(bool) * numattrs);
+
+			/* make sure basic assumptions on group size are correct */
+			Assert(groups[i].count >= mcv_threshold);
+			Assert(groups[i].count <= numrows);
+
+			/* groups should be sorted by frequency in descending order */
+			Assert((i == 0) || (groups[i-1].count >= groups[i].count));
+
+			/* and finally the group frequency */
+			item->frequency = (double) groups[i].count / numrows;
+		}
+
+		/* make sure the loops are consistent */
+		Assert(nitems == mcvlist->nitems);
+	}
+
+	pfree(items);
+	pfree(groups);
+
+	return mcvlist;
+}
+
+/*
+ * build_mss
+ *	build MultiSortSupport for the attributes passed in attrs
+ */
+static MultiSortSupport
+build_mss(VacAttrStats **stats, Bitmapset *attrs)
+{
+	int			i, j;
+	int			numattrs = bms_num_members(attrs);
+
+	/* Sort by multiple columns (using array of SortSupport) */
+	MultiSortSupport mss = multi_sort_init(numattrs);
+
+	/* prepare the sort functions for all the attributes */
+	i = 0;
+	j = -1;
+	while ((j = bms_next_member(attrs, j)) >= 0)
+	{
+		VacAttrStats *colstat = stats[i];
+		TypeCacheEntry *type;
+
+		type = lookup_type_cache(colstat->attrtypid, TYPECACHE_LT_OPR);
+		if (type->lt_opr == InvalidOid) /* shouldn't happen */
+			elog(ERROR, "cache lookup failed for ordering operator for type %u",
+				 colstat->attrtypid);
+
+		multi_sort_add_dimension(mss, i, type->lt_opr);
+		i++;
+	}
+
+	return mss;
+}
+
+/*
+ * count_distinct_groups
+ *	count distinct combinations of SortItems in the array
+ *
+ * The array is assumed to be sorted according to the MultiSortSupport.
+ */
+static int
+count_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss)
+{
+	int			i;
+	int			ndistinct;
+
+	ndistinct = 1;
+	for (i = 1; i < numrows; i++)
+		if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
+			ndistinct += 1;
+
+	return ndistinct;
+}
+
+/*
+ * compare_sort_item_count
+ *	comparator for sorting items by count (frequencies) in descending order
+ */
+static int
+compare_sort_item_count(const void *a, const void *b)
+{
+	SortItem   *ia = (SortItem *) a;
+	SortItem   *ib = (SortItem *) b;
+
+	if (ia->count == ib->count)
+		return 0;
+	else if (ia->count > ib->count)
+		return -1;
+
+	return 1;
+}
+
+/*
+ * build_distinct_groups
+ *	build array of SortItems for distinct groups and counts matching items
+ *
+ * The input array is assumed to be sorted
+ */
+static SortItem *
+build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss,
+					  int *ndistinct)
+{
+	int			i,
+				j;
+	int			ngroups = count_distinct_groups(numrows, items, mss);
+
+	SortItem   *groups = (SortItem *) palloc0(ngroups * sizeof(SortItem));
+
+	j = 0;
+	groups[0] = items[0];
+	groups[0].count = 1;
+
+	for (i = 1; i < numrows; i++)
+	{
+		/* Assume sorted in ascending order. */
+		Assert(multi_sort_compare(&items[i], &items[i - 1], mss) >= 0);
+
+		/* New distinct group detected. */
+		if (multi_sort_compare(&items[i], &items[i - 1], mss) != 0)
+			groups[++j] = items[i];
+
+		groups[j].count++;
+	}
+
+	/* Sort the distinct groups by frequency (in descending order). */
+	pg_qsort((void *) groups, ngroups, sizeof(SortItem),
+			 compare_sort_item_count);
+
+	*ndistinct = ngroups;
+	return groups;
+}
+
+
+/*
+ * statext_mcv_load
+ *		Load the MCV list for the indicated pg_statistic_ext tuple
+ */
+MCVList *
+statext_mcv_load(Oid mvoid)
+{
+	bool		isnull = false;
+	Datum		mcvlist;
+	HeapTuple	htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(mvoid));
+
+	if (!HeapTupleIsValid(htup))
+		elog(ERROR, "cache lookup failed for statistics object %u", mvoid);
+
+	mcvlist = SysCacheGetAttr(STATEXTOID, htup,
+							  Anum_pg_statistic_ext_stxmcv, &isnull);
+
+	Assert(!isnull);
+
+	ReleaseSysCache(htup);
+
+	return statext_mcv_deserialize(DatumGetByteaP(mcvlist));
+}
+
+
+/*
+ * Serialize MCV list into a bytea value.
+ *
+ * The basic algorithm is simple:
+ *
+ * (1) perform deduplication (for each attribute separately)
+ *	   (a) collect all (non-NULL) attribute values from all MCV items
+ *	   (b) sort the data (using 'lt' from VacAttrStats)
+ *	   (c) remove duplicate values from the array
+ *
+ * (2) serialize the arrays into a bytea value
+ *
+ * (3) process all MCV list items
+ *	   (a) replace values with indexes into the arrays
+ *
+ * Each attribute has to be processed separately, as we may be mixing different
+ * datatypes, with different sort operators, etc.
+ *
+ * We use uint16 values for the indexes in step (3), as we currently don't allow
+ * more than 8k MCV items anyway, although that's a mostly arbitrary limit. We might
+ * increase this to 65k and still fit into uint16. Furthermore, this limit is on
+ * the number of distinct values per column, and we usually have few of those
+ * (and various combinations of them in the MCV list). So uint16 seems fine.
+ *
+ * We don't really expect the serialization to save as much space as for
+ * histograms, as we are not doing any bucket splits (which is the source
+ * of high redundancy in histograms).
+ *
+ * TODO: Consider packing boolean flags (NULL) for each item into a single char
+ * (or a longer type) instead of using an array of bool items.
+ */
+bytea *
+statext_mcv_serialize(MCVList *mcvlist, VacAttrStats **stats)
+{
+	int			i;
+	int			dim;
+	int			ndims = mcvlist->ndimensions;
+	int			itemsize = ITEM_SIZE(ndims);
+
+	SortSupport ssup;
+	DimensionInfo *info;
+
+	Size		total_length;
+
+	/* allocate the item just once */
+	char	   *item = palloc0(itemsize);
+
+	/* serialized items (indexes into arrays, etc.) */
+	bytea	   *output;
+	char	   *data = NULL;
+
+	/* values per dimension (and number of non-NULL values) */
+	Datum	  **values = (Datum **) palloc0(sizeof(Datum *) * ndims);
+	int		   *counts = (int *) palloc0(sizeof(int) * ndims);
+
+	/*
+	 * We'll include some rudimentary information about the attributes (type
+	 * length, etc.), so that we don't have to look them up while
+	 * deserializing the MCV list.
+	 *
+	 * XXX Maybe this is not a great idea? Or maybe we should actually copy
+	 * more fields, e.g. typeid, which would allow us to display the MCV list
+	 * using only the serialized representation (currently we have to fetch
+	 * this info from the relation).
+	 */
+	info = (DimensionInfo *) palloc0(sizeof(DimensionInfo) * ndims);
+
+	/* sort support data for all attributes included in the MCV list */
+	ssup = (SortSupport) palloc0(sizeof(SortSupportData) * ndims);
+
+	/* collect and deduplicate values for each dimension (attribute) */
+	for (dim = 0; dim < ndims; dim++)
+	{
+		int			ndistinct;
+		StdAnalyzeData *tmp = (StdAnalyzeData *) stats[dim]->extra_data;
+
+		/* copy important info about the data type (length, by-value) */
+		info[dim].typlen = stats[dim]->attrtype->typlen;
+		info[dim].typbyval = stats[dim]->attrtype->typbyval;
+
+		/* allocate space for values in the attribute and collect them */
+		values[dim] = (Datum *) palloc0(sizeof(Datum) * mcvlist->nitems);
+
+		for (i = 0; i < mcvlist->nitems; i++)
+		{
+			/* skip NULL values - we don't need to deduplicate those */
+			if (mcvlist->items[i]->isnull[dim])
+				continue;
+
+			values[dim][counts[dim]] = mcvlist->items[i]->values[dim];
+			counts[dim] += 1;
+		}
+
+		/* if there are just NULL values in this dimension, we're done */
+		if (counts[dim] == 0)
+			continue;
+
+		/* sort and deduplicate the data */
+		ssup[dim].ssup_cxt = CurrentMemoryContext;
+		ssup[dim].ssup_collation = DEFAULT_COLLATION_OID;
+		ssup[dim].ssup_nulls_first = false;
+
+		PrepareSortSupportFromOrderingOp(tmp->ltopr, &ssup[dim]);
+
+		qsort_arg(values[dim], counts[dim], sizeof(Datum),
+				  compare_scalars_simple, &ssup[dim]);
+
+		/*
+		 * Walk through the array and eliminate duplicate values, but keep the
+		 * ordering (so that we can do bsearch later). We know there's at least
+		 * one item as (counts[dim] != 0), so we can skip the first element.
+		 */
+		ndistinct = 1;			/* number of distinct values */
+		for (i = 1; i < counts[dim]; i++)
+		{
+			/* expect sorted array */
+			Assert(compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim]) <= 0);
+
+			/* if the value is the same as the previous one, we can skip it */
+			if (!compare_datums_simple(values[dim][i - 1], values[dim][i], &ssup[dim]))
+				continue;
+
+			values[dim][ndistinct] = values[dim][i];
+			ndistinct += 1;
+		}
+
+		/* we must not exceed UINT16_MAX, as we use uint16 indexes */
+		Assert(ndistinct <= UINT16_MAX);
+
+		/*
+		 * Store additional info about the attribute - number of deduplicated
+		 * values, and also size of the serialized data. For fixed-length data
+		 * types this is trivial to compute, for varwidth types we need to
+		 * actually walk the array and sum the sizes.
+		 */
+		info[dim].nvalues = ndistinct;
+
+		if (info[dim].typlen > 0) /* fixed-length data types */
+			info[dim].nbytes = info[dim].nvalues * info[dim].typlen;
+		else if (info[dim].typlen == -1)	/* varlena */
+		{
+			info[dim].nbytes = 0;
+			for (i = 0; i < info[dim].nvalues; i++)
+				info[dim].nbytes += VARSIZE_ANY(values[dim][i]);
+		}
+		else if (info[dim].typlen == -2)	/* cstring */
+		{
+			info[dim].nbytes = 0;
+			for (i = 0; i < info[dim].nvalues; i++)
+				info[dim].nbytes += strlen(DatumGetPointer(values[dim][i]));
+		}
+
+		/* we know (count>0) so there must be some data */
+		Assert(info[dim].nbytes > 0);
+	}
+
+	/*
+	 * Now we can finally compute how much space we'll actually need for the
+	 * whole serialized MCV list, as it contains these fields:
+	 *
+	 * - length (4B) for varlena
+	 * - magic (4B)
+	 * - type (4B)
+	 * - ndimensions (4B)
+	 * - nitems (4B)
+	 * - info (ndim * sizeof(DimensionInfo)
+	 * - arrays of values for each dimension
+	 * - serialized items (nitems * itemsize)
+	 *
+	 * So the 'header' size is 20B + ndim * sizeof(DimensionInfo) and then we
+	 * will place all the data (values + indexes). We'll however use offsetof
+	 * and sizeof to compute sizes of the structs.
+	 */
+	total_length = (sizeof(int32) + offsetof(MCVList, items)
+					+ (ndims * sizeof(DimensionInfo))
+					+ mcvlist->nitems * itemsize);
+
+	/* add space for the arrays of deduplicated values */
+	for (i = 0; i < ndims; i++)
+		total_length += info[i].nbytes;
+
+	/*
+	 * Enforce arbitrary limit of 1MB on the size of the serialized MCV list.
+	 * This is meant as a protection against someone building MCV list on long
+	 * values (e.g. text documents).
+	 *
+	 * XXX Should we enforce arbitrary limits like this one? Maybe it's not
+	 * even necessary, as long values are usually unique and so won't make it
+	 * into the MCV list in the first place. In the end, we have a 1GB limit
+	 * on bytea values.
+	 */
+	if (total_length > (1024 * 1024))
+		elog(ERROR, "serialized MCV list exceeds 1MB (%ld)", total_length);
+
+	/* allocate space for the serialized MCV list, set header fields */
+	output = (bytea *) palloc0(total_length);
+	SET_VARSIZE(output, total_length);
+
+	/* 'data' points to the current position in the output buffer */
+	data = VARDATA(output);
+
+	/* MCV list header (number of items, ...) */
+	memcpy(data, mcvlist, offsetof(MCVList, items));
+	data += offsetof(MCVList, items);
+
+	/* information about the attributes */
+	memcpy(data, info, sizeof(DimensionInfo) * ndims);
+	data += sizeof(DimensionInfo) * ndims;
+
+	/* Copy the deduplicated values for all attributes to the output. */
+	for (dim = 0; dim < ndims; dim++)
+	{
+#ifdef USE_ASSERT_CHECKING
+		/* remember the starting point for Asserts later */
+		char	   *tmp = data;
+#endif
+		for (i = 0; i < info[dim].nvalues; i++)
+		{
+			Datum		v = values[dim][i];
+
+			if (info[dim].typbyval)		/* passed by value */
+			{
+				memcpy(data, &v, info[dim].typlen);
+				data += info[dim].typlen;
+			}
+			else if (info[dim].typlen > 0)		/* passed by reference */
+			{
+				memcpy(data, DatumGetPointer(v), info[dim].typlen);
+				data += info[dim].typlen;
+			}
+			else if (info[dim].typlen == -1)		/* varlena */
+			{
+				memcpy(data, DatumGetPointer(v), VARSIZE_ANY(v));
+				data += VARSIZE_ANY(v);
+			}
+			else if (info[dim].typlen == -2)		/* cstring */
+			{
+				memcpy(data, DatumGetPointer(v), strlen(DatumGetPointer(v)) + 1);
+				data += strlen(DatumGetPointer(v)) + 1; /* terminator */
+			}
+		}
+
+		/* check we got exactly the amount of data we expected for this dimension */
+		Assert((data - tmp) == info[dim].nbytes);
+	}
+
+	/* finally serialize the items, with uint16 indexes instead of the values */
+	for (i = 0; i < mcvlist->nitems; i++)
+	{
+		MCVItem	   *mcvitem = mcvlist->items[i];
+
+		/* don't write beyond the allocated space */
+		Assert(data <= (char *) output + total_length - itemsize);
+
+		/* reset the item (we only allocate it once and reuse it) */
+		memset(item, 0, itemsize);
+
+		for (dim = 0; dim < ndims; dim++)
+		{
+			Datum	   *v = NULL;
+
+			/* do the lookup only for non-NULL values */
+			if (mcvlist->items[i]->isnull[dim])
+				continue;
+
+			v = (Datum *) bsearch_arg(&mcvitem->values[dim], values[dim],
+									  info[dim].nvalues, sizeof(Datum),
+									  compare_scalars_simple, &ssup[dim]);
+
+			Assert(v != NULL);	/* serialization or deduplication error */
+
+			/* compute index within the array */
+			ITEM_INDEXES(item)[dim] = (v - values[dim]);
+
+			/* check the index is within expected bounds */
+			Assert(ITEM_INDEXES(item)[dim] >= 0);
+			Assert(ITEM_INDEXES(item)[dim] < info[dim].nvalues);
+		}
+
+		/* copy NULL and frequency flags into the item */
+		memcpy(ITEM_NULLS(item, ndims), mcvitem->isnull, sizeof(bool) * ndims);
+		memcpy(ITEM_FREQUENCY(item, ndims), &mcvitem->frequency, sizeof(double));
+
+		/* copy the serialized item into the array */
+		memcpy(data, item, itemsize);
+
+		data += itemsize;
+	}
+
+	/* at this point we expect to match the total_length exactly */
+	Assert((data - (char *) output) == total_length);
+
+	pfree(item);
+	pfree(values);
+	pfree(counts);
+
+	return output;
+}
+
+/*
+ * Reads serialized MCV list into MCVList structure.
+ *
+ * Unlike with histograms, we deserialize the MCV list fully (i.e. we don't
+ * keep the deduplicated arrays and pointers into them), as we don't expect
+ * there to be a lot of duplicate values. But perhaps that's not true and we
+ * should keep the MCV in serialized form too.
+ *
+ * XXX See how much memory we could save by keeping the deduplicated version
+ * (both for typical and corner cases with few distinct values but many items).
+ */
+MCVList *
+statext_mcv_deserialize(bytea *data)
+{
+	int			dim,
+				i;
+	Size		expected_size;
+	MCVList	   *mcvlist;
+	char	   *tmp;
+
+	int			ndims,
+				nitems,
+				itemsize;
+	DimensionInfo *info = NULL;
+	Datum	  **values = NULL;
+
+	/* local allocation buffer (used only for deserialization) */
+	int			bufflen;
+	char	   *buff;
+	char	   *ptr;
+
+	/* buffer used for the result */
+	int			rbufflen;
+	char	   *rbuff;
+	char	   *rptr;
+
+	if (data == NULL)
+		return NULL;
+
+	/*
+	 * We can't possibly deserialize a MCV list if there's not even a
+	 * complete header.
+	 */
+	if (VARSIZE_ANY_EXHDR(data) < offsetof(MCVList, items))
+		elog(ERROR, "invalid MCV size %ld (expected at least %ld)",
+			 VARSIZE_ANY_EXHDR(data), offsetof(MCVList, items));
+
+	/* read the MCV list header */
+	mcvlist = (MCVList *) palloc0(sizeof(MCVList));
+
+	/* initialize pointer to the data part (skip the varlena header) */
+	tmp = VARDATA_ANY(data);
+
+	/* get the header and perform further sanity checks */
+	memcpy(mcvlist, tmp, offsetof(MCVList, items));
+	tmp += offsetof(MCVList, items);
+
+	if (mcvlist->magic != STATS_MCV_MAGIC)
+		elog(ERROR, "invalid MCV magic %d (expected %d)",
+			 mcvlist->magic, STATS_MCV_MAGIC);
+
+	if (mcvlist->type != STATS_MCV_TYPE_BASIC)
+		elog(ERROR, "invalid MCV type %d (expected %d)",
+			 mcvlist->type, STATS_MCV_TYPE_BASIC);
+
+	if (mcvlist->ndimensions == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("invalid zero-length dimension array in MCVList")));
+	else if (mcvlist->ndimensions > STATS_MAX_DIMENSIONS)
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("invalid length (%d) dimension array in MCVList",
+						mcvlist->ndimensions)));
+
+	if (mcvlist->nitems == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("invalid zero-length item array in MCVList")));
+	else if (mcvlist->nitems > STATS_MCVLIST_MAX_ITEMS)
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("invalid length (%d) item array in MCVList",
+						mcvlist->nitems)));
+
+	nitems = mcvlist->nitems;
+	ndims = mcvlist->ndimensions;
+	itemsize = ITEM_SIZE(ndims);
+
+	/*
+	 * Check amount of data including DimensionInfo for all dimensions and
+	 * also the serialized items (including uint16 indexes). Also, walk
+	 * through the dimension information and add it to the sum.
+	 */
+	expected_size = offsetof(MCVList, items) +
+		ndims * sizeof(DimensionInfo) +
+		(nitems * itemsize);
+
+	/*
+	 * Check that we have at least the dimension and info records, along
+	 * with the items. We don't know the size of the serialized values yet.
+	 * We need to do this check first, before accessing the dimension info.
+	 */
+	if (VARSIZE_ANY_EXHDR(data) < expected_size)
+		elog(ERROR, "invalid MCV size %ld (expected %ld)",
+			 VARSIZE_ANY_EXHDR(data), expected_size);
+
+	/* Now it's safe to access the dimension info. */
+	info = (DimensionInfo *) (tmp);
+	tmp += ndims * sizeof(DimensionInfo);
+
+	/* account for the value arrays */
+	for (dim = 0; dim < ndims; dim++)
+	{
+		/*
+		 * XXX I wonder if we can/should rely on asserts here. Maybe those
+		 * checks should be done every time?
+		 */
+		Assert(info[dim].nvalues >= 0);
+		Assert(info[dim].nbytes >= 0);
+
+		expected_size += info[dim].nbytes;
+	}
+
+	/*
+	 * Now we know the total expected MCV size, including all the pieces
+	 * (header, dimension info, items and deduplicated data). So do the
+	 * final check on size.
+	 */
+	if (VARSIZE_ANY_EXHDR(data) != expected_size)
+		elog(ERROR, "invalid MCV size %ld (expected %ld)",
+			 VARSIZE_ANY_EXHDR(data), expected_size);
+
+	/*
+	 * Allocate one large chunk of memory for the intermediate data, needed
+	 * only for deserializing the MCV list (and allocate densely to minimize
+	 * the palloc overhead).
+	 *
+	 * Let's see how much space we'll actually need, and also include space
+	 * for the array with pointers.
+	 *
+	 * We need an array of Datum pointers values for each dimension, so that
+	 * we can easily translate the uint16 indexes. We also need a top-level
+	 * array of pointers to those per-dimension arrays.
+	 *
+	 * For byval types with size matching sizeof(Datum) we can reuse the
+	 * serialized array directly.
+	 */
+	bufflen = sizeof(Datum **) * ndims;	/* space for top-level pointers */
+
+	for (dim = 0; dim < ndims; dim++)
+	{
+		/* for full-size byval types, we reuse the serialized value */
+		if (!(info[dim].typbyval && info[dim].typlen == sizeof(Datum)))
+			bufflen += (sizeof(Datum) * info[dim].nvalues);
+	}
+
+	buff = palloc0(bufflen);
+	ptr = buff;
+
+	values = (Datum **) buff;
+	ptr += (sizeof(Datum *) * ndims);
+
+	/*
+	 * XXX This uses pointers to the original data array (the types not passed
+	 * by value), so when someone frees the memory, e.g. by doing something
+	 * like this:
+	 *
+	 *	bytea * data = ... fetch the data from catalog ...
+	 *	MCVList mcvlist = deserialize_mcv_list(data);
+	 *	pfree(data);
+	 *
+	 * then 'mcvlist' references the freed memory. Should copy the pieces.
+	 */
+	for (dim = 0; dim < ndims; dim++)
+	{
+#ifdef USE_ASSERT_CHECKING
+		/* remember where data for this dimension starts */
+		char *start = tmp;
+#endif
+		if (info[dim].typbyval)
+		{
+			/* passed by value / size matches Datum - just reuse the array */
+			if (info[dim].typlen == sizeof(Datum))
+			{
+				values[dim] = (Datum *) tmp;
+				tmp += info[dim].nbytes;
+
+				/* no overflow of input array */
+				Assert(tmp <= start + info[dim].nbytes);
+			}
+			else
+			{
+				values[dim] = (Datum *) ptr;
+				ptr += (sizeof(Datum) * info[dim].nvalues);
+
+				for (i = 0; i < info[dim].nvalues; i++)
+				{
+					/* just point into the array */
+					memcpy(&values[dim][i], tmp, info[dim].typlen);
+					tmp += info[dim].typlen;
+
+					/* no overflow of input array */
+					Assert(tmp <= start + info[dim].nbytes);
+				}
+			}
+		}
+		else
+		{
+			/* all the other types need a chunk of the buffer */
+			values[dim] = (Datum *) ptr;
+			ptr += (sizeof(Datum) * info[dim].nvalues);
+
+			/* passed by reference, but fixed length (name, tid, ...) */
+			if (info[dim].typlen > 0)
+			{
+				for (i = 0; i < info[dim].nvalues; i++)
+				{
+					/* just point into the array */
+					values[dim][i] = PointerGetDatum(tmp);
+					tmp += info[dim].typlen;
+
+					/* no overflow of input array */
+					Assert(tmp <= start + info[dim].nbytes);
+				}
+			}
+			else if (info[dim].typlen == -1)
+			{
+				/* varlena */
+				for (i = 0; i < info[dim].nvalues; i++)
+				{
+					/* just point into the array */
+					values[dim][i] = PointerGetDatum(tmp);
+					tmp += VARSIZE_ANY(tmp);
+
+					/* no overflow of input array */
+					Assert(tmp <= start + info[dim].nbytes);
+				}
+			}
+			else if (info[dim].typlen == -2)
+			{
+				/* cstring */
+				for (i = 0; i < info[dim].nvalues; i++)
+				{
+					/* just point into the array */
+					values[dim][i] = PointerGetDatum(tmp);
+					tmp += (strlen(tmp) + 1);	/* don't forget the \0 */
+
+					/* no overflow of input array */
+					Assert(tmp <= start + info[dim].nbytes);
+				}
+			}
+		}
+
+		/* check we consumed the serialized data for this dimension exactly */
+		Assert((tmp - start) == info[dim].nbytes);
+	}
+
+	/* we should have exhausted the buffer exactly */
+	Assert((ptr - buff) == bufflen);
+
+	/* allocate space for all the MCV items in a single piece */
+	rbufflen = (sizeof(MCVItem*) + sizeof(MCVItem) +
+				sizeof(Datum) * ndims + sizeof(bool) * ndims) * nitems;
+
+	rbuff = palloc0(rbufflen);
+	rptr = rbuff;
+
+	mcvlist->items = (MCVItem **) rbuff;
+	rptr += (sizeof(MCVItem *) * nitems);
+
+	/* deserialize the MCV items and translate the indexes to Datums */
+	for (i = 0; i < nitems; i++)
+	{
+		uint16	   *indexes = NULL;
+		MCVItem	   *item = (MCVItem *) rptr;
+
+		rptr += (sizeof(MCVItem));
+
+		item->values = (Datum *) rptr;
+		rptr += (sizeof(Datum) * ndims);
+
+		item->isnull = (bool *) rptr;
+		rptr += (sizeof(bool) * ndims);
+
+		/* just point to the right place */
+		indexes = ITEM_INDEXES(tmp);
+
+		memcpy(item->isnull, ITEM_NULLS(tmp, ndims), sizeof(bool) * ndims);
+		memcpy(&item->frequency, ITEM_FREQUENCY(tmp, ndims), sizeof(double));
+
+#ifdef ASSERT_CHECKING
+		/*
+		 * XXX This seems rather useless, considering the 'indexes' array is
+		 * defined as (uint16*).
+		 */
+		for (dim = 0; dim < ndims; dim++)
+			Assert(indexes[dim] <= UINT16_MAX);
+#endif
+
+		/* translate the values */
+		for (dim = 0; dim < ndims; dim++)
+			if (!item->isnull[dim])
+				item->values[dim] = values[dim][indexes[dim]];
+
+		mcvlist->items[i] = item;
+
+		tmp += ITEM_SIZE(ndims);
+
+		/* check we're not overflowing the input */
+		Assert(tmp <= (char *) data + VARSIZE_ANY(data));
+	}
+
+	/* check that we processed all the data */
+	Assert(tmp == (char *) data + VARSIZE_ANY(data));
+
+	/* release the temporary buffer */
+	pfree(buff);
+
+	return mcvlist;
+}
+
+/*
+ * pg_stats_ext_mcvlist_items
+ *		SRF returning details about items of an MCV list:
+ *
+ * - item ID (0...nitems)
+ * - values (string array)
+ * - nulls only (boolean array)
+ * - frequency (double precision)
+ *
+ * The input is the OID of the statistics object, and there are no rows
+ * returned if the statistics object contains no MCV list.
+ */
+PG_FUNCTION_INFO_V1(pg_stats_ext_mcvlist_items);
+
+Datum
+pg_stats_ext_mcvlist_items(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	int			call_cntr;
+	int			max_calls;
+	TupleDesc	tupdesc;
+	AttInMetadata *attinmeta;
+
+	/* stuff done only on the first call of the function */
+	if (SRF_IS_FIRSTCALL())
+	{
+		MemoryContext oldcontext;
+		MCVList	   *mcvlist;
+
+		/* create a function context for cross-call persistence */
+		funcctx = SRF_FIRSTCALL_INIT();
+
+		/* switch to memory context appropriate for multiple function calls */
+		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+		mcvlist = statext_mcv_load(PG_GETARG_OID(0));
+
+		funcctx->user_fctx = mcvlist;
+
+		/* total number of tuples to be returned */
+		funcctx->max_calls = 0;
+		if (funcctx->user_fctx != NULL)
+			funcctx->max_calls = mcvlist->nitems;
+
+		/* Build a tuple descriptor for our result type */
+		if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("function returning record called in context "
+							"that cannot accept type record")));
+
+		/* build metadata needed later to produce tuples from raw C-strings */
+		attinmeta = TupleDescGetAttInMetadata(tupdesc);
+		funcctx->attinmeta = attinmeta;
+
+		MemoryContextSwitchTo(oldcontext);
+	}
+
+	/* stuff done on every call of the function */
+	funcctx = SRF_PERCALL_SETUP();
+
+	call_cntr = funcctx->call_cntr;
+	max_calls = funcctx->max_calls;
+	attinmeta = funcctx->attinmeta;
+
+	if (call_cntr < max_calls)	/* do when there is more left to send */
+	{
+		char	  **values;
+		HeapTuple	tuple;
+		Datum		result;
+		int2vector *stakeys;
+		Oid			relid;
+
+		char	   *buff = palloc0(1024);
+		char	   *format;
+
+		int			i;
+
+		Oid		   *outfuncs;
+		FmgrInfo   *fmgrinfo;
+
+		MCVList	   *mcvlist;
+		MCVItem	   *item;
+
+		mcvlist = (MCVList *) funcctx->user_fctx;
+
+		Assert(call_cntr < mcvlist->nitems);
+
+		item = mcvlist->items[call_cntr];
+
+		/* XXX(review): done on every call - could be cached in user_fctx */
+		stakeys = find_ext_attnums(PG_GETARG_OID(0), &relid);
+
+		/*
+		 * Prepare a values array for building the returned tuple. This should
+		 * be an array of C strings which will be processed later by the type
+		 * input functions.
+		 */
+		values = (char **) palloc(4 * sizeof(char *));
+
+		values[0] = (char *) palloc(64 * sizeof(char));
+
+		/* arrays (NOTE(review): fixed 1024B buffers may truncate long values) */
+		values[1] = (char *) palloc0(1024 * sizeof(char));
+		values[2] = (char *) palloc0(1024 * sizeof(char));
+
+		/* frequency */
+		values[3] = (char *) palloc(64 * sizeof(char));
+
+		outfuncs = (Oid *) palloc0(sizeof(Oid) * mcvlist->ndimensions);
+		fmgrinfo = (FmgrInfo *) palloc0(sizeof(FmgrInfo) * mcvlist->ndimensions);
+
+		/* lookup output functions for all the dimensions (attributes) */
+		for (i = 0; i < mcvlist->ndimensions; i++)
+		{
+			bool		isvarlena;
+
+			getTypeOutputInfo(get_atttype(relid, stakeys->values[i]),
+							  &outfuncs[i], &isvarlena);
+
+			fmgr_info(outfuncs[i], &fmgrinfo[i]);
+		}
+
+		snprintf(values[0], 64, "%d", call_cntr);		/* item ID */
+
+		/* build the text representations of the values / nulls arrays */
+		for (i = 0; i < mcvlist->ndimensions; i++)
+		{
+			Datum		val,
+						valout;
+
+			format = "%s, %s";
+			if (i == 0)
+				format = "{%s%s";
+			else if (i == mcvlist->ndimensions - 1)
+				format = "%s, %s}";
+
+			if (item->isnull[i])
+				valout = CStringGetDatum("NULL");
+			else
+			{
+				val = item->values[i];
+				valout = FunctionCall1(&fmgrinfo[i], val);
+			}
+
+			snprintf(buff, 1024, format, values[1], DatumGetPointer(valout));
+			strncpy(values[1], buff, 1023);
+			buff[0] = '\0';
+
+			snprintf(buff, 1024, format, values[2], item->isnull[i] ? "t" : "f");
+			strncpy(values[2], buff, 1023);
+			buff[0] = '\0';
+		}
+
+		snprintf(values[3], 64, "%f", item->frequency); /* frequency */
+
+		/* build a tuple */
+		tuple = BuildTupleFromCStrings(attinmeta, values);
+
+		/* make the tuple into a datum */
+		result = HeapTupleGetDatum(tuple);
+
+		/* clean up (this is not really necessary) */
+		pfree(values[0]);
+		pfree(values[1]);
+		pfree(values[2]);
+		pfree(values[3]);
+
+		pfree(values);
+
+		SRF_RETURN_NEXT(funcctx, result);
+	}
+	else	/* do when there is no more left */
+	{
+		SRF_RETURN_DONE(funcctx);
+	}
+}
+
+/*
+ * pg_mcv_list_in		- input routine for type pg_mcv_list.
+ *
+ * pg_mcv_list is real enough to be a table column, but it has no operations
+ * of its own, and disallows input too.
+ */
+Datum
+pg_mcv_list_in(PG_FUNCTION_ARGS)
+{
+	/*
+	 * pg_mcv_list stores the data in binary form and parsing text input is
+	 * not needed, so disallow this.
+	 */
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("cannot accept a value of type %s", "pg_mcv_list")));
+
+	PG_RETURN_VOID();			/* keep compiler quiet */
+}
+
+
+/*
+ * pg_mcv_list_out		- output routine for type pg_mcv_list.
+ *
+ * MCV lists are serialized into a bytea value, so we simply call byteaout()
+ * to serialize the value into text. But it'd be nice to serialize that into
+ * a meaningful representation (e.g. for inspection by people).
+ *
+ * XXX This should probably return something meaningful, similar to what
+ * pg_dependencies_out does. Not sure how to deal with the deduplicated
+ * values, though - do we want to expand that or not?
+ */
+Datum
+pg_mcv_list_out(PG_FUNCTION_ARGS)
+{
+	return byteaout(fcinfo);
+}
+
+/*
+ * pg_mcv_list_recv		- binary input routine for type pg_mcv_list.
+ *
+ * Binary input is disallowed too, for the same reason as pg_mcv_list_in.
+ */
+Datum
+pg_mcv_list_recv(PG_FUNCTION_ARGS)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("cannot accept a value of type %s", "pg_mcv_list")));
+
+	PG_RETURN_VOID();			/* keep compiler quiet */
+}
+
+/*
+ * pg_mcv_list_send		- binary output routine for type pg_mcv_list.
+ *
+ * MCV lists are serialized in a bytea value (although the type is named
+ * differently), so we simply delegate to byteasend().
+ */
+Datum
+pg_mcv_list_send(PG_FUNCTION_ARGS)
+{
+	return byteasend(fcinfo);
+}
+
+/*
+ * mcv_is_compatible_clause_internal
+ *	Does the heavy lifting of actually inspecting the clauses for
+ * mcv_is_compatible_clause. It needs to be split like this because
+ * of recursion (for AND/OR/NOT clauses and operator arguments).
+ */
+static bool
+mcv_is_compatible_clause_internal(Node *clause, Index relid, Bitmapset **attnums)
+{
+	/* We only support plain Vars for now */
+	if (IsA(clause, Var))
+	{
+		Var *var = (Var *) clause;
+
+		/* Ensure var is from the correct relation */
+		if (var->varno != relid)
+			return false;
+
+		/* we also better ensure the Var is from the current level */
+		if (var->varlevelsup > 0)
+			return false;
+
+		/* Also skip system attributes (we don't allow stats on those). */
+		if (!AttrNumberIsForUserDefinedAttr(var->varattno))
+			return false;
+
+		*attnums = bms_add_member(*attnums, var->varattno);
+
+		return true;
+	}
+
+	/* Var = Const */
+	if (is_opclause(clause))
+	{
+		OpExpr	   *expr = (OpExpr *) clause;
+		Var		   *var;
+		bool		varonleft = true;
+		bool		ok;
+
+		/* Only expressions with two arguments are considered compatible. */
+		if (list_length(expr->args) != 2)
+			return false;
+
+		/* see if it actually has the right shape (one Var, one Const) */
+		ok = (NumRelids((Node *) expr) == 1) &&
+			(is_pseudo_constant_clause(lsecond(expr->args)) ||
+			 (varonleft = false,
+			  is_pseudo_constant_clause(linitial(expr->args))));
+
+		/* unsupported structure (two variables or so) */
+		if (!ok)
+			return false;
+
+		/*
+		 * If it's not one of the supported operators ("=", "<", ">", etc.),
+		 * just ignore the clause, as it's not compatible with MCV lists.
+		 *
+		 * This uses the function for estimating selectivity, not the operator
+		 * directly (a bit awkward, but well ...).
+		 */
+		if ((get_oprrest(expr->opno) != F_EQSEL) &&
+			(get_oprrest(expr->opno) != F_SCALARLTSEL) &&
+			(get_oprrest(expr->opno) != F_SCALARGTSEL))
+			return false;
+
+		var = (varonleft) ? linitial(expr->args) : lsecond(expr->args);
+
+		/* recurse into the Var argument (rejects anything but plain Vars) */
+		return mcv_is_compatible_clause_internal((Node *)var, relid, attnums);
+	}
+
+	/* NOT clause, clause AND/OR clause */
+	if (or_clause(clause) ||
+		and_clause(clause) ||
+		not_clause(clause))
+	{
+		/*
+		 * AND/OR/NOT-clauses are supported if all sub-clauses are supported
+		 *
+		 * TODO: We might support mixed case, where some of the clauses are
+		 * supported and some are not, and treat all supported subclauses as a
+		 * single clause, compute its selectivity using mv stats, and compute
+		 * the total selectivity using the current algorithm.
+		 *
+		 * TODO: For RestrictInfo above an OR-clause, we might use the
+		 * orclause with nested RestrictInfo - we won't have to call
+		 * pull_varnos() for each clause, saving time.
+		 */
+		BoolExpr   *expr = (BoolExpr *) clause;
+		ListCell   *lc;
+		Bitmapset  *clause_attnums = NULL;
+
+		foreach(lc, expr->args)
+		{
+			/*
+			 * Had we found incompatible clause in the arguments, treat the
+			 * whole clause as incompatible.
+			 */
+			if (!mcv_is_compatible_clause_internal((Node *) lfirst(lc),
+												   relid, &clause_attnums))
+				return false;
+		}
+
+		/*
+		 * Otherwise the clause is compatible, and we need to merge the
+		 * attnums into the main bitmapset.
+		 */
+		*attnums = bms_join(*attnums, clause_attnums);
+
+		return true;
+	}
+
+	/* Var IS NULL */
+	if (IsA(clause, NullTest))
+	{
+		NullTest   *nt = (NullTest *) clause;
+
+		/*
+		 * Only simple (Var IS NULL) expressions supported for now. Maybe we
+		 * could use examine_variable to fix this?
+		 */
+		if (!IsA(nt->arg, Var))
+			return false;
+
+		return mcv_is_compatible_clause_internal((Node *) (nt->arg), relid, attnums);
+	}
+
+	return false;
+}
+
+/*
+ * mcv_is_compatible_clause
+ *		Determines if the clause is compatible with MCV lists
+ *
+ * Currently, we support operator clauses of the form (Var op Const) or
+ * (Const op Var), where op is an operator whose selectivity function is
+ * one of "=", "<" or ">", IS [NOT] NULL tests on a plain Var, and
+ * AND/OR/NOT combinations of such clauses. When returning true, *attnums
+ * is updated with the attribute numbers of the Vars in the clause.
+ */
+static bool
+mcv_is_compatible_clause(Node *clause, Index relid, Bitmapset **attnums)
+{
+	RestrictInfo *rinfo = (RestrictInfo *) clause;
+
+	if (!IsA(rinfo, RestrictInfo))
+		return false;
+
+	/* Pseudoconstants are not really interesting here. */
+	if (rinfo->pseudoconstant)
+		return false;
+
+	/* clauses referencing multiple varnos are incompatible */
+	if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON)
+		return false;
+
+	return mcv_is_compatible_clause_internal((Node *)rinfo->clause,
+											 relid, attnums);
+}
+
+/* merge match results: OR takes the maximum, AND takes the minimum */
+#define UPDATE_RESULT(m,r,isor) \
+	(m) = (isor) ? (Max(m,r)) : (Min(m,r))
+
+/*
+ * mcv_update_match_bitmap
+ *	Evaluate clauses using the MCV list, and update the match bitmap.
+ *
+ * A match bitmap keeps match/mismatch status for each MCV item, and we
+ * update it based on additional clauses. We also use it to skip items
+ * that can't possibly match (e.g. item marked as "mismatch" can't change
+ * to "match" when evaluating AND clause list).
+ *
+ * The function returns the number of items currently marked as 'match', and
+ * it also returns two additional pieces of information - a flag indicating
+ * whether there was an equality condition for all attributes, and the
+ * minimum frequency in the MCV list.
+ *
+ * XXX Currently the match bitmap uses a char for each MCV item, which is
+ * somewhat wasteful as we could do with just a single bit, thus reducing
+ * the size to ~1/8. It would also allow us to combine bitmaps simply using
+ * & and |, which should be faster than min/max. The bitmaps are fairly
+ * small, though (as we cap the MCV list size to 8k items).
+ */
+static int
+mcv_update_match_bitmap(PlannerInfo *root, List *clauses,
+						Bitmapset *keys, MCVList *mcvlist,
+						int nmatches, char *matches,
+						Selectivity *lowsel, bool *fullmatch,
+						bool is_or)
+{
+	int			i;
+	ListCell   *l;
+
+	Bitmapset  *eqmatches = NULL;		/* attributes with equality matches */
+
+	/* The bitmap may be partially built. */
+	Assert(nmatches >= 0);
+	Assert(nmatches <= mcvlist->nitems);
+	Assert(clauses != NIL);
+	Assert(list_length(clauses) >= 1);
+	Assert(mcvlist != NULL);
+	Assert(mcvlist->nitems > 0);
+
+	/*
+	 * Handle cases where either all MCV items are marked as mismatch (AND),
+	 * or match (OR). In those cases additional clauses can't possibly change
+	 * match status of any items, so don't waste time by trying.
+	 */
+	if (((nmatches == 0) && (!is_or)) ||			/* AND-ed clauses */
+		((nmatches == mcvlist->nitems) && is_or))	/* OR-ed clauses */
+		return nmatches;
+
+	/*
+	 * Find the lowest frequency in the MCV list. The MCV list is sorted by
+	 * frequency in descending order, so simply get frequency of the last
+	 * MCV item.
+	 */
+	*lowsel = mcvlist->items[mcvlist->nitems-1]->frequency;
+
+	/*
+	 * Loop through the list of clauses, and for each of them evaluate all the
+	 * MCV items not yet eliminated by the preceding clauses.
+	 */
+	foreach(l, clauses)
+	{
+		Node	   *clause = (Node *) lfirst(l);
+
+		/* if it's a RestrictInfo, then extract the clause */
+		if (IsA(clause, RestrictInfo))
+			clause = (Node *) ((RestrictInfo *) clause)->clause;
+
+		/*
+		 * Check it still makes sense to continue evaluating the clauses on the
+		 * MCV list, just like we did at the very beginning.
+		 */
+		if (((nmatches == 0) && (!is_or)) ||
+			((nmatches == mcvlist->nitems) && is_or))
+			break;
+
+		/* Handle the various types of clauses - OpClause, NullTest and AND/OR/NOT */
+		if (is_opclause(clause))
+		{
+			OpExpr	   *expr = (OpExpr *) clause;
+			bool		varonleft = true;
+			bool		ok;
+			FmgrInfo	opproc;
+
+			/* get procedure computing operator selectivity */
+			RegProcedure oprrest = get_oprrest(expr->opno);
+
+			fmgr_info(get_opcode(expr->opno), &opproc);
+
+			/* (Var op Const) or (Const op Var)? */
+			ok = (NumRelids(clause) == 1) &&
+				(is_pseudo_constant_clause(lsecond(expr->args)) ||
+				 (varonleft = false,
+				  is_pseudo_constant_clause(linitial(expr->args))));
+
+			if (ok)
+			{
+
+				FmgrInfo	gtproc;
+				Var		   *var = (varonleft) ? linitial(expr->args) : lsecond(expr->args);
+				Const	   *cst = (varonleft) ? lsecond(expr->args) : linitial(expr->args);
+				bool		isgt = (!varonleft);
+
+				TypeCacheEntry *typecache
+				= lookup_type_cache(var->vartype, TYPECACHE_GT_OPR);
+
+				/* FIXME proper matching attribute to dimension */
+				int			idx = bms_member_index(keys, var->varattno);
+
+				fmgr_info(get_opcode(typecache->gt_opr), &gtproc);
+
+				/*
+				 * Walk through the MCV items and evaluate the current clause.
+				 * We can skip items that were already ruled out, and
+				 * terminate if there are no remaining MCV items that might
+				 * possibly match.
+				 */
+				for (i = 0; i < mcvlist->nitems; i++)
+				{
+					bool		mismatch = false;
+					MCVItem	   *item = mcvlist->items[i];
+
+					/*
+					 * If there are no more matches (AND) or no remaining
+					 * unmatched items (OR), we can stop processing this
+					 * clause.
+					 */
+					if (((nmatches == 0) && (!is_or)) ||
+						((nmatches == mcvlist->nitems) && is_or))
+						break;
+
+					/*
+					 * For AND-lists, we can also mark NULL items as 'no
+					 * match' (and then skip them). For OR-lists this is not
+					 * possible.
+					 */
+					if ((!is_or) && item->isnull[idx])
+						matches[i] = STATS_MATCH_NONE;
+
+					/* skip MCV items that were already ruled out */
+					if ((!is_or) && (matches[i] == STATS_MATCH_NONE))
+						continue;
+					else if (is_or && (matches[i] == STATS_MATCH_FULL))
+						continue;
+
+					switch (oprrest)
+					{
+						case F_EQSEL:
+
+							/*
+							 * We don't care about isgt in equality, because
+							 * it does not matter whether it's (var = const)
+							 * or (const = var).
+							 */
+							mismatch = !DatumGetBool(FunctionCall2Coll(&opproc,
+													   DEFAULT_COLLATION_OID,
+															 cst->constvalue,
+														 item->values[idx]));
+
+							if (!mismatch)
+								eqmatches = bms_add_member(eqmatches, idx);
+
+							break;
+
+						case F_SCALARLTSEL:		/* column < constant */
+						case F_SCALARGTSEL:		/* column > constant */
+
+							/*
+							 * If the constant is on the left, swap the
+							 * arguments, so that we evaluate the operator
+							 * with the item value on the proper side.
+							 */
+							if (isgt)
+								mismatch = !DatumGetBool(FunctionCall2Coll(&opproc,
+														   DEFAULT_COLLATION_OID,
+															 cst->constvalue,
+															item->values[idx]));
+							else
+								mismatch = !DatumGetBool(FunctionCall2Coll(&opproc,
+														   DEFAULT_COLLATION_OID,
+															 item->values[idx],
+															  cst->constvalue));
+
+							break;
+					}
+
+					/*
+					 * XXX The conditions on matches[i] are not needed, as we
+					 * skip MCV items that can't become true/false, depending
+					 * on the current flag. See beginning of the loop over MCV
+					 * items.
+					 */
+
+					if ((is_or) && (matches[i] == STATS_MATCH_NONE) && (!mismatch))
+					{
+						/* OR - was MATCH_NONE, but will be MATCH_FULL */
+						matches[i] = STATS_MATCH_FULL;
+						++nmatches;
+						continue;
+					}
+					else if ((!is_or) && (matches[i] == STATS_MATCH_FULL) && mismatch)
+					{
+						/* AND - was MATCH_FULL, but will be MATCH_NONE */
+						matches[i] = STATS_MATCH_NONE;
+						--nmatches;
+						continue;
+					}
+
+				}
+			}
+		}
+		else if (IsA(clause, NullTest))
+		{
+			NullTest   *expr = (NullTest *) clause;
+			Var		   *var = (Var *) (expr->arg);
+
+			/* FIXME proper matching attribute to dimension */
+			int			idx = bms_member_index(keys, var->varattno);
+
+			/*
+			 * Walk through the MCV items and evaluate the current clause. We
+			 * can skip items that were already ruled out, and terminate if
+			 * there are no remaining MCV items that might possibly match.
+			 *
+			 * XXX(review): this branch only ever rules items out, which is
+			 * AND semantics; for OR-ed clause lists a matching NULL test
+			 * never marks an item as matching - confirm this is intended.
+			 */
+			for (i = 0; i < mcvlist->nitems; i++)
+			{
+				MCVItem	   *item = mcvlist->items[i];
+
+				/*
+				 * if there are no more matches, we can stop processing this
+				 * clause
+				 */
+				if (nmatches == 0)
+					break;
+
+				/* skip MCV items that were already ruled out */
+				if (matches[i] == STATS_MATCH_NONE)
+					continue;
+
+				/* if the clause mismatches the MCV item, set it as MATCH_NONE */
+				if (((expr->nulltesttype == IS_NULL) && (!item->isnull[idx])) ||
+				((expr->nulltesttype == IS_NOT_NULL) && (item->isnull[idx])))
+				{
+					matches[i] = STATS_MATCH_NONE;
+					--nmatches;
+				}
+			}
+		}
+		else if (or_clause(clause) || and_clause(clause))
+		{
+			/*
+			 * AND/OR clause, with all clauses compatible with the selected MV
+			 * stat
+			 */
+
+			int			i;
+			BoolExpr   *orclause = ((BoolExpr *) clause);
+			List	   *orclauses = orclause->args;
+
+			/* match/mismatch bitmap for each MCV item */
+			int			or_nmatches = 0;
+			char	   *or_matches = NULL;
+
+			Assert(orclauses != NIL);
+			Assert(list_length(orclauses) >= 2);
+
+			/* number of matching MCV items */
+			or_nmatches = mcvlist->nitems;
+
+			/* allocate the match bitmap for the sub-clauses */
+			or_matches = palloc0(sizeof(char) * or_nmatches);
+
+			if (or_clause(clause))
+			{
+				/* OR clauses assume nothing matches, initially */
+				memset(or_matches, STATS_MATCH_NONE, sizeof(char) * or_nmatches);
+				or_nmatches = 0;
+			}
+			else
+			{
+				/* AND clauses assume everything matches, initially */
+				memset(or_matches, STATS_MATCH_FULL, sizeof(char) * or_nmatches);
+			}
+
+			/* build the match bitmap for the OR-clauses */
+			or_nmatches = mcv_update_match_bitmap(root, orclauses, keys,
+										mcvlist, or_nmatches, or_matches,
+										lowsel, fullmatch, or_clause(clause));
+
+			/* merge the bitmap into the existing one */
+			for (i = 0; i < mcvlist->nitems; i++)
+			{
+				/*
+				 * Merge the result into the bitmap (Min for AND, Max for OR).
+				 */
+				UPDATE_RESULT(matches[i], or_matches[i], is_or);
+			}
+
+			pfree(or_matches);
+
+			/*
+			 * Recalculate the number of matching items - the merge may have
+			 * flipped entries in either direction without adjusting the
+			 * counter, and both the early-termination checks and the return
+			 * value rely on it being accurate.
+			 */
+			nmatches = 0;
+			for (i = 0; i < mcvlist->nitems; i++)
+				if (matches[i] == STATS_MATCH_FULL)
+					nmatches++;
+
+		}
+		else
+		{
+			elog(ERROR, "unknown clause type: %d", clause->type);
+		}
+	}
+
+	/*
+	 * If all the columns were matched by equality, it's a full match. In this
+	 * case there can be just a single MCV item, matching the clause (if there
+	 * were two, both would match the other one).
+	 */
+	*fullmatch = (bms_num_members(eqmatches) == mcvlist->ndimensions);
+
+	/* free the allocated pieces */
+	if (eqmatches)
+		pfree(eqmatches);
+
+	return nmatches;
+}
+
+
+/*
+ * mcv_clauselist_selectivity
+ *		Estimate selectivity of clauses using a multivariate MCV list.
+ *
+ * Picks the statistics object (with an MCV list) best matching the clause
+ * attributes, evaluates the compatible clauses against the MCV items, and
+ * returns the sum of frequencies of the matching items. Clauses estimated
+ * this way are added to *estimatedclauses, so the caller won't estimate
+ * them again. Returns 1.0 when no suitable statistics exist, leaving the
+ * estimation entirely to the caller.
+ */
+Selectivity
+mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
+						   JoinType jointype, SpecialJoinInfo *sjinfo,
+						   RelOptInfo *rel, Bitmapset **estimatedclauses)
+{
+	int			i;
+	ListCell   *l;
+	Bitmapset  *clauses_attnums = NULL;
+	Bitmapset **list_attnums;
+	int			listidx;
+	StatisticExtInfo *stat;
+	MCVList	   *mcv;
+	List	   *mcv_clauses;
+
+	/* match/mismatch bitmap for each MCV item */
+	char	   *matches = NULL;
+	bool		fullmatch;
+	Selectivity lowsel;
+	int			nmatches = 0;
+
+	/* must start at zero, we only add frequencies of matching items */
+	Selectivity	s = 0.0;
+
+	/* check if there's any stats that might be useful for us. */
+	if (!has_stats_of_kind(rel->statlist, STATS_EXT_MCV))
+		return 1.0;
+
+	list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) *
+										 list_length(clauses));
+
+	/*
+	 * Pre-process the clauses list to extract the attnums seen in each item.
+	 * We need to determine if there's any clauses which will be useful for
+	 * dependency selectivity estimations. Along the way we'll record all of
+	 * the attnums for each clause in a list which we'll reference later so we
+	 * don't need to repeat the same work again. We'll also keep track of all
+	 * attnums seen.
+	 *
+	 * FIXME Should skip already estimated clauses (using the estimatedclauses
+	 * bitmap).
+	 */
+	listidx = 0;
+	foreach(l, clauses)
+	{
+		Node	   *clause = (Node *) lfirst(l);
+		Bitmapset  *attnums = NULL;
+
+		if (mcv_is_compatible_clause(clause, rel->relid, &attnums))
+		{
+			list_attnums[listidx] = attnums;
+			clauses_attnums = bms_add_members(clauses_attnums, attnums);
+		}
+		else
+			list_attnums[listidx] = NULL;
+
+		listidx++;
+	}
+
+	/* We need at least two attributes for MCV lists. */
+	if (bms_num_members(clauses_attnums) < 2)
+		return 1.0;
+
+	/* find the best suited statistics object for these attnums */
+	stat = choose_best_statistics(rel->statlist, clauses_attnums,
+								  STATS_EXT_MCV);
+
+	/* if no matching stats could be found then we've nothing to do */
+	if (!stat)
+		return 1.0;
+
+	/* load the MCV list stored in the statistics object */
+	mcv = statext_mcv_load(stat->statOid);
+
+	/* now filter the clauses to be estimated using the selected MCV */
+	mcv_clauses = NIL;
+
+	listidx = 0;
+	foreach (l, clauses)
+	{
+		/*
+		 * If the clause is compatible with the selected MCV statistics,
+		 * mark it as estimated and add it to the MCV list.
+		 */
+		if ((list_attnums[listidx] != NULL) &&
+			(bms_is_subset(list_attnums[listidx], stat->keys)))
+		{
+			mcv_clauses = lappend(mcv_clauses, (Node *)lfirst(l));
+			*estimatedclauses = bms_add_member(*estimatedclauses, listidx);
+		}
+
+		listidx++;
+	}
+
+	/* if no clauses can be estimated using the MCV list, just bail out */
+	if (mcv_clauses == NIL)
+		return 1.0;
+
+	/* by default all the MCV items match the clauses fully */
+	matches = palloc0(sizeof(char) * mcv->nitems);
+	memset(matches, STATS_MATCH_FULL, sizeof(char) * mcv->nitems);
+
+	/* number of matching MCV items */
+	nmatches = mcv->nitems;
+
+	/*
+	 * Evaluate only the clauses compatible with the selected statistics
+	 * object - the incompatible ones were filtered out above, and passing
+	 * them to mcv_update_match_bitmap would fail on unsupported clause
+	 * types.
+	 */
+	nmatches = mcv_update_match_bitmap(root, mcv_clauses,
+									   stat->keys, mcv,
+									   nmatches, matches,
+									   &lowsel, &fullmatch, false);
+
+	/* sum frequencies for all the matching MCV items */
+	for (i = 0; i < mcv->nitems; i++)
+	{
+		if (matches[i] != STATS_MATCH_NONE)
+			s += mcv->items[i]->frequency;
+	}
+
+	pfree(matches);
+
+	return s;
+}
diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c
index 0faa020..80746da 100644
--- a/src/backend/utils/adt/ruleutils.c
+++ b/src/backend/utils/adt/ruleutils.c
@@ -1461,6 +1461,7 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
 	bool		isnull;
 	bool		ndistinct_enabled;
 	bool		dependencies_enabled;
+	bool		mcv_enabled;
 	int			i;
 
 	statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid));
@@ -1496,6 +1497,7 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
 
 	ndistinct_enabled = false;
 	dependencies_enabled = false;
+	mcv_enabled = false;
 
 	for (i = 0; i < ARR_DIMS(arr)[0]; i++)
 	{
@@ -1503,6 +1505,8 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
 			ndistinct_enabled = true;
 		if (enabled[i] == STATS_EXT_DEPENDENCIES)
 			dependencies_enabled = true;
+		if (enabled[i] == STATS_EXT_MCV)
+			mcv_enabled = true;
 	}
 
 	/*
@@ -1512,13 +1516,27 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok)
 	 * statistics types on a newer postgres version, if the statistics had all
 	 * options enabled on the original version.
 	 */
-	if (!ndistinct_enabled || !dependencies_enabled)
+	if (!ndistinct_enabled || !dependencies_enabled || !mcv_enabled)
 	{
+		bool	gotone = false;
+
 		appendStringInfoString(&buf, " (");
+
 		if (ndistinct_enabled)
+		{
 			appendStringInfoString(&buf, "ndistinct");
-		else if (dependencies_enabled)
-			appendStringInfoString(&buf, "dependencies");
+			gotone = true;
+		}
+
+		if (dependencies_enabled)
+		{
+			appendStringInfo(&buf, "%sdependencies", gotone ? ", " : "");
+			gotone = true;
+		}
+
+		if (mcv_enabled)
+			appendStringInfo(&buf, "%smcv", gotone ? ", " : "");
+
 		appendStringInfoChar(&buf, ')');
 	}
 
diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c
index 798e710..bedd3db 100644
--- a/src/bin/psql/describe.c
+++ b/src/bin/psql/describe.c
@@ -2382,7 +2382,8 @@ describeOneTableDetails(const char *schemaname,
 							  "   JOIN pg_catalog.pg_attribute a ON (stxrelid = a.attrelid AND\n"
 							  "        a.attnum = s.attnum AND NOT attisdropped)) AS columns,\n"
 							  "  (stxkind @> '{d}') AS ndist_enabled,\n"
-							  "  (stxkind @> '{f}') AS deps_enabled\n"
+							  "  (stxkind @> '{f}') AS deps_enabled,\n"
+							  "  (stxkind @> '{m}') AS mcv_enabled\n"
 							  "FROM pg_catalog.pg_statistic_ext stat "
 							  "WHERE stxrelid = '%s'\n"
 							  "ORDER BY 1;",
@@ -2419,6 +2420,12 @@ describeOneTableDetails(const char *schemaname,
 					if (strcmp(PQgetvalue(result, i, 6), "t") == 0)
 					{
 						appendPQExpBuffer(&buf, "%sdependencies", gotone ? ", " : "");
+						gotone = true;
+					}
+
+					if (strcmp(PQgetvalue(result, i, 7), "t") == 0)
+					{
+						appendPQExpBuffer(&buf, "%smcv", gotone ? ", " : "");
 					}
 
 					appendPQExpBuffer(&buf, ") ON %s FROM %s",
diff --git a/src/include/catalog/pg_cast.h b/src/include/catalog/pg_cast.h
index 1782753..4881134 100644
--- a/src/include/catalog/pg_cast.h
+++ b/src/include/catalog/pg_cast.h
@@ -262,6 +262,11 @@ DATA(insert (  3361  25    0 i i ));
 DATA(insert (  3402  17    0 i b ));
 DATA(insert (  3402  25    0 i i ));
 
+/* pg_mcv_list can be coerced to, but not from, bytea and text */
+DATA(insert (  441	 17    0 i b ));
+DATA(insert (  441	 25    0 i i ));
+
+
 /*
  * Datetime category
  */
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 8b33b4e..d78ad54 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -2786,6 +2786,18 @@ DESCR("I/O");
 DATA(insert OID = 3407 (  pg_dependencies_send	PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 17 "3402" _null_ _null_ _null_ _null_ _null_ pg_dependencies_send _null_ _null_ _null_ ));
 DESCR("I/O");
 
+DATA(insert OID = 442 (  pg_mcv_list_in	PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 441 "2275" _null_ _null_ _null_ _null_ _null_ pg_mcv_list_in _null_ _null_ _null_ ));
+DESCR("I/O");
+DATA(insert OID = 443 (  pg_mcv_list_out	PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 2275 "441" _null_ _null_ _null_ _null_ _null_ pg_mcv_list_out _null_ _null_ _null_ ));
+DESCR("I/O");
+DATA(insert OID = 444 (  pg_mcv_list_recv	PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 441 "2281" _null_ _null_ _null_ _null_ _null_ pg_mcv_list_recv _null_ _null_ _null_ ));
+DESCR("I/O");
+DATA(insert OID = 445 (  pg_mcv_list_send	PGNSP PGUID 12 1 0 0 0 f f f f t f s s 1 0 17 "441" _null_ _null_ _null_ _null_ _null_	pg_mcv_list_send _null_ _null_ _null_ ));
+DESCR("I/O");
+
+DATA(insert OID = 3410 (  pg_mcv_list_items PGNSP PGUID 12 1 1000 0 0 f f f f t t i s 1 0 2249 "26" "{26,23,1009,1000,701}" "{i,o,o,o,o}" "{oid,index,values,nulls,frequency}" _null_ _null_ pg_stats_ext_mcvlist_items _null_ _null_ _null_ ));
+DESCR("details about MCV list items");
+
 DATA(insert OID = 1928 (  pg_stat_get_numscans			PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_numscans _null_ _null_ _null_ ));
 DESCR("statistics: number of scans done for table/index");
 DATA(insert OID = 1929 (  pg_stat_get_tuples_returned	PGNSP PGUID 12 1 0 0 0 f f f f t f s r 1 0 20 "26" _null_ _null_ _null_ _null_ _null_ pg_stat_get_tuples_returned _null_ _null_ _null_ ));
diff --git a/src/include/catalog/pg_statistic_ext.h b/src/include/catalog/pg_statistic_ext.h
index 7813802..4752525 100644
--- a/src/include/catalog/pg_statistic_ext.h
+++ b/src/include/catalog/pg_statistic_ext.h
@@ -49,6 +49,7 @@ CATALOG(pg_statistic_ext,3381)
 												 * to build */
 	pg_ndistinct stxndistinct;	/* ndistinct coefficients (serialized) */
 	pg_dependencies stxdependencies;	/* dependencies (serialized) */
+	pg_mcv_list		stxmcv;		/* MCV (serialized) */
 #endif
 
 } FormData_pg_statistic_ext;
@@ -64,7 +65,7 @@ typedef FormData_pg_statistic_ext *Form_pg_statistic_ext;
  *		compiler constants for pg_statistic_ext
  * ----------------
  */
-#define Natts_pg_statistic_ext					8
+#define Natts_pg_statistic_ext					9
 #define Anum_pg_statistic_ext_stxrelid			1
 #define Anum_pg_statistic_ext_stxname			2
 #define Anum_pg_statistic_ext_stxnamespace		3
@@ -73,8 +74,10 @@ typedef FormData_pg_statistic_ext *Form_pg_statistic_ext;
 #define Anum_pg_statistic_ext_stxkind			6
 #define Anum_pg_statistic_ext_stxndistinct		7
 #define Anum_pg_statistic_ext_stxdependencies	8
+#define Anum_pg_statistic_ext_stxmcv			9
 
 #define STATS_EXT_NDISTINCT			'd'
 #define STATS_EXT_DEPENDENCIES		'f'
+#define STATS_EXT_MCV				'm'
 
 #endif							/* PG_STATISTIC_EXT_H */
diff --git a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h
index ffdb452..b5fcc3d 100644
--- a/src/include/catalog/pg_type.h
+++ b/src/include/catalog/pg_type.h
@@ -372,6 +372,10 @@ DATA(insert OID = 3402 ( pg_dependencies		PGNSP PGUID -1 f b S f t \054 0 0 0 pg
 DESCR("multivariate dependencies");
 #define PGDEPENDENCIESOID	3402
 
+DATA(insert OID = 441 ( pg_mcv_list		PGNSP PGUID -1 f b S f t \054 0 0 0 pg_mcv_list_in pg_mcv_list_out pg_mcv_list_recv pg_mcv_list_send - - - i x f 0 -1 0 100 _null_ _null_ _null_ ));
+DESCR("multivariate MCV list");
+#define PGMCVLISTOID	441
+
 DATA(insert OID = 32 ( pg_ddl_command	PGNSP PGUID SIZEOF_POINTER t p P f t \054 0 0 0 pg_ddl_command_in pg_ddl_command_out pg_ddl_command_recv pg_ddl_command_send - - - ALIGNOF_POINTER p f 0 -1 0 0 _null_ _null_ _null_ ));
 DESCR("internal type for passing CollectedCommand");
 #define PGDDLCOMMANDOID 32
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index 738ff3f..7a04863 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -31,6 +31,15 @@ typedef struct
 	int			tupno;			/* position index for tuple it came from */
 } ScalarItem;
 
+/* (de)serialization info */
+typedef struct DimensionInfo
+{
+	int			nvalues;		/* number of deduplicated values */
+	int			nbytes;			/* number of bytes (serialized) */
+	int			typlen;			/* pg_type.typlen */
+	bool		typbyval;		/* pg_type.typbyval */
+} DimensionInfo;
+
 /* multi-sort */
 typedef struct MultiSortSupportData
 {
@@ -44,6 +53,7 @@ typedef struct SortItem
 {
 	Datum	   *values;
 	bool	   *isnull;
+	int			count;
 } SortItem;
 
 extern MVNDistinct *statext_ndistinct_build(double totalrows,
@@ -57,13 +67,35 @@ extern MVDependencies *statext_dependencies_build(int numrows, HeapTuple *rows,
 extern bytea *statext_dependencies_serialize(MVDependencies *dependencies);
 extern MVDependencies *statext_dependencies_deserialize(bytea *data);
 
+extern MCVList *statext_mcv_build(int numrows, HeapTuple *rows,
+					Bitmapset *attrs, VacAttrStats **stats);
+extern bytea *statext_mcv_serialize(MCVList *mcv, VacAttrStats **stats);
+extern MCVList *statext_mcv_deserialize(bytea *data);
+
 extern MultiSortSupport multi_sort_init(int ndims);
 extern void multi_sort_add_dimension(MultiSortSupport mss, int sortdim,
 						 Oid oper);
-extern int	multi_sort_compare(const void *a, const void *b, void *arg);
+extern int multi_sort_compare(const void *a, const void *b, void *arg);
 extern int multi_sort_compare_dim(int dim, const SortItem *a,
 					   const SortItem *b, MultiSortSupport mss);
 extern int multi_sort_compare_dims(int start, int end, const SortItem *a,
 						const SortItem *b, MultiSortSupport mss);
+extern int compare_scalars_simple(const void *a, const void *b, void *arg);
+extern int compare_datums_simple(Datum a, Datum b, SortSupport ssup);
+
+extern void *bsearch_arg(const void *key, const void *base,
+			size_t nmemb, size_t size,
+			int (*compar) (const void *, const void *, void *),
+			void *arg);
+
+extern int *build_attnums(Bitmapset *attrs);
+
+extern SortItem * build_sorted_items(int numrows, HeapTuple *rows,
+				   TupleDesc tdesc, MultiSortSupport mss,
+				   int numattrs, int *attnums);
+
+extern int2vector *find_ext_attnums(Oid mvoid, Oid *relid);
+
+extern int bms_member_index(Bitmapset *keys, AttrNumber varattno);
 
 #endif							/* EXTENDED_STATS_INTERNAL_H */
diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h
index 1d68c39..7b94dde 100644
--- a/src/include/statistics/statistics.h
+++ b/src/include/statistics/statistics.h
@@ -15,6 +15,14 @@
 
 #include "commands/vacuum.h"
 #include "nodes/relation.h"
+ 
+/*
+ * Degree to which an MCV item matches a clause.
+ * This is then considered when computing the selectivity.
+ */
+#define STATS_MATCH_NONE		0		/* no match at all */
+#define STATS_MATCH_PARTIAL		1		/* partial match */
+#define STATS_MATCH_FULL		2		/* full match */
 
 #define STATS_MAX_DIMENSIONS	8	/* max number of attributes */
 
@@ -78,8 +86,40 @@ typedef struct MVDependencies
 /* size of the struct excluding the deps array */
 #define SizeOfDependencies	(offsetof(MVDependencies, ndeps) + sizeof(uint32))
 
+ 
+/* used to flag stats serialized to bytea */
+#define STATS_MCV_MAGIC                        0xE1A651C2              /* marks serialized bytea */
+#define STATS_MCV_TYPE_BASIC   1                               /* basic MCV list type */
+
+/* max items in MCV list (mostly arbitrary number) */
+#define STATS_MCVLIST_MAX_ITEMS        8192
+
+/*
+ * Multivariate MCV (most-common value) lists
+ *
+ * A straight-forward extension of MCV items - i.e. a list (array) of
+ * combinations of attribute values, together with a frequency and null flags.
+ */
+typedef struct MCVItem
+{
+	double		frequency;	/* frequency of this combination */
+	bool	   *isnull;		/* flags of NULL values (up to 32 columns) */
+	Datum	   *values;		/* variable-length (ndimensions) */
+} MCVItem;
+
+/* multivariate MCV list - essentially an array of MCV items */
+typedef struct MCVList
+{
+	uint32		magic;		/* magic constant marker */
+	uint32		type;		/* type of MCV list (BASIC) */
+	uint32		nitems;		/* number of MCV items in the array */
+	AttrNumber	ndimensions;	/* number of dimensions */
+	MCVItem	  **items;		/* array of MCV items */
+} MCVList;
+
 extern MVNDistinct *statext_ndistinct_load(Oid mvoid);
 extern MVDependencies *statext_dependencies_load(Oid mvoid);
+extern MCVList *statext_mcv_load(Oid mvoid);
 
 extern void BuildRelationExtStatistics(Relation onerel, double totalrows,
 						   int numrows, HeapTuple *rows,
@@ -92,6 +132,13 @@ extern Selectivity dependencies_clauselist_selectivity(PlannerInfo *root,
 									SpecialJoinInfo *sjinfo,
 									RelOptInfo *rel,
 									Bitmapset **estimatedclauses);
+extern Selectivity mcv_clauselist_selectivity(PlannerInfo *root,
+									List *clauses,
+									int varRelid,
+									JoinType jointype,
+									SpecialJoinInfo *sjinfo,
+									RelOptInfo *rel,
+									Bitmapset **estimatedclauses);
 extern bool has_stats_of_kind(List *stats, char requiredkind);
 extern StatisticExtInfo *choose_best_statistics(List *stats,
 					   Bitmapset *attnums, char requiredkind);
diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out
index fcf8bd7..bdc0889 100644
--- a/src/test/regress/expected/opr_sanity.out
+++ b/src/test/regress/expected/opr_sanity.out
@@ -859,11 +859,12 @@ WHERE c.castmethod = 'b' AND
  pg_node_tree      | text              |        0 | i
  pg_ndistinct      | bytea             |        0 | i
  pg_dependencies   | bytea             |        0 | i
+ pg_mcv_list       | bytea             |        0 | i
  cidr              | inet              |        0 | i
  xml               | text              |        0 | a
  xml               | character varying |        0 | a
  xml               | character         |        0 | a
-(9 rows)
+(10 rows)
 
 -- **************** pg_conversion ****************
 -- Look for illegal values in pg_conversion fields.
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 441cfaa..85009d2 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -58,7 +58,7 @@ ALTER TABLE ab1 DROP COLUMN a;
  b      | integer |           |          | 
  c      | integer |           |          | 
 Statistics objects:
-    "public"."ab1_b_c_stats" (ndistinct, dependencies) ON b, c FROM ab1
+    "public"."ab1_b_c_stats" (ndistinct, dependencies, mcv) ON b, c FROM ab1
 
 -- Ensure statistics are dropped when table is
 SELECT stxname FROM pg_statistic_ext WHERE stxname LIKE 'ab1%';
@@ -206,7 +206,7 @@ SELECT stxkind, stxndistinct
   FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass;
  stxkind |                      stxndistinct                       
 ---------+---------------------------------------------------------
- {d,f}   | {"3, 4": 301, "3, 6": 301, "4, 6": 301, "3, 4, 6": 301}
+ {d,f,m} | {"3, 4": 301, "3, 6": 301, "4, 6": 301, "3, 4, 6": 301}
 (1 row)
 
 -- Hash Aggregate, thanks to estimates improved by the statistic
@@ -272,7 +272,7 @@ SELECT stxkind, stxndistinct
   FROM pg_statistic_ext WHERE stxrelid = 'ndistinct'::regclass;
  stxkind |                        stxndistinct                         
 ---------+-------------------------------------------------------------
- {d,f}   | {"3, 4": 2550, "3, 6": 800, "4, 6": 1632, "3, 4, 6": 10000}
+ {d,f,m} | {"3, 4": 2550, "3, 6": 800, "4, 6": 1632, "3, 4, 6": 10000}
 (1 row)
 
 -- plans using Group Aggregate, thanks to using correct esimates
@@ -509,3 +509,216 @@ EXPLAIN (COSTS OFF)
 (5 rows)
 
 RESET random_page_cost;
+-- MCV lists
+CREATE TABLE mcv_lists (
+    filler1 TEXT,
+    filler2 NUMERIC,
+    a INT,
+    b TEXT,
+    filler3 DATE,
+    c INT,
+    d TEXT
+);
+SET random_page_cost = 1.2;
+CREATE INDEX mcv_lists_ab_idx ON mcv_lists (a, b);
+CREATE INDEX mcv_lists_abc_idx ON mcv_lists (a, b, c);
+-- random data (no MCV list)
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT mod(i,37), mod(i,41), mod(i,43), mod(i,47) FROM generate_series(1,5000) s(i);
+ANALYZE mcv_lists;
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1';
+                    QUERY PLAN                     
+---------------------------------------------------
+ Bitmap Heap Scan on mcv_lists
+   Recheck Cond: ((a = 1) AND (b = '1'::text))
+   ->  Bitmap Index Scan on mcv_lists_abc_idx
+         Index Cond: ((a = 1) AND (b = '1'::text))
+(4 rows)
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1' AND c = 1;
+                       QUERY PLAN                        
+---------------------------------------------------------
+ Index Scan using mcv_lists_abc_idx on mcv_lists
+   Index Cond: ((a = 1) AND (b = '1'::text) AND (c = 1))
+(2 rows)
+
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, c FROM mcv_lists;
+ANALYZE mcv_lists;
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1';
+                    QUERY PLAN                     
+---------------------------------------------------
+ Bitmap Heap Scan on mcv_lists
+   Recheck Cond: ((a = 1) AND (b = '1'::text))
+   ->  Bitmap Index Scan on mcv_lists_abc_idx
+         Index Cond: ((a = 1) AND (b = '1'::text))
+(4 rows)
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1' AND c = 1;
+                       QUERY PLAN                        
+---------------------------------------------------------
+ Index Scan using mcv_lists_abc_idx on mcv_lists
+   Index Cond: ((a = 1) AND (b = '1'::text) AND (c = 1))
+(2 rows)
+
+-- 100 distinct combinations, all in the MCV list
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i);
+ANALYZE mcv_lists;
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1';
+                   QUERY PLAN                    
+-------------------------------------------------
+ Index Scan using mcv_lists_abc_idx on mcv_lists
+   Index Cond: ((a = 1) AND (b = '1'::text))
+(2 rows)
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a < 1 AND b < '1';
+                   QUERY PLAN                    
+-------------------------------------------------
+ Index Scan using mcv_lists_abc_idx on mcv_lists
+   Index Cond: ((a < 1) AND (b < '1'::text))
+(2 rows)
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1' AND c = 1;
+                       QUERY PLAN                        
+---------------------------------------------------------
+ Index Scan using mcv_lists_abc_idx on mcv_lists
+   Index Cond: ((a = 1) AND (b = '1'::text) AND (c = 1))
+(2 rows)
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a < 5 AND b < '1' AND c < 5;
+                       QUERY PLAN                        
+---------------------------------------------------------
+ Index Scan using mcv_lists_abc_idx on mcv_lists
+   Index Cond: ((a < 5) AND (b < '1'::text) AND (c < 5))
+(2 rows)
+
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, c FROM mcv_lists;
+ANALYZE mcv_lists;
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1';
+                    QUERY PLAN                     
+---------------------------------------------------
+ Bitmap Heap Scan on mcv_lists
+   Recheck Cond: ((a = 1) AND (b = '1'::text))
+   ->  Bitmap Index Scan on mcv_lists_abc_idx
+         Index Cond: ((a = 1) AND (b = '1'::text))
+(4 rows)
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a < 1 AND b < '1';
+                    QUERY PLAN                     
+---------------------------------------------------
+ Bitmap Heap Scan on mcv_lists
+   Recheck Cond: ((a < 1) AND (b < '1'::text))
+   ->  Bitmap Index Scan on mcv_lists_abc_idx
+         Index Cond: ((a < 1) AND (b < '1'::text))
+(4 rows)
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1' AND c = 1;
+                    QUERY PLAN                     
+---------------------------------------------------
+ Bitmap Heap Scan on mcv_lists
+   Recheck Cond: ((a = 1) AND (b = '1'::text))
+   Filter: (c = 1)
+   ->  Bitmap Index Scan on mcv_lists_ab_idx
+         Index Cond: ((a = 1) AND (b = '1'::text))
+(5 rows)
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a < 5 AND b < '1' AND c < 5;
+                    QUERY PLAN                     
+---------------------------------------------------
+ Bitmap Heap Scan on mcv_lists
+   Recheck Cond: ((a < 5) AND (b < '1'::text))
+   Filter: (c < 5)
+   ->  Bitmap Index Scan on mcv_lists_ab_idx
+         Index Cond: ((a < 5) AND (b < '1'::text))
+(5 rows)
+
+-- check change of column type resets the MCV statistics
+ALTER TABLE mcv_lists ALTER COLUMN c TYPE numeric;
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1';
+                   QUERY PLAN                    
+-------------------------------------------------
+ Index Scan using mcv_lists_abc_idx on mcv_lists
+   Index Cond: ((a = 1) AND (b = '1'::text))
+(2 rows)
+
+ANALYZE mcv_lists;
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1';
+                    QUERY PLAN                     
+---------------------------------------------------
+ Bitmap Heap Scan on mcv_lists
+   Recheck Cond: ((a = 1) AND (b = '1'::text))
+   ->  Bitmap Index Scan on mcv_lists_abc_idx
+         Index Cond: ((a = 1) AND (b = '1'::text))
+(4 rows)
+
+-- 100 distinct combinations with NULL values, all in the MCV list
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT
+         (CASE WHEN mod(i,100) = 1 THEN NULL ELSE mod(i,100) END),
+         (CASE WHEN mod(i,50) = 1  THEN NULL ELSE mod(i,50) END),
+         (CASE WHEN mod(i,25) = 1  THEN NULL ELSE mod(i,25) END),
+         i
+     FROM generate_series(1,5000) s(i);
+ANALYZE mcv_lists;
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL;
+                   QUERY PLAN                    
+-------------------------------------------------
+ Index Scan using mcv_lists_abc_idx on mcv_lists
+   Index Cond: ((a IS NULL) AND (b IS NULL))
+(2 rows)
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL AND c IS NULL;
+                   QUERY PLAN                   
+------------------------------------------------
+ Index Scan using mcv_lists_ab_idx on mcv_lists
+   Index Cond: ((a IS NULL) AND (b IS NULL))
+   Filter: (c IS NULL)
+(3 rows)
+
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, c FROM mcv_lists;
+ANALYZE mcv_lists;
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL;
+                    QUERY PLAN                     
+---------------------------------------------------
+ Bitmap Heap Scan on mcv_lists
+   Recheck Cond: ((a IS NULL) AND (b IS NULL))
+   ->  Bitmap Index Scan on mcv_lists_abc_idx
+         Index Cond: ((a IS NULL) AND (b IS NULL))
+(4 rows)
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL AND c IS NULL;
+                    QUERY PLAN                     
+---------------------------------------------------
+ Bitmap Heap Scan on mcv_lists
+   Recheck Cond: ((a IS NULL) AND (b IS NULL))
+   Filter: (c IS NULL)
+   ->  Bitmap Index Scan on mcv_lists_ab_idx
+         Index Cond: ((a IS NULL) AND (b IS NULL))
+(5 rows)
+
+RESET random_page_cost;
diff --git a/src/test/regress/expected/type_sanity.out b/src/test/regress/expected/type_sanity.out
index 7b200ba..5a7c570 100644
--- a/src/test/regress/expected/type_sanity.out
+++ b/src/test/regress/expected/type_sanity.out
@@ -72,8 +72,9 @@ WHERE p1.typtype not in ('c','d','p') AND p1.typname NOT LIKE E'\\_%'
   194 | pg_node_tree
  3361 | pg_ndistinct
  3402 | pg_dependencies
+  441 | pg_mcv_list
   210 | smgr
-(4 rows)
+(5 rows)
 
 -- Make sure typarray points to a varlena array type of our own base
 SELECT p1.oid, p1.typname as basetype, p2.typname as arraytype,
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 46acaad..e9902ce 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -282,3 +282,124 @@ EXPLAIN (COSTS OFF)
  SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1;
 
 RESET random_page_cost;
+
+-- MCV lists
+CREATE TABLE mcv_lists (
+    filler1 TEXT,
+    filler2 NUMERIC,
+    a INT,
+    b TEXT,
+    filler3 DATE,
+    c INT,
+    d TEXT
+);
+
+SET random_page_cost = 1.2;
+
+CREATE INDEX mcv_lists_ab_idx ON mcv_lists (a, b);
+CREATE INDEX mcv_lists_abc_idx ON mcv_lists (a, b, c);
+
+-- random data (no MCV list)
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT mod(i,37), mod(i,41), mod(i,43), mod(i,47) FROM generate_series(1,5000) s(i);
+
+ANALYZE mcv_lists;
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1';
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1' AND c = 1;
+
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, c FROM mcv_lists;
+
+ANALYZE mcv_lists;
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1';
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1' AND c = 1;
+
+-- 100 distinct combinations, all in the MCV list
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT mod(i,100), mod(i,50), mod(i,25), i FROM generate_series(1,5000) s(i);
+
+ANALYZE mcv_lists;
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1';
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a < 1 AND b < '1';
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1' AND c = 1;
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a < 5 AND b < '1' AND c < 5;
+
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, c FROM mcv_lists;
+
+ANALYZE mcv_lists;
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1';
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a < 1 AND b < '1';
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1' AND c = 1;
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a < 5 AND b < '1' AND c < 5;
+
+-- check change of column type resets the MCV statistics
+ALTER TABLE mcv_lists ALTER COLUMN c TYPE numeric;
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1';
+
+ANALYZE mcv_lists;
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a = 1 AND b = '1';
+
+-- 100 distinct combinations with NULL values, all in the MCV list
+TRUNCATE mcv_lists;
+DROP STATISTICS mcv_lists_stats;
+
+INSERT INTO mcv_lists (a, b, c, filler1)
+     SELECT
+         (CASE WHEN mod(i,100) = 1 THEN NULL ELSE mod(i,100) END),
+         (CASE WHEN mod(i,50) = 1  THEN NULL ELSE mod(i,50) END),
+         (CASE WHEN mod(i,25) = 1  THEN NULL ELSE mod(i,25) END),
+         i
+     FROM generate_series(1,5000) s(i);
+
+ANALYZE mcv_lists;
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL;
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL AND c IS NULL;
+
+-- create statistics
+CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, c FROM mcv_lists;
+
+ANALYZE mcv_lists;
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL;
+
+EXPLAIN (COSTS OFF)
+ SELECT * FROM mcv_lists WHERE a IS NULL AND b IS NULL AND c IS NULL;
+
+RESET random_page_cost;
-- 
2.9.4

