From 7c8f0ce0017beea314219c24146cbb64d0d37a3d Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tv@fuzzy.cz>
Date: Sun, 11 Jan 2015 19:51:48 +0100
Subject: [PATCH 1/5] shared infrastructure and functional dependencies

Basic infrastructure shared by all kinds of multivariate
stats, most importantly:

- adds a new system catalog (pg_mv_statistic)
- ALTER TABLE ... ADD STATISTICS syntax
- implementation of functional dependencies (the simplest
  type of multivariate statistics)
- building functional dependencies in ANALYZE
- updates regression tests (new catalog etc.)

This does not include any changes to the optimizer, i.e.
it does not influence the query planning (subject to
follow-up patches).

The current implementation requires a valid 'ltopr' for
the columns, so that we can sort the sample rows in various
ways, both in this patch and other kinds of statistics.
Maybe this restriction could be relaxed in the future,
requiring just 'eqopr' in case of stats not sorting the
data (e.g. functional dependencies and MCV lists).

The algorithm detecting the dependencies is rather simple
and probably needs improvements.

The name 'functional dependencies' is more correct (than
'association rules') as it's exactly the name used in
relational theory (esp. Normal Forms) for tracking
column-level dependencies.
---
 src/backend/catalog/Makefile               |   1 +
 src/backend/catalog/system_views.sql       |  10 +
 src/backend/commands/analyze.c             |  20 +-
 src/backend/commands/tablecmds.c           | 149 ++++++-
 src/backend/nodes/copyfuncs.c              |  14 +
 src/backend/parser/gram.y                  |  67 ++-
 src/backend/utils/Makefile                 |   2 +-
 src/backend/utils/cache/syscache.c         |  12 +
 src/backend/utils/mvstats/Makefile         |  17 +
 src/backend/utils/mvstats/common.c         | 342 +++++++++++++++
 src/backend/utils/mvstats/common.h         |  75 ++++
 src/backend/utils/mvstats/dependencies.c   | 680 +++++++++++++++++++++++++++++
 src/include/catalog/indexing.h             |   5 +
 src/include/catalog/pg_mv_statistic.h      |  69 +++
 src/include/catalog/pg_proc.h              |   5 +
 src/include/catalog/toasting.h             |   1 +
 src/include/nodes/nodes.h                  |   1 +
 src/include/nodes/parsenodes.h             |  11 +-
 src/include/utils/mvstats.h                |  86 ++++
 src/include/utils/syscache.h               |   1 +
 src/test/regress/expected/rules.out        |   8 +
 src/test/regress/expected/sanity_check.out |   1 +
 22 files changed, 1569 insertions(+), 8 deletions(-)
 create mode 100644 src/backend/utils/mvstats/Makefile
 create mode 100644 src/backend/utils/mvstats/common.c
 create mode 100644 src/backend/utils/mvstats/common.h
 create mode 100644 src/backend/utils/mvstats/dependencies.c
 create mode 100644 src/include/catalog/pg_mv_statistic.h
 create mode 100644 src/include/utils/mvstats.h

diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile
index a403c64..d6c16f8 100644
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -32,6 +32,7 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\
 	pg_attrdef.h pg_constraint.h pg_inherits.h pg_index.h pg_operator.h \
 	pg_opfamily.h pg_opclass.h pg_am.h pg_amop.h pg_amproc.h \
 	pg_language.h pg_largeobject_metadata.h pg_largeobject.h pg_aggregate.h \
+	pg_mv_statistic.h \
 	pg_statistic.h pg_rewrite.h pg_trigger.h pg_event_trigger.h pg_description.h \
 	pg_cast.h pg_enum.h pg_namespace.h pg_conversion.h pg_depend.h \
 	pg_database.h pg_db_role_setting.h pg_tablespace.h pg_pltemplate.h \
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 2800f73..d05a716 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -150,6 +150,16 @@ CREATE VIEW pg_indexes AS
          LEFT JOIN pg_tablespace T ON (T.oid = I.reltablespace)
     WHERE C.relkind IN ('r', 'm') AND I.relkind = 'i';
 
+CREATE VIEW pg_mv_stats AS
+    SELECT
+        N.nspname AS schemaname,
+        C.relname AS tablename,
+        S.stakeys AS attnums,
+        length(S.stadeps) as depsbytes,
+        pg_mv_stats_dependencies_info(S.stadeps) as depsinfo
+    FROM (pg_mv_statistic S JOIN pg_class C ON (C.oid = S.starelid))
+        LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace);
+
 CREATE VIEW pg_stats AS
     SELECT
         nspname AS schemaname,
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index d4d1914..f82fcf5 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -27,6 +27,7 @@
 #include "catalog/indexing.h"
 #include "catalog/pg_collation.h"
 #include "catalog/pg_inherits_fn.h"
+#include "catalog/pg_mv_statistic.h"
 #include "catalog/pg_namespace.h"
 #include "commands/dbcommands.h"
 #include "commands/tablecmds.h"
@@ -54,7 +55,11 @@
 #include "utils/syscache.h"
 #include "utils/timestamp.h"
 #include "utils/tqual.h"
+#include "utils/fmgroids.h"
+#include "utils/builtins.h"
 
+#include "utils/mvstats.h"
+#include "access/sysattr.h"
 
 /* Data structure for Algorithm S from Knuth 3.4.2 */
 typedef struct
@@ -110,7 +115,6 @@ static void update_attstats(Oid relid, bool inh,
 static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
 static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
 
-
 /*
  *	analyze_rel() -- analyze one relation
  */
@@ -471,6 +475,17 @@ do_analyze_rel(Relation onerel, int options, List *va_cols,
 	 * all analyzable columns.  We use a lower bound of 100 rows to avoid
 	 * possible overflow in Vitter's algorithm.  (Note: that will also be the
 	 * target in the corner case where there are no analyzable columns.)
+	 *
+	 * FIXME This sample sizing is mostly OK when computing stats for
+	 *       individual columns, but when computing multivariate stats
+	 *       (histograms, mcv, ...) it's rather
+	 *       insufficient. For stats on multiple columns / complex stats
+	 *       we need larger sample sizes, and in some cases samples
+	 *       proportional to the table (say, 0.5% - 1%) instead of a
+	 *       fixed size might be more appropriate. Also, this should be
+	 *       bound to the requested statistics size - e.g. number of MCV
+	 *       items or histogram buckets should require several sample
+	 *       rows per item/bucket (so the sample should be k*size).
 	 */
 	targrows = 100;
 	for (i = 0; i < attr_cnt; i++)
@@ -573,6 +588,9 @@ do_analyze_rel(Relation onerel, int options, List *va_cols,
 			update_attstats(RelationGetRelid(Irel[ind]), false,
 							thisdata->attr_cnt, thisdata->vacattrstats);
 		}
+
+		/* Build multivariate stats (if there are any). */
+		build_mv_stats(onerel, numrows, rows, attr_cnt, vacattrstats);
 	}
 
 	/*
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 002319e..a321755 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -35,6 +35,7 @@
 #include "catalog/pg_foreign_table.h"
 #include "catalog/pg_inherits.h"
 #include "catalog/pg_inherits_fn.h"
+#include "catalog/pg_mv_statistic.h"
 #include "catalog/pg_namespace.h"
 #include "catalog/pg_opclass.h"
 #include "catalog/pg_tablespace.h"
@@ -92,7 +93,7 @@
 #include "utils/syscache.h"
 #include "utils/tqual.h"
 #include "utils/typcache.h"
-
+#include "utils/mvstats.h"
 
 /*
  * ON COMMIT action list
@@ -140,8 +141,9 @@ static List *on_commits = NIL;
 #define AT_PASS_ADD_COL			5		/* ADD COLUMN */
 #define AT_PASS_ADD_INDEX		6		/* ADD indexes */
 #define AT_PASS_ADD_CONSTR		7		/* ADD constraints, defaults */
-#define AT_PASS_MISC			8		/* other stuff */
-#define AT_NUM_PASSES			9
+#define AT_PASS_ADD_STATS		8		/* ADD statistics */
+#define AT_PASS_MISC			9		/* other stuff */
+#define AT_NUM_PASSES			10
 
 typedef struct AlteredTableInfo
 {
@@ -416,7 +418,8 @@ static void ATExecReplicaIdentity(Relation rel, ReplicaIdentityStmt *stmt, LOCKM
 static void ATExecGenericOptions(Relation rel, List *options);
 static void ATExecEnableRowSecurity(Relation rel);
 static void ATExecDisableRowSecurity(Relation rel);
-
+static void ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
+								StatisticsDef *def, LOCKMODE lockmode);
 static void copy_relation_data(SMgrRelation rel, SMgrRelation dst,
 				   ForkNumber forkNum, char relpersistence);
 static const char *storage_name(char c);
@@ -2999,6 +3002,7 @@ AlterTableGetLockLevel(List *cmds)
 				 * updates.
 				 */
 			case AT_SetStatistics:		/* Uses MVCC in getTableAttrs() */
+			case AT_AddStatistics:		/* XXX not sure if the right level */
 			case AT_ClusterOn:	/* Uses MVCC in getIndexes() */
 			case AT_DropCluster:		/* Uses MVCC in getIndexes() */
 			case AT_SetOptions:	/* Uses MVCC in getTableAttrs() */
@@ -3155,6 +3159,7 @@ ATPrepCmd(List **wqueue, Relation rel, AlterTableCmd *cmd,
 			pass = AT_PASS_ADD_CONSTR;
 			break;
 		case AT_SetStatistics:	/* ALTER COLUMN SET STATISTICS */
+		case AT_AddStatistics:	/* XXX maybe not the right place */
 			ATSimpleRecursion(wqueue, rel, cmd, recurse, lockmode);
 			/* Performs own permission checks */
 			ATPrepSetStatistics(rel, cmd->name, cmd->def, lockmode);
@@ -3457,6 +3462,9 @@ ATExecCmd(List **wqueue, AlteredTableInfo *tab, Relation rel,
 		case AT_SetStatistics:	/* ALTER COLUMN SET STATISTICS */
 			address = ATExecSetStatistics(rel, cmd->name, cmd->def, lockmode);
 			break;
+		case AT_AddStatistics:		/* ADD STATISTICS */
+			ATExecAddStatistics(tab, rel, (StatisticsDef *) cmd->def, lockmode);
+			break;
 		case AT_SetOptions:		/* ALTER COLUMN SET ( options ) */
 			address = ATExecSetOptions(rel, cmd->name, cmd->def, false, lockmode);
 			break;
@@ -11854,3 +11862,136 @@ RangeVarCallbackForAlterRelation(const RangeVar *rv, Oid relid, Oid oldrelid,
 
 	ReleaseSysCache(tuple);
 }
+
+/* used for sorting the attnums in ATExecAddStatistics */
+static int compare_int16(const void *a, const void *b)
+{
+	return memcmp(a, b, sizeof(int16));
+}
+
+/*
+ * Implements the ALTER TABLE ... ADD STATISTICS (options) ON (columns).
+ *
+ * The code is an unholy mix of pieces that really belong to other parts
+ * of the source tree.
+ *
+ * FIXME Check that the types are pass-by-value and support sort,
+ *       although maybe we can live without the sort (and only build
+ *       MCV list / association rules).
+ *
+ * FIXME This should probably check for duplicate stats (i.e. same
+ *       keys, same options). Although maybe it's useful to have
+ *       multiple stats on the same columns with different options
+ *       (say, a detailed MCV-only stats for some queries, histogram
+ *       for others, etc.)
+ */
+static void ATExecAddStatistics(AlteredTableInfo *tab, Relation rel,
+						StatisticsDef *def, LOCKMODE lockmode)
+{
+	int			i, j;
+	ListCell   *l;
+	int16		attnums[INDEX_MAX_KEYS];
+	int			numcols = 0;
+
+	HeapTuple	htup;
+	Datum		values[Natts_pg_mv_statistic];
+	bool		nulls[Natts_pg_mv_statistic];
+	int2vector *stakeys;
+	Relation	mvstatrel;
+
+	/* by default build everything */
+	bool 	build_dependencies = true;
+
+	Assert(IsA(def, StatisticsDef));
+
+	/* transform the column names to attnum values */
+
+	foreach(l, def->keys)
+	{
+		char	   *attname = strVal(lfirst(l));
+		HeapTuple	atttuple;
+
+		atttuple = SearchSysCacheAttName(RelationGetRelid(rel), attname);
+
+		if (!HeapTupleIsValid(atttuple))
+			ereport(ERROR,
+					(errcode(ERRCODE_UNDEFINED_COLUMN),
+					 errmsg("column \"%s\" referenced in statistics does not exist",
+							attname)));
+
+		/* more than MVSTATS_MAX_DIMENSIONS columns not allowed */
+		if (numcols >= MVSTATS_MAX_DIMENSIONS)
+			ereport(ERROR,
+					(errcode(ERRCODE_TOO_MANY_COLUMNS),
+					 errmsg("cannot have more than %d keys in a statistics",
+							MVSTATS_MAX_DIMENSIONS)));
+
+		attnums[numcols] = ((Form_pg_attribute) GETSTRUCT(atttuple))->attnum;
+		ReleaseSysCache(atttuple);
+		numcols++;
+	}
+
+	/*
+	 * Check the lower bound (at least 2 columns), the upper bound was
+	 * already checked in the loop.
+	 */
+	if (numcols < 2)
+			ereport(ERROR,
+					(errcode(ERRCODE_TOO_MANY_COLUMNS),
+					 errmsg("multivariate stats require 2 or more columns")));
+
+	/* look for duplicities */
+	for (i = 0; i < numcols; i++)
+		for (j = 0; j < numcols; j++)
+			if ((i != j) && (attnums[i] == attnums[j]))
+				ereport(ERROR,
+						(errcode(ERRCODE_UNDEFINED_COLUMN),
+						 errmsg("duplicate column name in statistics definition")));
+
+	/* parse the statistics options */
+	foreach (l, def->options)
+	{
+		DefElem *opt = (DefElem*)lfirst(l);
+
+		if (strcmp(opt->defname, "dependencies") == 0)
+			build_dependencies = defGetBoolean(opt);
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_SYNTAX_ERROR),
+					 errmsg("unrecognized STATISTICS option \"%s\"",
+							opt->defname)));
+	}
+
+	/* sort the attnums and build int2vector */
+	qsort(attnums, numcols, sizeof(int16), compare_int16);
+	stakeys = buildint2vector(attnums, numcols);
+
+	/*
+	 * Okay, let's create the pg_mv_statistic entry.
+	 */
+	memset(values, 0, sizeof(values));
+	memset(nulls, false, sizeof(nulls));
+
+	/* no stats collected yet, so just the keys */
+	values[Anum_pg_mv_statistic_starelid-1] = ObjectIdGetDatum(RelationGetRelid(rel));
+
+	values[Anum_pg_mv_statistic_stakeys -1] = PointerGetDatum(stakeys);
+	values[Anum_pg_mv_statistic_deps_enabled -1] = BoolGetDatum(build_dependencies);
+
+	nulls[Anum_pg_mv_statistic_stadeps -1] = true;
+
+	/* insert the tuple into pg_mv_statistic */
+	mvstatrel = heap_open(MvStatisticRelationId, RowExclusiveLock);
+
+	htup = heap_form_tuple(mvstatrel->rd_att, values, nulls);
+
+	simple_heap_insert(mvstatrel, htup);
+
+	CatalogUpdateIndexes(mvstatrel, htup);
+
+	heap_freetuple(htup);
+
+	heap_close(mvstatrel, RowExclusiveLock);
+
+	return;
+}
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 029761e..a4ce2c9 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -3918,6 +3918,17 @@ _copyAlterPolicyStmt(const AlterPolicyStmt *from)
 	return newnode;
 }
 
+static StatisticsDef *
+_copyStatisticsDef(const StatisticsDef *from)
+{
+	StatisticsDef  *newnode = makeNode(StatisticsDef);
+
+	COPY_NODE_FIELD(keys);
+	COPY_NODE_FIELD(options);
+
+	return newnode;
+}
+
 /* ****************************************************************
  *					pg_list.h copy functions
  * ****************************************************************
@@ -4732,6 +4743,9 @@ copyObject(const void *from)
 		case T_CommonTableExpr:
 			retval = _copyCommonTableExpr(from);
 			break;
+		case T_StatisticsDef:
+			retval = _copyStatisticsDef(from);
+			break;
 		case T_FuncWithArgs:
 			retval = _copyFuncWithArgs(from);
 			break;
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 3aa9e42..17183ef 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -367,6 +367,13 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 				create_generic_options alter_generic_options
 				relation_expr_list dostmt_opt_list
 
+%type <list>	OptStatsOptions
+%type <str>		stats_options_name
+%type <node>	stats_options_arg
+%type <defelt>	stats_options_elem
+%type <list>	stats_options_list
+
+
 %type <list>	opt_fdw_options fdw_options
 %type <defelt>	fdw_option
 
@@ -486,7 +493,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query);
 %type <keyword> unreserved_keyword type_func_name_keyword
 %type <keyword> col_name_keyword reserved_keyword
 
-%type <node>	TableConstraint TableLikeClause
+%type <node>	TableConstraint TableLikeClause TableStatistics
 %type <ival>	TableLikeOptionList TableLikeOption
 %type <list>	ColQualList
 %type <node>	ColConstraint ColConstraintElem ConstraintAttr
@@ -2311,6 +2318,14 @@ alter_table_cmd:
 					n->subtype = AT_DisableRowSecurity;
 					$$ = (Node *)n;
 				}
+			/* ALTER TABLE <name> ADD STATISTICS (options) ON (columns) ... */
+			| ADD_P TableStatistics
+				{
+					AlterTableCmd *n = makeNode(AlterTableCmd);
+					n->subtype = AT_AddStatistics;
+					n->def = $2;
+					$$ = (Node *)n;
+				}
 			| alter_generic_options
 				{
 					AlterTableCmd *n = makeNode(AlterTableCmd);
@@ -3385,6 +3400,56 @@ OptConsTableSpace:   USING INDEX TABLESPACE name	{ $$ = $4; }
 ExistingIndex:   USING INDEX index_name				{ $$ = $3; }
 		;
 
+/*****************************************************************************
+ *
+ *		QUERY :
+ *				ALTER TABLE relname ADD STATISTICS (options) ON (columns)
+ *
+ *****************************************************************************/
+
+TableStatistics:
+			STATISTICS OptStatsOptions ON '(' columnList ')'
+				{
+					StatisticsDef *n = makeNode(StatisticsDef);
+					n->keys  = $5;
+					n->options  = $2;
+					$$ = (Node *) n;
+				}
+		;
+
+OptStatsOptions:
+			'(' stats_options_list ')'		{ $$ = $2; }
+			| /*EMPTY*/						{ $$ = NIL; }
+		;
+
+stats_options_list:
+			stats_options_elem
+				{
+					$$ = list_make1($1);
+				}
+			| stats_options_list ',' stats_options_elem
+				{
+					$$ = lappend($1, $3);
+				}
+		;
+
+stats_options_elem:
+			stats_options_name stats_options_arg
+				{
+					$$ = makeDefElem($1, $2);
+				}
+		;
+
+stats_options_name:
+			NonReservedWord			{ $$ = $1; }
+		;
+
+stats_options_arg:
+			opt_boolean_or_string	{ $$ = (Node *) makeString($1); }
+			| NumericOnly			{ $$ = (Node *) $1; }
+			| /* EMPTY */			{ $$ = NULL; }
+		;
+
 
 /*****************************************************************************
  *
diff --git a/src/backend/utils/Makefile b/src/backend/utils/Makefile
index 8374533..eba0352 100644
--- a/src/backend/utils/Makefile
+++ b/src/backend/utils/Makefile
@@ -9,7 +9,7 @@ top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
 OBJS        = fmgrtab.o
-SUBDIRS     = adt cache error fmgr hash init mb misc mmgr resowner sort time
+SUBDIRS     = adt cache error fmgr hash init mb misc mmgr mvstats resowner sort time
 
 # location of Catalog.pm
 catalogdir  = $(top_srcdir)/src/backend/catalog
diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c
index bd27168..f61ef7e 100644
--- a/src/backend/utils/cache/syscache.c
+++ b/src/backend/utils/cache/syscache.c
@@ -43,6 +43,7 @@
 #include "catalog/pg_foreign_server.h"
 #include "catalog/pg_foreign_table.h"
 #include "catalog/pg_language.h"
+#include "catalog/pg_mv_statistic.h"
 #include "catalog/pg_namespace.h"
 #include "catalog/pg_opclass.h"
 #include "catalog/pg_operator.h"
@@ -499,6 +500,17 @@ static const struct cachedesc cacheinfo[] = {
 		},
 		4
 	},
+	{MvStatisticRelationId,		/* MVSTATOID */
+		MvStatisticOidIndexId,
+		1,
+		{
+			ObjectIdAttributeNumber,
+			0,
+			0,
+			0
+		},
+		128
+	},
 	{NamespaceRelationId,		/* NAMESPACENAME */
 		NamespaceNameIndexId,
 		1,
diff --git a/src/backend/utils/mvstats/Makefile b/src/backend/utils/mvstats/Makefile
new file mode 100644
index 0000000..099f1ed
--- /dev/null
+++ b/src/backend/utils/mvstats/Makefile
@@ -0,0 +1,17 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for utils/mvstats
+#
+# IDENTIFICATION
+#    src/backend/utils/mvstats/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/utils/mvstats
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = common.o dependencies.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mvstats/common.c b/src/backend/utils/mvstats/common.c
new file mode 100644
index 0000000..8efc5ba
--- /dev/null
+++ b/src/backend/utils/mvstats/common.c
@@ -0,0 +1,342 @@
+/*-------------------------------------------------------------------------
+ *
+ * common.c
+ *	  POSTGRES multivariate statistics
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mvstats/common.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "common.h"
+
+static VacAttrStats ** lookup_var_attr_stats(int2vector *attrs,
+									  int natts, VacAttrStats **vacattrstats);
+
+
+/*
+ * Compute requested multivariate stats, using the rows sampled for the
+ * plain (single-column) stats.
+ *
+ * This fetches a list of stats from pg_mv_statistic, computes the stats
+ * and serializes them back into the catalog (as bytea values).
+ */
+void
+build_mv_stats(Relation onerel, int numrows, HeapTuple *rows,
+			   int natts, VacAttrStats **vacattrstats)
+{
+	int i;
+	MVStats mvstats;
+	int		nmvstats;
+
+	/*
+	 * Fetch defined MV groups from pg_mv_statistic, and then compute
+	 * the MV statistics (histograms for now).
+	 */
+	mvstats = list_mv_stats(RelationGetRelid(onerel), &nmvstats, false);
+
+	for (i = 0; i < nmvstats; i++)
+	{
+		MVDependencies	deps  = NULL;
+
+		/* int2 vector of attnums the stats should be computed on */
+		int2vector * attrs = mvstats[i].stakeys;
+ 
+		/* filter only the interesting vacattrstats records */
+		VacAttrStats **stats = lookup_var_attr_stats(attrs, natts, vacattrstats);
+
+		/* check allowed number of dimensions */
+		Assert((attrs->dim1 >= 2) && (attrs->dim1 <= MVSTATS_MAX_DIMENSIONS));
+
+		/*
+		 * Analyze functional dependencies of columns.
+		 */
+		deps = build_mv_dependencies(numrows, rows, attrs, stats);
+
+		/* store the histogram / MCV list in the catalog */
+		update_mv_stats(mvstats[i].mvoid, deps);
+	}
+}
+
+/*
+ * Lookup the VacAttrStats info for the selected columns, with indexes
+ * matching the attrs vector (to make it easy to work with when
+ * computing multivariate stats).
+ */
+static VacAttrStats **
+lookup_var_attr_stats(int2vector *attrs, int natts, VacAttrStats **vacattrstats)
+{
+	int i, j;
+	int numattrs = attrs->dim1;
+	VacAttrStats **stats = (VacAttrStats**)palloc0(numattrs * sizeof(VacAttrStats*));
+
+	/* lookup VacAttrStats info for the requested columns (same attnum) */
+	for (i = 0; i < numattrs; i++)
+	{
+		stats[i] = NULL;
+		for (j = 0; j < natts; j++)
+		{
+			if (attrs->values[i] == vacattrstats[j]->tupattnum)
+			{
+				stats[i] = vacattrstats[j];
+				break;
+			}
+		}
+
+		/*
+		 * Check that we found the info, that the attnum matches and
+		 * that there's the requested 'lt' operator and that the type
+		 * is 'passed-by-value'.
+		 */
+		Assert(stats[i] != NULL);
+		Assert(stats[i]->tupattnum == attrs->values[i]);
+
+		/* FIXME This is rather ugly way to check for 'ltopr' (which
+		 *       is defined for 'scalar' attributes).
+		 */
+		Assert(((StdAnalyzeData *)stats[i]->extra_data)->ltopr != InvalidOid);
+	}
+
+	return stats;
+}
+
+/*
+ * Fetch list of MV stats defined on a table, without the actual data
+ * for histograms, MCV lists etc.
+ */
+MVStats
+list_mv_stats(Oid relid, int *nstats, bool built_only)
+{
+	Relation	indrel;
+	SysScanDesc indscan;
+	ScanKeyData skey;
+	HeapTuple	htup;
+	MVStats		result;
+
+	/* start with 16 items, that should be enough for most cases */
+	int maxitems = 16;
+	result = (MVStats)palloc0(sizeof(MVStatsData) * maxitems);
+	*nstats = 0;
+
+	/* Prepare to scan pg_mv_statistic for entries having indrelid = this rel. */
+	ScanKeyInit(&skey,
+				Anum_pg_mv_statistic_starelid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(relid));
+
+	indrel = heap_open(MvStatisticRelationId, AccessShareLock);
+	indscan = systable_beginscan(indrel, MvStatisticRelidIndexId, true,
+								 NULL, 1, &skey);
+
+	while (HeapTupleIsValid(htup = systable_getnext(indscan)))
+	{
+		Form_pg_mv_statistic stats = (Form_pg_mv_statistic) GETSTRUCT(htup);
+
+		/*
+		 * Skip statistics that were not computed yet (if only stats
+		 * that were already built were requested)
+		 */
+		if (built_only && (! stats->deps_built))
+			continue;
+
+		/* double the array size if needed */
+		if (*nstats == maxitems)
+		{
+			maxitems *= 2;
+			result = (MVStats)repalloc(result, sizeof(MVStatsData) * maxitems);
+		}
+
+		result[*nstats].mvoid = HeapTupleGetOid(htup);
+		result[*nstats].stakeys = buildint2vector(stats->stakeys.values, stats->stakeys.dim1);
+		result[*nstats].deps_built = stats->deps_built;
+		*nstats += 1;
+	}
+
+	systable_endscan(indscan);
+
+	heap_close(indrel, AccessShareLock);
+
+	/* TODO maybe save the list into relcache, as in RelationGetIndexList
+	 *      (which was used as an inspiration for this one). */
+
+	return result;
+}
+
+void
+update_mv_stats(Oid mvoid, MVDependencies dependencies)
+{
+	HeapTuple	stup,
+				oldtup;
+	Datum		values[Natts_pg_mv_statistic];
+	bool		nulls[Natts_pg_mv_statistic];
+	bool		replaces[Natts_pg_mv_statistic];
+
+	Relation	sd = heap_open(MvStatisticRelationId, RowExclusiveLock);
+
+	memset(nulls,    1, Natts_pg_mv_statistic * sizeof(bool));
+	memset(replaces, 0, Natts_pg_mv_statistic * sizeof(bool));
+	memset(values,   0, Natts_pg_mv_statistic * sizeof(Datum));
+
+	/*
+	 * Construct a new pg_mv_statistic tuple - replace only the histogram
+	 * and MCV list, depending whether it actually was computed.
+	 */
+	if (dependencies != NULL)
+	{
+		nulls[Anum_pg_mv_statistic_stadeps -1]    = false;
+		values[Anum_pg_mv_statistic_stadeps  - 1]
+			= PointerGetDatum(serialize_mv_dependencies(dependencies));
+	}
+
+	/* always replace the value (either by bytea or NULL) */
+	replaces[Anum_pg_mv_statistic_stadeps -1] = true;
+
+	/* always change the availability flags */
+	nulls[Anum_pg_mv_statistic_deps_built -1] = false;
+
+	replaces[Anum_pg_mv_statistic_deps_built-1] = true;
+
+	values[Anum_pg_mv_statistic_deps_built-1] = BoolGetDatum(dependencies != NULL);
+
+	/* Is there already a pg_mv_statistic tuple for this attribute? */
+	oldtup = SearchSysCache1(MVSTATOID,
+							 ObjectIdGetDatum(mvoid));
+
+	if (HeapTupleIsValid(oldtup))
+	{
+		/* Yes, replace it */
+		stup = heap_modify_tuple(oldtup,
+								 RelationGetDescr(sd),
+								 values,
+								 nulls,
+								 replaces);
+		ReleaseSysCache(oldtup);
+		simple_heap_update(sd, &stup->t_self, stup);
+	}
+	else
+		elog(ERROR, "invalid pg_mv_statistic record (oid=%d)", mvoid);
+
+	/* update indexes too */
+	CatalogUpdateIndexes(sd, stup);
+
+	heap_freetuple(stup);
+
+	heap_close(sd, RowExclusiveLock);
+}
+
+/* multi-variate stats comparator */
+
+/*
+ * qsort_arg comparator for sorting Datums (MV stats)
+ *
+ * This does not maintain the tupnoLink array.
+ */
+int
+compare_scalars_simple(const void *a, const void *b, void *arg)
+{
+	Datum		da = *(Datum*)a;
+	Datum		db = *(Datum*)b;
+	SortSupport ssup= (SortSupport) arg;
+
+	return ApplySortComparator(da, false, db, false, ssup);
+}
+
+/*
+ * qsort_arg comparator for sorting data when partitioning a MV bucket
+ */
+int
+compare_scalars_partition(const void *a, const void *b, void *arg)
+{
+	Datum		da = ((ScalarItem*)a)->value;
+	Datum		db = ((ScalarItem*)b)->value;
+	SortSupport ssup= (SortSupport) arg;
+
+	return ApplySortComparator(da, false, db, false, ssup);
+}
+
+/* initialize multi-dimensional sort */
+MultiSortSupport
+multi_sort_init(int ndims)
+{
+	MultiSortSupport mss;
+
+	Assert(ndims >= 2);
+
+	mss = (MultiSortSupport)palloc0(offsetof(MultiSortSupportData, ssup)
+									+ sizeof(SortSupportData)*ndims);
+
+	mss->ndims = ndims;
+
+	return mss;
+}
+
+/*
+ * add sort info for dimension 'dim' (index into vacattrstats) to mss,
+ * at the position 'sortattr'
+ */
+void
+multi_sort_add_dimension(MultiSortSupport mss, int sortdim,
+						int dim, VacAttrStats **vacattrstats)
+{
+	/* first, lookup StdAnalyzeData for the dimension (attribute) */
+	SortSupportData ssup;
+	StdAnalyzeData *tmp = (StdAnalyzeData *)vacattrstats[dim]->extra_data;
+
+	Assert(mss != NULL);
+	Assert(sortdim < mss->ndims);
+
+	/* initialize sort support, etc. */
+	memset(&ssup, 0, sizeof(ssup));
+	ssup.ssup_cxt = CurrentMemoryContext;
+
+	/* We always use the default collation for statistics */
+	ssup.ssup_collation = DEFAULT_COLLATION_OID;
+	ssup.ssup_nulls_first = false;
+
+	PrepareSortSupportFromOrderingOp(tmp->ltopr, &ssup);
+
+	mss->ssup[sortdim] = ssup;
+}
+
+/* compare all the dimensions in the selected order */
+int
+multi_sort_compare(const void *a, const void *b, void *arg)
+{
+	int i;
+	SortItem *ia = (SortItem*)a;
+	SortItem *ib = (SortItem*)b;
+
+	MultiSortSupport mss = (MultiSortSupport)arg;
+
+	for (i = 0; i < mss->ndims; i++)
+	{
+		int	compare;
+
+		compare = ApplySortComparator(ia->values[i], ia->isnull[i],
+									  ib->values[i], ib->isnull[i],
+									  &mss->ssup[i]);
+
+		if (compare != 0)
+			return compare;
+
+	}
+
+	/* equal by default */
+	return 0;
+}
+
+/* compare selected dimension */
+int
+multi_sort_compare_dim(int dim, const SortItem *a, const SortItem *b,
+					   MultiSortSupport mss)
+{
+	return ApplySortComparator(a->values[dim], a->isnull[dim],
+							   b->values[dim], b->isnull[dim],
+							   &mss->ssup[dim]);
+}
diff --git a/src/backend/utils/mvstats/common.h b/src/backend/utils/mvstats/common.h
new file mode 100644
index 0000000..6d5465b
--- /dev/null
+++ b/src/backend/utils/mvstats/common.h
@@ -0,0 +1,75 @@
+/*-------------------------------------------------------------------------
+ *
+ * common.h
+ *	  POSTGRES multivariate statistics
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mvstats/common.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/tuptoaster.h"
+#include "catalog/indexing.h"
+#include "catalog/pg_collation.h"
+#include "catalog/pg_mv_statistic.h"
+#include "foreign/fdwapi.h"
+#include "postmaster/autovacuum.h"
+#include "storage/lmgr.h"
+#include "utils/datum.h"
+#include "utils/sortsupport.h"
+#include "utils/syscache.h"
+#include "utils/fmgroids.h"
+#include "utils/builtins.h"
+#include "access/sysattr.h"
+
+#include "utils/mvstats.h"
+
+/* FIXME private structure copied from analyze.c */
+
+typedef struct
+{
+	Oid			eqopr;			/* '=' operator for datatype, if any */
+	Oid			eqfunc;			/* and associated function */
+	Oid			ltopr;			/* '<' operator for datatype, if any */
+} StdAnalyzeData;
+
+typedef struct
+{
+	Datum		value;			/* a data value */
+	int			tupno;			/* position index for tuple it came from */
+} ScalarItem;
+ 
+/* multi-sort */
+typedef struct MultiSortSupportData {
+	int				ndims;		/* number of dimensions supported by the */
+	SortSupportData	ssup[1];	/* sort support data for each dimension */
+} MultiSortSupportData;
+
+typedef MultiSortSupportData* MultiSortSupport;
+
+typedef struct SortItem {
+	Datum  *values;
+	bool   *isnull;
+} SortItem;
+
+MultiSortSupport multi_sort_init(int ndims);
+
+void multi_sort_add_dimension(MultiSortSupport mss, int sortdim,
+							  int dim, VacAttrStats **vacattrstats);
+
+int multi_sort_compare(const void *a, const void *b, void *arg);
+
+int multi_sort_compare_dim(int dim, const SortItem *a,
+						   const SortItem *b, MultiSortSupport mss);
+
+/* comparators, used when constructing multivariate stats */
+int compare_scalars_simple(const void *a, const void *b, void *arg);
+int compare_scalars_partition(const void *a, const void *b, void *arg);
diff --git a/src/backend/utils/mvstats/dependencies.c b/src/backend/utils/mvstats/dependencies.c
new file mode 100644
index 0000000..9e7f294
--- /dev/null
+++ b/src/backend/utils/mvstats/dependencies.c
@@ -0,0 +1,680 @@
+/*-------------------------------------------------------------------------
+ *
+ * dependencies.c
+ *	  POSTGRES multivariate functional dependencies
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mvstats/dependencies.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "common.h"
+#include "utils/lsyscache.h"
+
+/*
+ * Mine functional dependencies between columns, in the form (A => B),
+ * meaning that a value in column 'A' determines value in 'B'. A simple
+ * artificial example may be a table created like this
+ *
+ *     CREATE TABLE deptest (a INT, b INT)
+ *        AS SELECT i, i/10 FROM generate_series(1,100000) s(i);
+ *
+ * Clearly, once we know the value for 'A' we can easily determine the
+ * value of 'B' by dividing (A/10). A more practical example may be
+ * addresses, where (ZIP code => city name), i.e. once we know the ZIP,
+ * we probably know which city it belongs to. Larger cities usually have
+ * multiple ZIP codes, so the dependency can't be reversed.
+ *
+ * Functional dependencies are a concept well described in relational
+ * theory, especially in definition of normalization and "normal forms".
+ * Wikipedia has a nice definition of a functional dependency [1]:
+ *
+ *     In a given table, an attribute Y is said to have a functional
+ *     dependency on a set of attributes X (written X -> Y) if and only
+ *     if each X value is associated with precisely one Y value. For
+ *     example, in an "Employee" table that includes the attributes
+ *     "Employee ID" and "Employee Date of Birth", the functional
+ *     dependency {Employee ID} -> {Employee Date of Birth} would hold.
+ *     It follows from the previous two sentences that each {Employee ID}
+ *     is associated with precisely one {Employee Date of Birth}.
+ *
+ * [1] http://en.wikipedia.org/wiki/Database_normalization
+ *
+ * Most datasets might be normalized not to contain any such functional
+ * dependencies, but sometimes it's not practical. In some cases it's
+ * actually a conscious choice to model the dataset in denormalized way,
+ * either because of performance or to make querying easier.
+ *
+ * The current implementation supports only dependencies between two
+ * columns, but this is merely a simplification of the initial patch.
+ * It's certainly useful to mine for dependencies involving multiple
+ * columns on the 'left' side, i.e. a condition for the dependency.
+ * That is dependencies [A,B] => C and so on.
+ *
+ * TODO The implementation may/should be smart enough not to mine both
+ *      [A => B] and [A,C => B], because the second dependency is a
+ *      consequence of the first one (if values of A determine values
+ *      of B, adding another column won't change that). The ANALYZE
+ *      should first analyze 1:1 dependencies, then 2:1 dependencies
+ *      (and skip the already identified ones), etc.
+ *
+ * For example the dependency [city name => zip code] is much weaker
+ * than [city name, state name => zip code], because there may be
+ * multiple cities with the same name in various states. It's not
+ * perfect though - there are probably cities with the same name within
+ * the same state, but hopefully that is a relatively rare occurrence.
+ * More about this in the section about dependency mining.
+ *
+ * Handling multiple columns on the right side is not necessary, as such
+ * dependencies may be decomposed into a set of dependencies with
+ * the same meaning, one for each column on the right side. For example
+ *
+ *     A => [B,C]
+ *
+ * is exactly the same as
+ *
+ *     (A => B) & (A => C).
+ *
+ * Of course, storing (A => [B, C]) may be more efficient than storing
+ * the two dependencies (A => B) and (A => C) separately.
+ *
+ *
+ * Dependency mining (ANALYZE)
+ * ---------------------------
+ *
+ * The current build algorithm is rather simple - for each pair [A,B] of
+ * columns, the data are sorted lexicographically (first by A, then B),
+ * and then a number of metrics is computed by walking the sorted data.
+ *
+ * In general the algorithm counts distinct values of A (forming groups
+ * thanks to the sorting), supporting or contradicting the hypothesis
+ * that A => B (i.e. that values of B are predetermined by A). If there
+ * are multiple values of B for a single value of A, it's counted as
+ * contradicting.
+ *
+ * A group may be neither supporting nor contradicting. To be counted as
+ * supporting, the group has to have at least min_group_size(=3) rows.
+ * Smaller 'supporting' groups are counted as neutral.
+ *
+ * Finally, the number of rows in supporting and contradicting groups is
+ * compared, and if there is at least 10x more supporting rows, the
+ * dependency is considered valid.
+ *
+ *
+ * Real-world datasets are imperfect - there may be errors (e.g. due to
+ * data-entry mistakes), or factually correct records, yet contradicting
+ * the dependency (e.g. when a city splits into two, but both keep the
+ * same ZIP code). A strict ANALYZE implementation (where the functional
+ * dependencies are identified) would ignore dependencies on such noisy
+ * data, making the approach unusable in practice.
+ *
+ * The proposed implementation attempts to handle such noisy cases
+ * gracefully, by tolerating small number of contradicting cases.
+ *
+ * In the future this might also perform some sort of test and decide
+ * whether it's worth building any other kind of multivariate stats,
+ * or whether the dependencies sufficiently describe the data. Or at
+ * least not build the MCV list / histogram on the implied columns.
+ * Such reduction would however make the 'verification' (see the next
+ * section) impossible.
+ *
+ *
+ * Clause reduction (planner/optimizer)
+ * ------------------------------------
+ *
+ * Applying the dependencies is quite simple - given a list of clauses,
+ * try to apply all the dependencies. For example given clause list
+ *
+ *    (a = 1) AND (b = 1) AND (c = 1) AND (d < 100)
+ *
+ * and dependencies [a=>b] and [a=>d], this may be reduced to
+ *
+ *    (a = 1) AND (c = 1) AND (d < 100)
+ *
+ * The (d<100) can't be reduced as it's not an equality clause, so the
+ * dependency [a=>d] can't be applied.
+ *
+ * See clauselist_apply_dependencies() for more details.
+ *
+ * The problem with the reduction is that the query may use conditions
+ * that are not redundant, but in fact contradictory - e.g. the user
+ * may search for a ZIP code and a city name not matching the ZIP code.
+ *
+ * In such cases, the condition on the city name is not actually
+ * redundant, but actually contradictory (making the result empty), and
+ * removing it while estimating the cardinality will make the estimate
+ * worse.
+ *
+ * The current estimation assuming independence (and multiplying the
+ * selectivities) works better in this case, but only by utter luck.
+ *
+ * In some cases this might be verified using the other multivariate
+ * statistics - MCV lists and histograms. For MCV lists the verification
+ * might be very simple - peek into the list if there are any items
+ * matching the clause on the 'A' column (e.g. ZIP code), and if such
+ * item is found, check that the 'B' column matches the other clause.
+ * If it does not, the clauses are contradictory. We can't really say
+ * if such item was not found, except maybe restricting the selectivity
+ * using the MCV data (e.g. using min/max selectivity, or something).
+ *
+ * With histograms, it might work similarly - we can't check the values
+ * directly (because histograms use buckets, unlike MCV lists, storing
+ * the actual values). So we can only observe the buckets matching the
+ * clauses - if those buckets have very low frequency, it probably means
+ * the two clauses are incompatible.
+ *
+ * It's unclear what 'low frequency' is, but if one of the clauses is
+ * implied (automatically true because of the other clause), then
+ *
+ *     selectivity[clause(A)] = selectivity[clause(A) & clause(B)]
+ *
+ * So we might compute selectivity of the first clause (on the column
+ * A in dependency [A=>B]) - for example using regular statistics.
+ * And then check if the selectivity computed from the histogram is
+ * about the same (or significantly lower).
+ *
+ * The problem is that histograms work well only when the data ordering
+ * matches the natural meaning. For values that serve as labels - like
+ * city names or ZIP codes, or even generated IDs, histograms really
+ * don't work all that well. For example sorting cities by name won't
+ * match the sorting of ZIP codes, rendering the histogram unusable.
+ *
+ * The MCV are probably going to work much better, because they don't
+ * really assume any sort of ordering. And it's probably more appropriate
+ * for the label-like data.
+ *
+ * TODO Support dependencies with multiple columns on left/right.
+ *
+ * TODO Investigate using histogram and MCV list to confirm the
+ *      functional dependencies.
+ *
+ * TODO Investigate statistical testing of the distribution (to decide
+ *      whether it makes sense to build the histogram/MCV list).
+ *
+ * TODO Using a min/max of selectivities would probably make more sense
+ *      for the associated columns.
+ *
+ * TODO Consider eliminating the implied columns from the histogram and
+ *      MCV lists (but maybe that's not a good idea, because that'd make
+ *      it impossible to use these stats for non-equality clauses and
+ *      also it wouldn't be possible to use the stats for verification
+ *      of the dependencies as proposed in another TODO).
+ *
+ * TODO This builds a complete set of dependencies, i.e. including
+ *      transitive dependencies - if we identify [A => B] and [B => C],
+ *      we're likely to identify [A => C] too. It might be better to
+ *      keep only the minimal set of dependencies, i.e. prune all the
+ *      dependencies that we can recreate by transitivity.
+ *
+ *      There are two conceptual ways to do that:
+ *
+ *      (a) generate all the rules, and then prune the rules that may
+ *          be recreated by combining other dependencies, or
+ *
+ *      (b) performing the 'is combination of other dependencies' check
+ *          before actually doing the work
+ *
+ *      The second option has the advantage that we don't really need
+ *      to perform the sort/count. It's not sufficient alone, though,
+ *      because we may discover the dependencies in the wrong order.
+ *      For example [A => B], [A => C] and then [B => C]. None of those
+ *      dependencies is a combination of the already known ones, yet
+ *      [A => C] is a combination of [A => B] and [B => C].
+ *
+ * FIXME Not sure the current NULL handling makes much sense. We assume
+ *       that NULL is 0, so it's handled like a regular value
+ *       (NULL == NULL), so all NULLs in a single column form a single
+ *       group. Maybe that's not the right thing to do, especially with
+ *       equality conditions - in that case NULLs are irrelevant. So
+ *       maybe the right solution would be to just ignore NULL values?
+ *
+ *       However simply "ignoring" the NULL values does not seem like
+ *       a good idea - imagine columns A and B, where for each value of
+ *       A, values in B are constant (same for the whole group) or NULL.
+ *       Let's say only 10% of B values in each group is not NULL. Then
+ *       ignoring the NULL values will result in 10x misestimate (and
+ *       it's trivial to construct arbitrary errors). So maybe handling
+ *       NULL values just like a regular value is the right thing here.
+ *
+ *       Or maybe NULL values should be treated differently on each side
+ *       of the dependency? E.g. as ignored on the left (condition) and
+ *       as regular values on the right - this seems consistent with how
+ *       equality clauses work, as equality clause means 'NOT NULL'.
+ *       So if we say [A => B] then it may also imply "NOT NULL" on the
+ *       right side.
+ */
+MVDependencies
+build_mv_dependencies(int numrows, HeapTuple *rows, int2vector *attrs,
+					  VacAttrStats **stats)
+{
+	int i;
+	int numattrs = attrs->dim1;
+
+	/* result */
+	int ndeps = 0;
+	MVDependencies	dependencies = NULL;
+	MultiSortSupport mss = multi_sort_init(2);	/* 2 dimensions for now */
+
+	/* TODO Maybe this should be somehow related to the number of
+	 *      distinct values in the two columns we're currently analyzing.
+	 *      Assuming a uniform distribution, we can compute the expected
+	 *      group size in the sample and use its average as a threshold,
+	 *      which seems better than a static approach.
+	 */
+	int min_group_size = 3;
+
+	/* dimension indexes we'll check for associations [a => b] */
+	int dima, dimb;
+
+	/*
+	 * We'll reuse the same array for all the 2-column combinations.
+	 *
+	 * It's possible to sort the sample rows directly, but this seemed
+	 * somehow simples / less error prone. Another option would be to
+	 * allocate the arrays for each SortItem separately, but that'd be
+	 * significant overhead (not just CPU, but especially memory bloat).
+	 */
+	SortItem * items = (SortItem*)palloc0(numrows * sizeof(SortItem));
+
+	Datum *values = (Datum*)palloc0(sizeof(Datum) * numrows * 2);
+	bool  *isnull = (bool*)palloc0(sizeof(bool) * numrows * 2);
+
+	for (i = 0; i < numrows; i++)
+	{
+		items[i].values = &values[i * 2];
+		items[i].isnull = &isnull[i * 2];
+	}
+
+	Assert(numattrs >= 2);
+
+	/*
+	 * Evaluate all possible combinations of [A => B], using a simple algorithm:
+	 *
+	 * (a) sort the data by [A,B]
+	 * (b) split the data into groups by A (new group whenever a value changes)
+	 * (c) count different values in the B column (again, value changes)
+	 *
+	 * TODO It should be rather simple to merge [A => B] and [A => C] into
+	 *      [A => B,C]. Just keep A constant, collect all the "implied" columns
+	 *      and you're done.
+	 */
+	for (dima = 0; dima < numattrs; dima++)
+	{
+		/* prepare the sort function for the first dimension */
+		multi_sort_add_dimension(mss, 0, dima, stats);
+
+		for (dimb = 0; dimb < numattrs; dimb++)
+		{
+			SortItem current;
+
+			/* number of groups supporting / contradicting the dependency */
+			int n_supporting = 0;
+			int n_contradicting = 0;
+
+			/* counters valid within a group */
+			int group_size = 0;
+			int n_violations = 0;
+
+			int n_supporting_rows = 0;
+			int n_contradicting_rows = 0;
+
+			/* make sure the columns are different (A => A) */
+			if (dima == dimb)
+				continue;
+
+			/* prepare the sort function for the second dimension */
+			multi_sort_add_dimension(mss, 1, dimb, stats);
+
+			/* reset the values and isnull flags */
+			memset(values, 0, sizeof(Datum) * numrows * 2);
+			memset(isnull, 0, sizeof(bool)  * numrows * 2);
+
+			/* accumulate all the data for both columns into an array and sort it */
+			for (i = 0; i < numrows; i++)
+			{
+				items[i].values[0]
+					= heap_getattr(rows[i], attrs->values[dima],
+									stats[dima]->tupDesc, &items[i].isnull[0]);
+
+				items[i].values[1]
+					= heap_getattr(rows[i], attrs->values[dimb],
+									stats[dimb]->tupDesc, &items[i].isnull[1]);
+			}
+
+			qsort_arg((void *) items, numrows, sizeof(SortItem),
+					  multi_sort_compare, mss);
+
+			/*
+			 * Walk through the array, split it into groups according to
+			 * the A value, and count distinct values in the other one.
+			 * If there's a single B value for the whole group, we count
+			 * it as supporting the association, otherwise we count it
+			 * as contradicting.
+			 *
+			 * Furthermore we require a group to have at least a certain
+			 * number of rows to be considered useful for supporting the
+			 * dependency. But a contradicting group is always counted.
+			 */
+
+			/* start with values from the first row */
+			current = items[0];
+			group_size  = 1;
+
+			for (i = 1; i < numrows; i++)
+			{
+				/* end of the group */
+				if (multi_sort_compare_dim(0, &items[i], &current, mss) != 0)
+				{
+					/*
+					 * If there are no contradicting rows, count it as
+					 * supporting (otherwise contradicting), but only if
+					 * the group is large enough.
+					 *
+					 * The requirement of a minimum group size makes it
+					 * impossible to identify [unique,unique] cases, but
+					 * that's probably a different case. This is more
+					 * about [zip => city] associations etc.
+					 *
+					 * If there are violations, count the group/rows as
+					 * a violation.
+					 *
+					 * It may be neither, if the group is too small (does
+					 * not contain at least min_group_size rows).
+					 */
+					if ((n_violations == 0) && (group_size >= min_group_size))
+					{
+						n_supporting +=  1;
+						n_supporting_rows += group_size;
+					}
+					else if (n_violations > 0)
+					{
+						n_contradicting +=  1;
+						n_contradicting_rows += group_size;
+					}
+
+					/* current values start a new group */
+					n_violations = 0;
+					group_size = 0;
+				}
+				/* mismatch of a B value is contradicting */
+				else if (multi_sort_compare_dim(1, &items[i], &current, mss) != 0)
+				{
+					n_violations += 1;
+				}
+
+				current = items[i];
+				group_size += 1;
+			}
+
+			/* handle the last group (just like above) */
+			if ((n_violations == 0) && (group_size >= min_group_size))
+			{
+				n_supporting += 1;
+				n_supporting_rows += group_size;
+			}
+			else if (n_violations)
+			{
+				n_contradicting += 1;
+				n_contradicting_rows += group_size;
+			}
+
+			/*
+			 * See if the number of rows supporting the association is at least
+			 * 10x the number of rows violating the hypothetical dependency.
+			 *
+			 * TODO This is rather arbitrary limit - I guess it's possible to do
+			 *      some math to come up with a better rule (e.g. testing a hypothesis
+			 *      'this is due to randomness'). We can create a contingency table
+			 *      from the values and use it for testing. Possibly only when
+			 *      there are no contradicting rows?
+			 *
+			 * TODO Also, if (a => b) and (b => a) at the same time, it pretty much
+			 *      means there's a 1:1 relation (or one is a 'label'), making the
+			 *      conditions rather redundant. Although it's possible that the
+			 *      query uses incompatible combination of values.
+			 */
+			if (n_supporting_rows > (n_contradicting_rows * 10))
+			{
+				if (dependencies == NULL)
+				{
+					dependencies = (MVDependencies)palloc0(sizeof(MVDependenciesData));
+					dependencies->magic = MVSTAT_DEPS_MAGIC;
+				}
+				else
+					dependencies = repalloc(dependencies, offsetof(MVDependenciesData, deps)
+											+ sizeof(MVDependency) * (dependencies->ndeps + 1));
+
+				/* append the new dependency [dima => dimb] to the list */
+				dependencies->deps[ndeps] = (MVDependency)palloc0(sizeof(MVDependencyData));
+				dependencies->deps[ndeps]->a = attrs->values[dima];
+				dependencies->deps[ndeps]->b = attrs->values[dimb];
+
+				dependencies->ndeps = (++ndeps);
+			}
+		}
+	}
+
+	pfree(items);
+	pfree(values);
+	pfree(isnull);
+	pfree(stats);
+	pfree(mss);
+
+	return dependencies;
+}
+
+/*
+ * Store the dependencies into a bytea, so that it can be stored in the
+ * pg_mv_statistic catalog.
+ *
+ * Currently this only supports simple two-column rules, and stores them
+ * as a sequence of attnum pairs. In the future, this needs to be made
+ * more complex to support multiple columns on both sides of the
+ * implication (using AND on left, OR on right).
+ */
+bytea *
+serialize_mv_dependencies(MVDependencies dependencies)
+{
+	int i;
+
+	/* we need to store ndeps, and each needs 2 * int16 */
+	Size len = VARHDRSZ + offsetof(MVDependenciesData, deps)
+				+ dependencies->ndeps * (sizeof(int16) * 2);
+
+	bytea * output = (bytea*)palloc0(len);
+
+	char * tmp = VARDATA(output);
+
+	SET_VARSIZE(output, len);
+
+	/* first, store the number of dimensions / items */
+	memcpy(tmp, dependencies, offsetof(MVDependenciesData, deps));
+	tmp += offsetof(MVDependenciesData, deps);
+
+	/* walk through the dependencies and copy both columns into the bytea */
+	for (i = 0; i < dependencies->ndeps; i++)
+	{
+		memcpy(tmp, &(dependencies->deps[i]->a), sizeof(int16));
+		tmp += sizeof(int16);
+
+		memcpy(tmp, &(dependencies->deps[i]->b), sizeof(int16));
+		tmp += sizeof(int16);
+	}
+
+	return output;
+}
+
+/*
+ * Reads serialized dependencies into MVDependencies structure.
+ */
+MVDependencies
+deserialize_mv_dependencies(bytea * data)
+{
+	int		i;
+	Size	expected_size;
+	MVDependencies	dependencies;
+	char   *tmp;
+
+	if (data == NULL)
+		return NULL;
+
+	if (VARSIZE_ANY_EXHDR(data) < offsetof(MVDependenciesData,deps))
+		elog(ERROR, "invalid MVDependencies size %ld (expected at least %ld)",
+			 VARSIZE_ANY_EXHDR(data), offsetof(MVDependenciesData,deps));
+
+	/* read the MVDependencies header */
+	dependencies = (MVDependencies)palloc0(sizeof(MVDependenciesData));
+
+	/* initialize pointer to the data part (skip the varlena header) */
+	tmp = VARDATA(data);
+
+	/* get the header and perform basic sanity checks */
+	memcpy(dependencies, tmp, offsetof(MVDependenciesData, deps));
+	tmp += offsetof(MVDependenciesData, deps);
+
+	if (dependencies->magic != MVSTAT_DEPS_MAGIC)
+	{
+		pfree(dependencies);
+		elog(WARNING, "not a MV Dependencies (magic number mismatch)");
+		return NULL;
+	}
+
+	Assert(dependencies->ndeps > 0);
+
+	/* what bytea size do we expect for those parameters */
+	expected_size = offsetof(MVDependenciesData,deps) +
+					dependencies->ndeps * sizeof(int16) * 2;
+
+	if (VARSIZE_ANY_EXHDR(data) != expected_size)
+		elog(ERROR, "invalid dependencies size %ld (expected %ld)",
+			 VARSIZE_ANY_EXHDR(data), expected_size);
+
+	/* allocate space for the MCV items */
+	dependencies = repalloc(dependencies, offsetof(MVDependenciesData,deps)
+							+ (dependencies->ndeps * sizeof(MVDependency)));
+
+	for (i = 0; i < dependencies->ndeps; i++)
+	{
+		dependencies->deps[i] = (MVDependency)palloc0(sizeof(MVDependencyData));
+
+		memcpy(&(dependencies->deps[i]->a), tmp, sizeof(int16));
+		tmp += sizeof(int16);
+
+		memcpy(&(dependencies->deps[i]->b), tmp, sizeof(int16));
+		tmp += sizeof(int16);
+	}
+
+	return dependencies;
+}
+
+/* print some basic info about dependencies (number of dependencies) */
+Datum
+pg_mv_stats_dependencies_info(PG_FUNCTION_ARGS)
+{
+	bytea	   *data = PG_GETARG_BYTEA_P(0);
+	char	   *result;
+
+	MVDependencies dependencies = deserialize_mv_dependencies(data);
+
+	if (dependencies == NULL)
+		PG_RETURN_NULL();
+
+	result = palloc0(128);
+	snprintf(result, 128, "dependencies=%d", dependencies->ndeps);
+
+	/* FIXME free the deserialized data (pfree is not enough) */
+
+	PG_RETURN_TEXT_P(cstring_to_text(result));
+}
+
+/* print the dependencies
+ *
+ * TODO  Would be nice if this knew the actual column names (instead of
+ *       the attnums).
+ *
+ * FIXME This is really ugly and does not really check the lengths and
+ *       strcpy/snprintf return values properly. Needs to be fixed.
+ */
+Datum
+pg_mv_stats_dependencies_show(PG_FUNCTION_ARGS)
+{
+	int			i = 0;
+	bytea	   *data = PG_GETARG_BYTEA_P(0);
+	char	   *result = NULL;
+	int			len = 0;
+
+	MVDependencies dependencies = deserialize_mv_dependencies(data);
+
+	if (dependencies == NULL)
+		PG_RETURN_NULL();
+
+	for (i = 0; i < dependencies->ndeps; i++)
+	{
+		MVDependency dependency = dependencies->deps[i];
+		char	buffer[128];
+
+		int		tmp = snprintf(buffer, 128, "%s%d => %d",
+				((i == 0) ? "" : ", "), dependency->a, dependency->b);
+
+		if (tmp < 127)
+		{
+			if (result == NULL)
+				result = palloc0(len + tmp + 1);
+			else
+				result = repalloc(result, len + tmp + 1);
+
+			strcpy(result + len, buffer);
+			len += tmp;
+		}
+	}
+
+	PG_RETURN_TEXT_P(cstring_to_text(result));
+}
+
+bytea *
+fetch_mv_dependencies(Oid mvoid)
+{
+	Relation	indrel;
+	SysScanDesc indscan;
+	ScanKeyData skey;
+	HeapTuple	htup;
+	bytea	   *stadeps = NULL;
+
+	/* Prepare to scan pg_mv_statistic for entries having indrelid = this rel. */
+	ScanKeyInit(&skey,
+				ObjectIdAttributeNumber,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(mvoid));
+
+	indrel = heap_open(MvStatisticRelationId, AccessShareLock);
+	indscan = systable_beginscan(indrel, MvStatisticOidIndexId, true,
+								 NULL, 1, &skey);
+
+	while (HeapTupleIsValid(htup = systable_getnext(indscan)))
+	{
+		bool isnull = false;
+		Datum deps = SysCacheGetAttr(MVSTATOID, htup,
+								   Anum_pg_mv_statistic_stadeps, &isnull);
+
+		Assert(!isnull);
+
+		stadeps = DatumGetByteaP(deps);
+
+		break;
+	}
+
+	systable_endscan(indscan);
+
+	heap_close(indrel, AccessShareLock);
+
+	/* TODO maybe save the list into relcache, as in RelationGetIndexList
+	 *      (which was used as an inspiration of this one)?. */
+
+	return stadeps;
+}
diff --git a/src/include/catalog/indexing.h b/src/include/catalog/indexing.h
index a680229..22bb781 100644
--- a/src/include/catalog/indexing.h
+++ b/src/include/catalog/indexing.h
@@ -173,6 +173,11 @@ DECLARE_UNIQUE_INDEX(pg_largeobject_loid_pn_index, 2683, on pg_largeobject using
 DECLARE_UNIQUE_INDEX(pg_largeobject_metadata_oid_index, 2996, on pg_largeobject_metadata using btree(oid oid_ops));
 #define LargeObjectMetadataOidIndexId	2996
 
+DECLARE_UNIQUE_INDEX(pg_mv_statistic_oid_index, 3380, on pg_mv_statistic using btree(oid oid_ops));
+#define MvStatisticOidIndexId  3380
+DECLARE_INDEX(pg_mv_statistic_relid_index, 3379, on pg_mv_statistic using btree(starelid oid_ops));
+#define MvStatisticRelidIndexId	3379
+
 DECLARE_UNIQUE_INDEX(pg_namespace_nspname_index, 2684, on pg_namespace using btree(nspname name_ops));
 #define NamespaceNameIndexId  2684
 DECLARE_UNIQUE_INDEX(pg_namespace_oid_index, 2685, on pg_namespace using btree(oid oid_ops));
diff --git a/src/include/catalog/pg_mv_statistic.h b/src/include/catalog/pg_mv_statistic.h
new file mode 100644
index 0000000..81ec23b
--- /dev/null
+++ b/src/include/catalog/pg_mv_statistic.h
@@ -0,0 +1,69 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_mv_statistic.h
+ *	  definition of the system "multivariate statistic" relation (pg_mv_statistic)
+ *	  along with the relation's initial contents.
+ *
+ *
+ * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/catalog/pg_mv_statistic.h
+ *
+ * NOTES
+ *	  the genbki.pl script reads this file and generates .bki
+ *	  information from the DATA() statements.
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_MV_STATISTIC_H
+#define PG_MV_STATISTIC_H
+
+#include "catalog/genbki.h"
+
+/* ----------------
+ *		pg_mv_statistic definition.  cpp turns this into
+ *		typedef struct FormData_pg_mv_statistic
+ * ----------------
+ */
+#define MvStatisticRelationId  3381
+
+CATALOG(pg_mv_statistic,3381)
+{
+	/* These fields form the unique key for the entry: */
+	Oid			starelid;		/* relation containing attributes */
+
+	/* statistics requested to build */
+	bool		deps_enabled;		/* analyze dependencies? */
+
+	/* statistics that are available (if requested) */
+	bool		deps_built;			/* dependencies were built */
+
+	/* variable-length fields start here, but we allow direct access to stakeys */
+	int2vector	stakeys;			/* array of column keys */
+
+#ifdef CATALOG_VARLEN
+	bytea		stadeps;			/* dependencies (serialized) */
+#endif
+
+} FormData_pg_mv_statistic;
+
+/* ----------------
+ *		Form_pg_mv_statistic corresponds to a pointer to a tuple with
+ *		the format of pg_mv_statistic relation.
+ * ----------------
+ */
+typedef FormData_pg_mv_statistic *Form_pg_mv_statistic;
+
+/* ----------------
+ *		compiler constants for pg_attrdef
+ * ----------------
+ */
+#define Natts_pg_mv_statistic					5
+#define Anum_pg_mv_statistic_starelid			1
+#define Anum_pg_mv_statistic_deps_enabled		2
+#define Anum_pg_mv_statistic_deps_built			3
+#define Anum_pg_mv_statistic_stakeys			4
+#define Anum_pg_mv_statistic_stadeps			5
+
+#endif   /* PG_MV_STATISTIC_H */
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 8890ade..f728d88 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -2712,6 +2712,11 @@ DESCR("current user privilege on any column by rel name");
 DATA(insert OID = 3029 (  has_any_column_privilege	   PGNSP PGUID 12 10 0 0 0 f f f f t f s 2 0 16 "26 25" _null_ _null_ _null_ _null_ has_any_column_privilege_id _null_ _null_ _null_ ));
 DESCR("current user privilege on any column by rel oid");
 
+DATA(insert OID = 3284 (  pg_mv_stats_dependencies_info     PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 25 "17" _null_ _null_ _null_ _null_ pg_mv_stats_dependencies_info _null_ _null_ _null_ ));
+DESCR("multivariate stats: functional dependencies info");
+DATA(insert OID = 3285 (  pg_mv_stats_dependencies_show     PGNSP PGUID 12 1 0 0 0 f f f f t f i 1 0 25 "17" _null_ _null_ _null_ _null_ pg_mv_stats_dependencies_show _null_ _null_ _null_ ));
+DESCR("multivariate stats: functional dependencies show");
+
 DATA(insert OID = 1928 (  pg_stat_get_numscans			PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_numscans _null_ _null_ _null_ ));
 DESCR("statistics: number of scans done for table/index");
 DATA(insert OID = 1929 (  pg_stat_get_tuples_returned	PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_tuples_returned _null_ _null_ _null_ ));
diff --git a/src/include/catalog/toasting.h b/src/include/catalog/toasting.h
index fb2f035..724a169 100644
--- a/src/include/catalog/toasting.h
+++ b/src/include/catalog/toasting.h
@@ -49,6 +49,7 @@ extern void BootstrapToastTable(char *relName,
 DECLARE_TOAST(pg_attrdef, 2830, 2831);
 DECLARE_TOAST(pg_constraint, 2832, 2833);
 DECLARE_TOAST(pg_description, 2834, 2835);
+DECLARE_TOAST(pg_mv_statistic, 3288, 3289);
 DECLARE_TOAST(pg_proc, 2836, 2837);
 DECLARE_TOAST(pg_rewrite, 2838, 2839);
 DECLARE_TOAST(pg_seclabel, 3598, 3599);
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 38469ef..3a0e7c4 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -414,6 +414,7 @@ typedef enum NodeTag
 	T_WithClause,
 	T_CommonTableExpr,
 	T_RoleSpec,
+	T_StatisticsDef,
 
 	/*
 	 * TAGS FOR REPLICATION GRAMMAR PARSE NODES (replnodes.h)
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 2893cef..81ca159 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -570,6 +570,14 @@ typedef struct ColumnDef
 	int			location;		/* parse location, or -1 if none/unknown */
 } ColumnDef;
 
+typedef struct StatisticsDef
+{
+	NodeTag		type;
+	List	   *keys;			/* String nodes naming referenced column(s) */
+	List	   *options;		/* list of DefElem nodes */
+} StatisticsDef;
+
+
 /*
  * TableLikeClause - CREATE TABLE ( ... LIKE ... ) clause
  */
@@ -1362,7 +1370,8 @@ typedef enum AlterTableType
 	AT_ReplicaIdentity,			/* REPLICA IDENTITY */
 	AT_EnableRowSecurity,		/* ENABLE ROW SECURITY */
 	AT_DisableRowSecurity,		/* DISABLE ROW SECURITY */
-	AT_GenericOptions			/* OPTIONS (...) */
+	AT_GenericOptions,			/* OPTIONS (...) */
+	AT_AddStatistics			/* add statistics */
 } AlterTableType;
 
 typedef struct ReplicaIdentityStmt
diff --git a/src/include/utils/mvstats.h b/src/include/utils/mvstats.h
new file mode 100644
index 0000000..5c8643d
--- /dev/null
+++ b/src/include/utils/mvstats.h
@@ -0,0 +1,86 @@
+/*-------------------------------------------------------------------------
+ *
+ * mvstats.h
+ *	  Multivariate statistics and selectivity estimation functions.
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/mvstats.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef MVSTATS_H
+#define MVSTATS_H
+
+#include "commands/vacuum.h"
+
+/*
+ * Basic info about the stats, used when choosing what to use
+ *
+ * TODO Add info about which statistics are available (histogram, MCV,
+ *      hashed MCV, functional dependencies).
+ */
+typedef struct MVStatsData {
+	Oid			mvoid;		/* OID of the stats in pg_mv_statistic */
+	int2vector *stakeys;	/* attnums for columns in the stats */
+	bool		deps_built;	/* functional dependencies available */
+} MVStatsData;
+
+typedef struct MVStatsData *MVStats;
+
+
+#define MVSTATS_MAX_DIMENSIONS	8		/* max number of attributes */
+
+/* A functional dependency, tracking the [a => b] relationship between
+ * a pair of columns.
+ *
+ * TODO Make this work with multiple columns on both sides.
+ */
+typedef struct MVDependencyData {
+	int16	a;
+	int16	b;
+} MVDependencyData;
+
+typedef MVDependencyData* MVDependency;
+
+typedef struct MVDependenciesData {
+	uint32			magic;		/* magic constant marker */
+	int32			ndeps;		/* number of dependencies */
+	MVDependency	deps[1];	/* variable length, struct allocated with extra space */
+} MVDependenciesData;
+
+typedef MVDependenciesData* MVDependencies;
+
+#define MVSTAT_DEPS_MAGIC		0xB4549A2C	/* marks serialized bytea */
+#define MVSTAT_DEPS_TYPE_BASIC	1			/* basic dependencies type */
+
+/*
+ * TODO Maybe fetching the histogram/MCV list separately is inefficient?
+ *      Consider adding a single `fetch_stats` method, fetching all
+ *      stats specified using flags (or something like that).
+ */
+MVStats list_mv_stats(Oid relid, int *nstats, bool built_only);
+
+bytea * fetch_mv_dependencies(Oid mvoid);
+
+bytea * serialize_mv_dependencies(MVDependencies dependencies);
+
+/* deserialization of stats (serialization is private to analyze) */
+MVDependencies	deserialize_mv_dependencies(bytea * data);
+
+/* FIXME this probably belongs somewhere else (not to operations stats) */
+extern Datum pg_mv_stats_dependencies_info(PG_FUNCTION_ARGS);
+extern Datum pg_mv_stats_dependencies_show(PG_FUNCTION_ARGS);
+
+MVDependencies
+build_mv_dependencies(int numrows, HeapTuple *rows,
+								  int2vector *attrs,
+								  VacAttrStats **stats);
+
+void build_mv_stats(Relation onerel, int numrows, HeapTuple *rows,
+						   int natts, VacAttrStats **vacattrstats);
+
+void update_mv_stats(Oid relid, MVDependencies dependencies);
+
+#endif   /* MVSTATS_H */
diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h
index ba0b090..12147ab 100644
--- a/src/include/utils/syscache.h
+++ b/src/include/utils/syscache.h
@@ -66,6 +66,7 @@ enum SysCacheIdentifier
 	INDEXRELID,
 	LANGNAME,
 	LANGOID,
+	MVSTATOID,
 	NAMESPACENAME,
 	NAMESPACEOID,
 	OPERNAMENSP,
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 1788270..f0117ca 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1353,6 +1353,14 @@ pg_matviews| SELECT n.nspname AS schemaname,
      LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace)))
      LEFT JOIN pg_tablespace t ON ((t.oid = c.reltablespace)))
   WHERE (c.relkind = 'm'::"char");
+pg_mv_stats| SELECT n.nspname AS schemaname,
+    c.relname AS tablename,
+    s.stakeys AS attnums,
+    length(s.stadeps) AS depsbytes,
+    pg_mv_stats_dependencies_info(s.stadeps) AS depsinfo
+   FROM ((pg_mv_statistic s
+     JOIN pg_class c ON ((c.oid = s.starelid)))
+     LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace)));
 pg_policies| SELECT n.nspname AS schemaname,
     c.relname AS tablename,
     pol.polname AS policyname,
diff --git a/src/test/regress/expected/sanity_check.out b/src/test/regress/expected/sanity_check.out
index c7be273..00f5fe7 100644
--- a/src/test/regress/expected/sanity_check.out
+++ b/src/test/regress/expected/sanity_check.out
@@ -113,6 +113,7 @@ pg_inherits|t
 pg_language|t
 pg_largeobject|t
 pg_largeobject_metadata|t
+pg_mv_statistic|t
 pg_namespace|t
 pg_opclass|t
 pg_operator|t
-- 
2.0.5

