>From 55211180c650c22924a4b0a261e7d36ec83c0d8c Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@pgaddict.com>
Date: Wed, 23 Dec 2015 02:07:58 +0100
Subject: [PATCH 7/7] initial version of ndistinct conefficient statistics

---
 src/backend/commands/statscmds.c       |  11 ++-
 src/backend/optimizer/path/clausesel.c |   7 ++
 src/backend/optimizer/util/plancat.c   |   4 +-
 src/backend/utils/mvstats/Makefile     |   2 +-
 src/backend/utils/mvstats/common.c     |  20 ++++-
 src/backend/utils/mvstats/mvdist.c     | 147 +++++++++++++++++++++++++++++++++
 src/include/catalog/pg_mv_statistic.h  |  26 +++---
 src/include/nodes/relation.h           |   2 +
 src/include/utils/mvstats.h            |   6 ++
 9 files changed, 208 insertions(+), 17 deletions(-)
 create mode 100644 src/backend/utils/mvstats/mvdist.c

diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c
index 68e1685..de140b4 100644
--- a/src/backend/commands/statscmds.c
+++ b/src/backend/commands/statscmds.c
@@ -136,7 +136,8 @@ CreateStatistics(CreateStatsStmt *stmt)
 	/* by default build nothing */
 	bool 	build_dependencies = false,
 			build_mcv = false,
-			build_histogram = false;
+			build_histogram = false,
+			build_ndistinct = false;
 
 	int32 	max_buckets = -1,
 			max_mcv_items = -1;
@@ -200,6 +201,8 @@ CreateStatistics(CreateStatsStmt *stmt)
 
 		if (strcmp(opt->defname, "dependencies") == 0)
 			build_dependencies = defGetBoolean(opt);
+		else if (strcmp(opt->defname, "ndistinct") == 0)
+			build_ndistinct = defGetBoolean(opt);
 		else if (strcmp(opt->defname, "mcv") == 0)
 			build_mcv = defGetBoolean(opt);
 		else if (strcmp(opt->defname, "max_mcv_items") == 0)
@@ -254,10 +257,10 @@ CreateStatistics(CreateStatsStmt *stmt)
 	}
 
 	/* check that at least some statistics were requested */
-	if (! (build_dependencies || build_mcv || build_histogram))
+	if (! (build_dependencies || build_mcv || build_histogram || build_ndistinct))
 		ereport(ERROR,
 				(errcode(ERRCODE_SYNTAX_ERROR),
-				 errmsg("no statistics type (dependencies, mcv, histogram) was requested")));
+				 errmsg("no statistics type (dependencies, mcv, histogram, ndistinct) was requested")));
 
 	/* now do some checking of the options */
 	if (require_mcv && (! build_mcv))
@@ -291,6 +294,7 @@ CreateStatistics(CreateStatsStmt *stmt)
 	values[Anum_pg_mv_statistic_deps_enabled -1] = BoolGetDatum(build_dependencies);
 	values[Anum_pg_mv_statistic_mcv_enabled  -1] = BoolGetDatum(build_mcv);
 	values[Anum_pg_mv_statistic_hist_enabled -1] = BoolGetDatum(build_histogram);
+	values[Anum_pg_mv_statistic_ndist_enabled-1] = BoolGetDatum(build_ndistinct);
 
 	values[Anum_pg_mv_statistic_mcv_max_items    -1] = Int32GetDatum(max_mcv_items);
 	values[Anum_pg_mv_statistic_hist_max_buckets -1] = Int32GetDatum(max_buckets);
@@ -298,6 +302,7 @@ CreateStatistics(CreateStatsStmt *stmt)
 	nulls[Anum_pg_mv_statistic_stadeps  -1] = true;
 	nulls[Anum_pg_mv_statistic_stamcv   -1] = true;
 	nulls[Anum_pg_mv_statistic_stahist  -1] = true;
+	nulls[Anum_pg_mv_statistic_standist -1] = true;
 
 	/* insert the tuple into pg_mv_statistic */
 	mvstatrel = heap_open(MvStatisticRelationId, RowExclusiveLock);
diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c
index 8d15d3c8..c717f96 100644
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -59,6 +59,7 @@ static void addRangeClause(RangeQueryClause **rqlist, Node *clause,
 #define		MV_CLAUSE_TYPE_FDEP		0x01
 #define		MV_CLAUSE_TYPE_MCV		0x02
 #define		MV_CLAUSE_TYPE_HIST		0x04
+#define		MV_CLAUSE_TYPE_NDIST	0x08
 
 static bool clause_is_mv_compatible(PlannerInfo *root, Node *clause, Oid varRelid,
 							 Index *relid, Bitmapset **attnums, SpecialJoinInfo *sjinfo,
@@ -377,6 +378,9 @@ clauselist_selectivity(PlannerInfo *root,
 													stats, sjinfo);
 	}
 
+	if (has_stats(stats, MV_CLAUSE_TYPE_NDIST))
+		elog(WARNING, "has ndistinct coefficient stats");
+
 	/*
 	 * Check that there are statistics with MCV list or histogram.
 	 * If not, we don't need to waste time with the optimization.
@@ -2931,6 +2935,9 @@ has_stats(List *stats, int type)
 
 		if ((type & MV_CLAUSE_TYPE_HIST) && stat->hist_built)
 			return true;
+
+		if ((type & MV_CLAUSE_TYPE_NDIST) && stat->ndist_built)
+			return true;
 	}
 
 	return false;
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 9aded52..f4edfe6 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -410,7 +410,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			mvstat = (Form_pg_mv_statistic) GETSTRUCT(htup);
 
 			/* unavailable stats are not interesting for the planner */
-			if (mvstat->deps_built || mvstat->mcv_built || mvstat->hist_built)
+			if (mvstat->deps_built || mvstat->mcv_built || mvstat->hist_built || mvstat->ndist_built)
 			{
 				info = makeNode(MVStatisticInfo);
 
@@ -421,11 +421,13 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				info->deps_enabled = mvstat->deps_enabled;
 				info->mcv_enabled  = mvstat->mcv_enabled;
 				info->hist_enabled = mvstat->hist_enabled;
+				info->ndist_enabled = mvstat->ndist_enabled;
 
 				/* built/available statistics */
 				info->deps_built = mvstat->deps_built;
 				info->mcv_built  = mvstat->mcv_built;
 				info->hist_built = mvstat->hist_built;
+				info->ndist_built = mvstat->ndist_built;
 
 				/* stakeys */
 				adatum = SysCacheGetAttr(MVSTATOID, htup,
diff --git a/src/backend/utils/mvstats/Makefile b/src/backend/utils/mvstats/Makefile
index 9dbb3b6..d4b88e9 100644
--- a/src/backend/utils/mvstats/Makefile
+++ b/src/backend/utils/mvstats/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/utils/mvstats
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = common.o dependencies.o histogram.o mcv.o
+OBJS = common.o dependencies.o histogram.o mcv.o mvdist.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/mvstats/common.c b/src/backend/utils/mvstats/common.c
index ffb76f4..c42ca8f 100644
--- a/src/backend/utils/mvstats/common.c
+++ b/src/backend/utils/mvstats/common.c
@@ -53,6 +53,7 @@ build_mv_stats(Relation onerel, int numrows, HeapTuple *rows,
 		MVDependencies	deps  = NULL;
 		MCVList		mcvlist   = NULL;
 		MVHistogram	histogram = NULL;
+		double		ndist	  = -1;
 		int numrows_filtered  = numrows;
 
 		VacAttrStats  **stats  = NULL;
@@ -92,6 +93,9 @@ build_mv_stats(Relation onerel, int numrows, HeapTuple *rows,
 		if (stat->deps_enabled)
 			deps = build_mv_dependencies(numrows, rows, attrs, stats);
 
+		if (stat->ndist_enabled)
+			ndist = build_mv_ndistinct(numrows, rows, attrs, stats);
+
 		/* build the MCV list */
 		if (stat->mcv_enabled)
 			mcvlist = build_mv_mcvlist(numrows, rows, attrs, stats, &numrows_filtered);
@@ -101,7 +105,7 @@ build_mv_stats(Relation onerel, int numrows, HeapTuple *rows,
 			histogram = build_mv_histogram(numrows_filtered, rows, attrs, stats, numrows);
 
 		/* store the histogram / MCV list in the catalog */
-		update_mv_stats(stat->mvoid, deps, mcvlist, histogram, attrs, stats);
+		update_mv_stats(stat->mvoid, deps, mcvlist, histogram, ndist, attrs, stats);
 	}
 }
 
@@ -183,6 +187,8 @@ list_mv_stats(Oid relid)
 		info->mcv_built = stats->mcv_built;
 		info->hist_enabled = stats->hist_enabled;
 		info->hist_built = stats->hist_built;
+		info->ndist_enabled = stats->ndist_enabled;
+		info->ndist_built = stats->ndist_built;
 
 		result = lappend(result, info);
 	}
@@ -252,7 +258,7 @@ find_mv_attnums(Oid mvoid, Oid *relid)
 void
 update_mv_stats(Oid mvoid,
 				MVDependencies dependencies, MCVList mcvlist, MVHistogram histogram,
-				int2vector *attrs, VacAttrStats **stats)
+				double ndistcoeff, int2vector *attrs, VacAttrStats **stats)
 {
 	HeapTuple	stup,
 				oldtup;
@@ -292,26 +298,36 @@ update_mv_stats(Oid mvoid,
 			= PointerGetDatum(data);
 	}
 
+	if (ndistcoeff > 1.0)
+	{
+		nulls[Anum_pg_mv_statistic_standist -1] = false;
+		values[Anum_pg_mv_statistic_standist-1] = Float8GetDatum(ndistcoeff);
+	}
+
 	/* always replace the value (either by bytea or NULL) */
 	replaces[Anum_pg_mv_statistic_stadeps -1] = true;
 	replaces[Anum_pg_mv_statistic_stamcv -1] = true;
 	replaces[Anum_pg_mv_statistic_stahist-1] = true;
+	replaces[Anum_pg_mv_statistic_standist-1] = true;
 
 	/* always change the availability flags */
 	nulls[Anum_pg_mv_statistic_deps_built -1] = false;
 	nulls[Anum_pg_mv_statistic_mcv_built -1] = false;
 	nulls[Anum_pg_mv_statistic_hist_built-1] = false;
+	nulls[Anum_pg_mv_statistic_ndist_built-1] = false;
 	nulls[Anum_pg_mv_statistic_stakeys-1]     = false;
 
 	/* use the new attnums, in case we removed some dropped ones */
 	replaces[Anum_pg_mv_statistic_deps_built-1] = true;
 	replaces[Anum_pg_mv_statistic_mcv_built  -1] = true;
+	replaces[Anum_pg_mv_statistic_ndist_built-1] = true;
 	replaces[Anum_pg_mv_statistic_hist_built -1] = true;
 	replaces[Anum_pg_mv_statistic_stakeys -1]    = true;
 
 	values[Anum_pg_mv_statistic_deps_built-1] = BoolGetDatum(dependencies != NULL);
 	values[Anum_pg_mv_statistic_mcv_built  -1] = BoolGetDatum(mcvlist != NULL);
 	values[Anum_pg_mv_statistic_hist_built -1] = BoolGetDatum(histogram != NULL);
+	values[Anum_pg_mv_statistic_ndist_built-1] = BoolGetDatum(ndistcoeff > 1.0);
 	values[Anum_pg_mv_statistic_stakeys -1]    = PointerGetDatum(attrs);
 
 	/* Is there already a pg_mv_statistic tuple for this attribute? */
diff --git a/src/backend/utils/mvstats/mvdist.c b/src/backend/utils/mvstats/mvdist.c
new file mode 100644
index 0000000..6df7411
--- /dev/null
+++ b/src/backend/utils/mvstats/mvdist.c
@@ -0,0 +1,147 @@
+/*-------------------------------------------------------------------------
+ *
+ * mvdist.c
+ *	  POSTGRES multivariate distinct coefficients
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/mvstats/mvdist.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "common.h"
+#include "utils/lsyscache.h"
+
+/*
+ * 
+ */
+double
+build_mv_ndistinct(int numrows, HeapTuple *rows, int2vector *attrs,
+				   VacAttrStats **stats)
+{
+	int i, j;
+	int numattrs = attrs->dim1;
+	MultiSortSupport mss = multi_sort_init(numattrs);
+	int ndistinct;
+	double result;
+
+	/*
+	 * It's possible to sort the sample rows directly, but this seemed
+	 * somehow simpler / less error prone. Another option would be to
+	 * allocate the arrays for each SortItem separately, but that'd be
+	 * significant overhead (not just CPU, but especially memory bloat).
+	 */
+	SortItem * items = (SortItem*)palloc0(numrows * sizeof(SortItem));
+
+	Datum *values = (Datum*)palloc0(sizeof(Datum) * numrows * numattrs);
+	bool  *isnull = (bool*)palloc0(sizeof(bool) * numrows * numattrs);
+
+	for (i = 0; i < numrows; i++)
+	{
+		items[i].values = &values[i * numattrs];
+		items[i].isnull = &isnull[i * numattrs];
+	}
+
+	Assert(numattrs >= 2);
+
+	for (i = 0; i < numattrs; i++)
+	{
+		/* prepare the sort function for the first dimension */
+		multi_sort_add_dimension(mss, i, i, stats);
+
+		/* accumulate all the data into the array and sort it */
+		for (j = 0; j < numrows; j++)
+		{
+			items[j].values[i]
+				= heap_getattr(rows[j], attrs->values[i],
+							   stats[i]->tupDesc, &items[j].isnull[i]);
+		}
+	}
+
+	qsort_arg((void *) items, numrows, sizeof(SortItem),
+			  multi_sort_compare, mss);
+
+	/* count number of distinct combinations */
+
+	ndistinct = 1;
+	for (i = 1; i < numrows; i++)
+	{
+		if (multi_sort_compare(&items[i], &items[i-1], mss) != 0)
+			ndistinct++;
+	}
+
+	result = 1 / (double)ndistinct;
+
+	/*
+	 * now count distinct values for each attribute and incrementally
+	 * compute ndistinct(a,b) / (ndistinct(a) * ndistinct(b))
+	 */
+	for (i = 0; i < numattrs; i++)
+	{
+		SortSupportData ssup;
+		StdAnalyzeData *tmp = (StdAnalyzeData *)stats[i]->extra_data;
+
+		/* initialize sort support, etc. */
+		memset(&ssup, 0, sizeof(ssup));
+		ssup.ssup_cxt = CurrentMemoryContext;
+
+		/* We always use the default collation for statistics */
+		ssup.ssup_collation = DEFAULT_COLLATION_OID;
+		ssup.ssup_nulls_first = false;
+
+		PrepareSortSupportFromOrderingOp(tmp->ltopr, &ssup);
+
+		memset(values, 0, sizeof(Datum) * numrows);
+
+		/* accumulate all the data into the array and sort it */
+		for (j = 0; j < numrows; j++)
+		{
+			bool isnull;
+			values[j] = heap_getattr(rows[j], attrs->values[i],
+									 stats[i]->tupDesc, &isnull);
+		}
+
+		qsort_arg((void *)values, numrows, sizeof(Datum),
+				  compare_scalars_simple, &ssup);
+
+		ndistinct = 1;
+		for (j = 1; j < numrows; j++)
+		{
+			if (compare_scalars_simple(&values[j], &values[j-1], &ssup) != 0)
+				ndistinct++;
+		}
+
+		result *= ndistinct;
+	}
+
+	return result;
+}
+
+double
+load_mv_ndistinct(Oid mvoid)
+{
+	bool		isnull = false;
+	Datum		deps;
+
+	/* Prepare to scan pg_mv_statistic for entries having indrelid = this rel. */
+	HeapTuple	htup = SearchSysCache1(MVSTATOID, ObjectIdGetDatum(mvoid));
+
+#ifdef USE_ASSERT_CHECKING
+	Form_pg_mv_statistic	mvstat = (Form_pg_mv_statistic) GETSTRUCT(htup);
+	Assert(mvstat->ndist_enabled && mvstat->ndist_built);
+#endif
+
+	deps = SysCacheGetAttr(MVSTATOID, htup,
+						   Anum_pg_mv_statistic_standist, &isnull);
+
+	Assert(!isnull);
+
+	ReleaseSysCache(htup);
+
+	return DatumGetFloat8(deps);
+}
diff --git a/src/include/catalog/pg_mv_statistic.h b/src/include/catalog/pg_mv_statistic.h
index df6a61c..fb9ee22 100644
--- a/src/include/catalog/pg_mv_statistic.h
+++ b/src/include/catalog/pg_mv_statistic.h
@@ -38,6 +38,7 @@ CATALOG(pg_mv_statistic,3381)
 	bool		deps_enabled;		/* analyze dependencies? */
 	bool		mcv_enabled;		/* build MCV list? */
 	bool		hist_enabled;		/* build histogram? */
+	bool		ndist_enabled;		/* build ndist coefficient? */
 
 	/* histogram / MCV size */
 	int32		mcv_max_items;		/* max MCV items */
@@ -47,6 +48,7 @@ CATALOG(pg_mv_statistic,3381)
 	bool		deps_built;			/* dependencies were built */
 	bool		mcv_built;			/* MCV list was built */
 	bool		hist_built;			/* histogram was built */
+	bool		ndist_built;		/* ndistinct coeff built */
 
 	/* variable-length fields start here, but we allow direct access to stakeys */
 	int2vector	stakeys;			/* array of column keys */
@@ -55,6 +57,7 @@ CATALOG(pg_mv_statistic,3381)
 	bytea		stadeps;			/* dependencies (serialized) */
 	bytea		stamcv;				/* MCV list (serialized) */
 	bytea		stahist;			/* MV histogram (serialized) */
+	float8		standcoeff;			/* ndistinct coeff (serialized) */
 #endif
 
 } FormData_pg_mv_statistic;
@@ -70,20 +73,23 @@ typedef FormData_pg_mv_statistic *Form_pg_mv_statistic;
  *		compiler constants for pg_attrdef
  * ----------------
  */
-#define Natts_pg_mv_statistic					14
+#define Natts_pg_mv_statistic					17
 #define Anum_pg_mv_statistic_starelid			1
 #define Anum_pg_mv_statistic_staname			2
 #define Anum_pg_mv_statistic_deps_enabled		3
 #define Anum_pg_mv_statistic_mcv_enabled		4
 #define Anum_pg_mv_statistic_hist_enabled		5
-#define Anum_pg_mv_statistic_mcv_max_items		6
-#define Anum_pg_mv_statistic_hist_max_buckets	7
-#define Anum_pg_mv_statistic_deps_built			8
-#define Anum_pg_mv_statistic_mcv_built			9
-#define Anum_pg_mv_statistic_hist_built			10
-#define Anum_pg_mv_statistic_stakeys			11
-#define Anum_pg_mv_statistic_stadeps			12
-#define Anum_pg_mv_statistic_stamcv				13
-#define Anum_pg_mv_statistic_stahist			14
+#define Anum_pg_mv_statistic_ndist_enabled		6
+#define Anum_pg_mv_statistic_mcv_max_items		7
+#define Anum_pg_mv_statistic_hist_max_buckets	8
+#define Anum_pg_mv_statistic_deps_built			9
+#define Anum_pg_mv_statistic_mcv_built			10
+#define Anum_pg_mv_statistic_hist_built			11
+#define Anum_pg_mv_statistic_ndist_built		12
+#define Anum_pg_mv_statistic_stakeys			13
+#define Anum_pg_mv_statistic_stadeps			14
+#define Anum_pg_mv_statistic_stamcv				15
+#define Anum_pg_mv_statistic_stahist			16
+#define Anum_pg_mv_statistic_standist			17
 
 #endif   /* PG_MV_STATISTIC_H */
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index 3706525..6ecbc4e 100644
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -594,11 +594,13 @@ typedef struct MVStatisticInfo
 	bool		deps_enabled;	/* functional dependencies enabled */
 	bool		mcv_enabled;	/* MCV list enabled */
 	bool		hist_enabled;	/* histogram enabled */
+	bool		ndist_enabled;	/* ndistinct coefficient enabled */
 
 	/* built/available statistics */
 	bool		deps_built;		/* functional dependencies built */
 	bool		mcv_built;		/* MCV list built */
 	bool		hist_built;		/* histogram built */
+	bool		ndist_built;	/* ndistinct coefficient built */
 
 	/* columns in the statistics (attnums) */
 	int2vector *stakeys;		/* attnums of the columns covered */
diff --git a/src/include/utils/mvstats.h b/src/include/utils/mvstats.h
index 9fd1314..d3f9de3 100644
--- a/src/include/utils/mvstats.h
+++ b/src/include/utils/mvstats.h
@@ -224,6 +224,7 @@ typedef MVSerializedHistogramData *MVSerializedHistogram;
 MVDependencies load_mv_dependencies(Oid mvoid);
 MCVList        load_mv_mcvlist(Oid mvoid);
 MVSerializedHistogram    load_mv_histogram(Oid mvoid);
+double		   load_mv_ndistinct(Oid mvoid);
 
 bytea * serialize_mv_dependencies(MVDependencies dependencies);
 bytea * serialize_mv_mcvlist(MCVList mcvlist, int2vector *attrs,
@@ -265,11 +266,16 @@ MVHistogram
 build_mv_histogram(int numrows, HeapTuple *rows, int2vector *attrs,
 				   VacAttrStats **stats, int numrows_total);
 
+double
+build_mv_ndistinct(int numrows, HeapTuple *rows, int2vector *attrs,
+				   VacAttrStats **stats);
+
 void build_mv_stats(Relation onerel, int numrows, HeapTuple *rows,
 					int natts, VacAttrStats **vacattrstats);
 
 void update_mv_stats(Oid relid, MVDependencies dependencies,
 					 MCVList mcvlist, MVHistogram histogram,
+					 double ndistcoeff,
 					 int2vector *attrs, VacAttrStats **stats);
 
 #ifdef DEBUG_MVHIST
-- 
2.1.0

