>From 04207bbe8a5800fcc2c76cdacb77798639113b80 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@lab.ntt.co.jp>
Date: Thu, 6 Aug 2015 17:12:28 +0900
Subject: [PATCH 2/6] Analyze part for multivariate coefficient.

Calculates multivarite coefficients on ANALYZE and stores them into
pg_mvcoefficient.  This code is considering multiple definitions for
one relation. (However definition part registers no more than one
definition per relation.)

It calculates the following number

 ndistinct(A * B * ...)
----------------------------------
 ndistint(A) * ndistinct(B) * ...
---
 src/backend/commands/analyze.c | 402 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 401 insertions(+), 1 deletion(-)

diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 861048f..45f7cf5 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -27,6 +27,7 @@
 #include "catalog/indexing.h"
 #include "catalog/pg_collation.h"
 #include "catalog/pg_inherits_fn.h"
+#include "catalog/pg_mvcoefficient.h"
 #include "catalog/pg_namespace.h"
 #include "commands/dbcommands.h"
 #include "commands/tablecmds.h"
@@ -46,6 +47,7 @@
 #include "utils/acl.h"
 #include "utils/attoptcache.h"
 #include "utils/datum.h"
+#include "utils/fmgroids.h"
 #include "utils/guc.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
@@ -56,6 +58,10 @@
 #include "utils/timestamp.h"
 #include "utils/tqual.h"
 
+#ifndef CHAR_BIT
+#include <limits.h>
+#endif
+
 
 /* Per-index data for ANALYZE */
 typedef struct AnlIndexData
@@ -96,7 +102,11 @@ static void update_attstats(Oid relid, bool inh,
 				int natts, VacAttrStats **vacattrstats);
 static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
 static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
-
+static float4 compute_mv_ndistinct(int nattrs,
+								   VacAttrStats **colstats,
+								   AnalyzeAttrFetchFunc fetchfunc,
+								   int samplerows,
+								   double totalrows);
 
 /*
  *	analyze_rel() -- analyze one relation
@@ -281,6 +291,128 @@ analyze_rel(Oid relid, RangeVar *relation, int options,
 	LWLockRelease(ProcArrayLock);
 }
 
+/* Compute and store multivariate distinctness */
+static void
+update_mv_distinct(Oid reloid, VacAttrStats **vacattrstats,
+				   int attr_cnt, int numrows, double totalrows)
+{
+	ScanKeyData	scankey;
+	SysScanDesc	sysscan;
+	Relation	mvcrel;
+	HeapTuple	oldtup, newtup;
+
+	mvcrel = heap_open(MvCoefficientRelationId, RowExclusiveLock);
+
+	ScanKeyInit(&scankey,
+				Anum_pg_mvcoefficient_mvcreloid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(reloid));
+	sysscan = systable_beginscan(mvcrel, MvCoefficientIndexId, true,
+								 NULL, 1, &scankey);
+	oldtup = systable_getnext(sysscan);
+
+	while (HeapTupleIsValid(oldtup))
+	{
+		VacAttrStats *colstats[MVCOEF_MAX_COLS];
+		int		ncols;
+		double	nd_cartesian, nd, ncol_totalrows;
+		Datum	values[Natts_pg_mvcoefficient];
+		bool	nulls[Natts_pg_mvcoefficient];
+		bool	replaces[Natts_pg_mvcoefficient];
+		int		i;
+		float4	nd_coef;
+				
+		Form_pg_mvcoefficient mvc =
+			(Form_pg_mvcoefficient) GETSTRUCT (oldtup);
+
+		Assert(mvc->mvcnattrs > 1 && mvc->mvcnattrs <= MVCOEF_MAX_COLS);
+		ncols = mvc->mvcnattrs;
+
+		/* find vacattrstats entry for required columns */
+		for (i = 0 ; i < ncols ; i++)
+		{
+			int j;
+
+			for(j = 0 ; j < attr_cnt ; j++)
+			{
+				if (mvc->mvcattrs.values[i] == vacattrstats[j]->tupattnum)
+				{
+					colstats[i] = vacattrstats[j];
+					break;
+				}
+			}
+
+			/* required column is not in vacattrstats, give up */
+			if (j == attr_cnt)
+				break;
+		}
+
+		/* This mvdistinct is not computable with this vacattrstats. */
+		if (i != ncols)
+		{
+			oldtup = systable_getnext(sysscan);
+			continue;
+		}
+
+		/*
+		 * compute cartesian ndistinct =
+		 *      ndistinct(col0) * ndistinct(col1) ...
+		 *
+		 * This calculation is performed with distinct ratio to preserve
+		 * precision.
+		 */
+		nd_cartesian = 1.0;
+		for (i = 0 ; i < ncols ; i++)
+		{
+			double t = -colstats[i]->stadistinct;
+
+			/* ignore colums where stadistinct is "unknown" */
+			if (t == 0.0) continue;
+
+			if (t < 0)
+				t = -t / totalrows;
+
+			nd_cartesian *= t;
+		}
+
+		nd_coef = 1.0;
+
+		/* compute ndistinct(col0 * col1 * ... ) */
+		nd = compute_mv_ndistinct(i, colstats,
+								  std_fetch_func, numrows, totalrows);
+
+		ncol_totalrows = pow(totalrows, ncols);
+		if (!isinf(ncol_totalrows))	/* This won't be happen? */
+			nd_coef = nd / nd_cartesian / ncol_totalrows;
+		else
+			nd_coef = 0.0;
+
+		/* Update statistics record */
+		for (i = 0; i < Natts_pg_mvcoefficient ; ++i)
+		{
+			nulls[i] = false;
+			replaces[i] = false;
+		}
+		values[Anum_pg_mvcoefficient_mvccoefficient - 1] =
+			Float8GetDatum(nd_coef);
+		replaces[Anum_pg_mvcoefficient_mvccoefficient - 1] = true;
+		newtup = heap_modify_tuple(oldtup,
+								   RelationGetDescr(mvcrel),
+								   values,
+								   nulls,
+								   replaces);
+		simple_heap_update(mvcrel, &oldtup->t_self, newtup);
+				
+		CatalogUpdateIndexes(mvcrel, newtup);
+				
+		oldtup = systable_getnext(sysscan);
+	}
+
+	systable_endscan(sysscan);
+	heap_close(mvcrel, RowExclusiveLock);
+}
+
+
 /*
  *	do_analyze_rel() -- analyze one relation, recursively or not
  *
@@ -538,6 +670,10 @@ do_analyze_rel(Relation onerel, int options, VacuumParams *params,
 			MemoryContextResetAndDeleteChildren(col_context);
 		}
 
+		/* update multivariate distinctness if requested */
+		update_mv_distinct(onerel->rd_id, vacattrstats, attr_cnt,
+						   numrows, totalrows);
+
 		if (hasindex)
 			compute_index_stats(onerel, totalrows,
 								indexdata, nindexes,
@@ -1676,6 +1812,16 @@ typedef struct
 	int			tupno;			/* position index for tuple it came from */
 } ScalarItem;
 
+#define VI_UBITS	(CHAR_BIT * sizeof(unsigned))
+#define VI_BSET(bm,an) ((bm)[(an)/VI_UBITS] |= (unsigned)1 << ((an)%VI_UBITS))
+#define VI_ISBSET(bm,an) ((bm)[(an)/VI_UBITS] & ((unsigned)1 << ((an)%VI_UBITS)))
+typedef struct
+{
+	unsigned	nulls[MVCOEF_MAX_COLS / VI_UBITS + 1]; /* null bitmap */
+	Datum	   *datums;			/* Values list */
+	int			tupno;			/* position index for tuple it came from */
+} VectorItem;
+
 typedef struct
 {
 	int			count;			/* # of duplicates */
@@ -1698,6 +1844,7 @@ static void compute_scalar_stats(VacAttrStatsP stats,
 					 int samplerows,
 					 double totalrows);
 static int	compare_scalars(const void *a, const void *b, void *arg);
+static int	compare_mv_scalars(const void *a, const void *b, void *arg);
 static int	compare_mcvs(const void *a, const void *b);
 
 
@@ -2628,6 +2775,220 @@ compute_scalar_stats(VacAttrStatsP stats,
 }
 
 /*
+ *	compute_mv_ndistinct() -- compute multicolumn distinctness
+ */ 
+static float4
+compute_mv_ndistinct(int nattrs,
+					 VacAttrStats **colstats,
+					 AnalyzeAttrFetchFunc fetchfunc,
+					 int samplerows,
+					 double totalrows)
+{
+	int			i;
+	int			null_cnt = 0;
+	int			nonnull_cnt = 0;
+	int			toowide_cnt = 0;
+	bool		is_varlena[MVCOEF_MAX_COLS];
+	SortSupportData ssup[MVCOEF_MAX_COLS];
+	Datum	   *datums;
+	VectorItem *values;
+	int			values_cnt = 0;
+	int		   *tupnoLink;
+	StdAnalyzeData *mystats[MVCOEF_MAX_COLS];
+	float4		fndistinct;
+
+	Assert (nattrs > 1 && nattrs <= MVCOEF_MAX_COLS);
+
+	/* set up row storage for sorting */
+	datums = (Datum*) palloc(samplerows * nattrs * sizeof(Datum));
+	values = (VectorItem *) palloc0(samplerows * sizeof(VectorItem));
+	tupnoLink = (int *) palloc(samplerows * sizeof(int));
+
+	for (i = 0 ; i < samplerows ; i++)
+		values[i].datums = &datums[i * nattrs];
+
+	memset(ssup, 0, sizeof(ssup));
+	for (i = 0 ; i < nattrs ; i++)
+	{
+		VacAttrStats *vas = colstats[i];
+
+		is_varlena[i] =
+			!vas->attrtype->typbyval && vas->attrtype->typlen == -1;
+		mystats[i] =
+			(StdAnalyzeData*) vas->extra_data;
+
+		ssup[i].ssup_cxt = CurrentMemoryContext;
+		/* We always use the default collation for statistics */
+		ssup[i].ssup_collation = DEFAULT_COLLATION_OID;
+		/* Don't use abbreviated key conversion for now.  See
+		   compute_scalar_stats. */
+		/* either will do for sort order and nulls_first */
+		PrepareSortSupportFromOrderingOp(mystats[i]->ltopr, &ssup[i]);
+	}
+	ssup[nattrs].ssup_cxt = NULL;
+
+	/* Initial scan to find sortable values */
+	for (i = 0; i < samplerows; i++)
+	{
+		Datum	colvalues[nattrs];
+		bool	colnulls[nattrs];
+		bool	all_is_null = true;
+		bool	toowide = false;
+		int 	j;
+
+		vacuum_delay_point();
+
+		for (j = 0 ; j < nattrs ; j++)
+		{
+			colnulls[nattrs] = false;
+
+			colvalues[j] = fetchfunc(colstats[j], i, &colnulls[j]);
+
+			/*
+			 * Check for null/nonnull. Don't abandon this row if null is
+			 * contained
+			 */
+			if (colnulls[j])
+				continue;
+
+			all_is_null = false;
+
+			/*
+			 * If it's a variable-width field, detoast it now to avoid
+			 * repeated detoasting during the comparisons. If at least one of
+			 * the columns is too long, exclude this row from accurate
+			 * counting.
+			 */
+			if (is_varlena[j])
+			{
+				if (toast_raw_datum_size(colvalues[j]) > WIDTH_THRESHOLD)
+				{
+					toowide = true;
+					break;
+				}
+				colvalues[j] = PointerGetDatum(PG_DETOAST_DATUM(colvalues[j]));
+			}
+		}
+		if (all_is_null)
+		{
+			null_cnt++;
+			continue;
+		}
+		nonnull_cnt++;
+
+		if (toowide)
+		{
+			toowide_cnt++;
+			continue;
+		}
+
+		/* Add it to the list to be sorted */
+		for (j = 0 ; j < nattrs ; j++)
+		{
+			if (colnulls[j])
+				VI_BSET(values[values_cnt].nulls, j);
+			else
+				values[values_cnt].datums[j] = colvalues[j];
+		}
+
+		values[values_cnt].tupno = values_cnt;
+		tupnoLink[values_cnt] = values_cnt;
+		values_cnt++;
+	}
+
+	/* We can only compute real stats if we found some sortable values. */
+	if (values_cnt > 0)
+	{
+		int			ndistinct,	/* # distinct values in sample */
+					nmultiple,	/* # that appear multiple times */
+					dups_cnt;
+		CompareScalarsContext cxt;
+
+		/* Sort the collected values */
+		cxt.ssup = ssup;
+		cxt.tupnoLink = tupnoLink;
+		qsort_arg((void *) values, values_cnt, sizeof(VectorItem),
+				  compare_mv_scalars, (void *) &cxt);
+
+		ndistinct = 0;
+		nmultiple = 0;
+		dups_cnt = 0;
+		for (i = 0; i < values_cnt; i++)
+		{
+			int			tupno = values[i].tupno;
+
+			dups_cnt++;
+			if (tupnoLink[tupno] == tupno)
+			{
+				/* Reached end of duplicates of this value */
+				ndistinct++;
+				if (dups_cnt > 1)
+					nmultiple++;
+
+				dups_cnt = 0;
+			}
+		}
+
+		if (nmultiple == 0)
+		{
+			/* If we found no repeated values, assume it's a unique column */
+			fndistinct = totalrows;
+		}
+		else if (toowide_cnt == 0 && nmultiple == ndistinct)
+		{
+			/*
+			 * Every value in the sample appeared more than once.  Assume the
+			 * columns have just these values.
+			 */
+			fndistinct = (float4)ndistinct;
+		}
+		else
+		{
+			/*----------
+			 * Estimate the number of distinct values using the estimator.
+			 * See compute_scalar_stats for the algorithm.
+			 *----------
+			 */
+			int			f1 = ndistinct - nmultiple + toowide_cnt;
+			int			d = f1 + nmultiple;
+			double		numer,
+						denom,
+						stadistinct;
+
+			numer = (double) samplerows *(double) d;
+
+			denom = (double) (samplerows - f1) +
+				(double) f1 *(double) samplerows / totalrows;
+
+			stadistinct = numer / denom;
+			/* Clamp to sane range in case of roundoff error */
+			if (stadistinct < (double) d)
+				stadistinct = (double) d;
+			if (stadistinct > totalrows)
+				stadistinct = totalrows;
+			fndistinct = floor(stadistinct + 0.5);
+		}
+	}
+	else if (nonnull_cnt > 0)
+	{
+		/* We found some non-null values, but they were all too wide */
+		Assert(nonnull_cnt == toowide_cnt);
+
+		/* Assume all too-wide values are distinct, so all rows are unique */
+		fndistinct = totalrows;
+	}
+	else if (null_cnt > 0)
+	{
+		/* We found only all-null rows; assume the rows are entirely null */
+		fndistinct =  0.0;		/* "unknown" */
+	}
+
+	/* We don't need to bother cleaning up any of our temporary palloc's */
+	return fndistinct;
+}
+
+
+/*
  * qsort_arg comparator for sorting ScalarItems
  *
  * Aside from sorting the items, we update the tupnoLink[] array
@@ -2664,6 +3025,45 @@ compare_scalars(const void *a, const void *b, void *arg)
 	return ta - tb;
 }
 
+static int
+compare_mv_scalars(const void *a, const void *b, void *arg)
+{
+	CompareScalarsContext *cxt = (CompareScalarsContext *) arg;
+	VectorItem *va = (VectorItem*)a;
+	VectorItem *vb = (VectorItem*)b;
+	Datum		da, db;
+	int			ta, tb;
+	int			compare;
+	int i;
+
+	for (i = 0 ; cxt->ssup[i].ssup_cxt ; i++)
+	{
+		bool da_isnull = VI_ISBSET(va->nulls, i);
+		bool db_isnull = VI_ISBSET(vb->nulls, i);
+		da = va->datums[i];
+		db = vb->datums[i];
+
+		compare = ApplySortComparator(da, da_isnull, db, db_isnull, &cxt->ssup[i]);
+		if (compare != 0)
+			return compare;
+	}
+
+	/*
+	 * The two datums are equal, so update cxt->tupnoLink[].
+	 */
+	ta = va->tupno;
+	tb = vb->tupno;
+	if (cxt->tupnoLink[ta] < tb)
+		cxt->tupnoLink[ta] = tb;
+	if (cxt->tupnoLink[tb] < ta)
+		cxt->tupnoLink[tb] = ta;
+
+	/*
+	 * For equal datums, sort by tupno
+	 */
+	return ta - tb;
+}
+
 /*
  * qsort comparator for sorting ScalarMCVItems by position
  */
-- 
1.8.3.1

