From e287fa474fea487ace0ee7d476f84b6f787cc2a7 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Tue, 24 Dec 2019 22:02:22 -0600
Subject: [PATCH v2 2/2] Pass ndistinct and minmax to allow good estimates even
 with timestamps of granularity other than 1sec

---
 src/backend/optimizer/util/plancat.c |   8 ++-
 src/backend/utils/adt/selfuncs.c     |  36 ++++++++---
 src/backend/utils/adt/timestamp.c    | 113 ++++++++++++++++++++++-------------
 src/include/nodes/supportnodes.h     |   2 +
 src/include/optimizer/plancat.h      |   2 +-
 5 files changed, 110 insertions(+), 51 deletions(-)

diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 2469ca6..aab794c 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -2011,9 +2011,13 @@ get_function_rows(PlannerInfo *root, Oid funcid, Node *node)
 /*
  * Return a multiplier [0..1] to help estimate effect on rowcount of GROUP BY
  * f(x), relative to input x.
+ *
+ * minmax is an array of (min,max) values for the variable, which might be
+ * useful to determine its granularity (like timestamps per second, minute or
+ * hour).
  */
 double
-get_function_groupby(PlannerInfo *root, Oid funcid, Node *node, Node *var)
+get_function_groupby(PlannerInfo *root, Oid funcid, Node *node, Node *var, double ndistinct, Datum *minmax)
 {
 	HeapTuple	proctup;
 	Form_pg_proc procform;
@@ -2033,6 +2037,8 @@ get_function_groupby(PlannerInfo *root, Oid funcid, Node *node, Node *var)
 		req.funcid = funcid;
 		req.node = node;
 		req.var = var;
+		req.ndistinct = ndistinct;
+		req.minmax = minmax;
 		req.factor = 1;			/* just for sanity */
 
 		sresult = (SupportRequestGroupBy *)
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index eb0b86f..a7f396d 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -116,6 +116,7 @@
 #include "miscadmin.h"
 #include "nodes/makefuncs.h"
 #include "nodes/nodeFuncs.h"
+#include "nodes/supportnodes.h"
 #include "optimizer/clauses.h"
 #include "optimizer/cost.h"
 #include "optimizer/optimizer.h"
@@ -3168,16 +3169,37 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 				Form_pg_statistic stats = (Form_pg_statistic)GETSTRUCT(vardata.statsTuple);
 				FuncExpr   *expr = (FuncExpr *) groupexpr;
 
-				Var *v = (Var*) var;
+				Datum	minmax[2];
+				TypeCacheEntry *tce;
+				Var 	*v = (Var*) var;
 				RangeTblEntry *rte = root->simple_rte_array[v->varno];
-				char *reln = get_rel_name(rte->relid);
-				char *coln = get_attname(rte->relid, v->varattno, true);
-				ret = get_function_groupby(root, expr->funcid, groupexpr, var);
-
-				fprintf(stderr, "HERE %s %d %s.%s ndistinct=%f ret=%f\n", __FUNCTION__, __LINE__,
+				char 	*reln = get_rel_name(rte->relid);
+				char 	*coln = get_attname(rte->relid, v->varattno, true);
+
+				/* Seems like maybe this should be defined here and pass a single argument to groupby helper? */
+				SupportRequestGroupBy req = {
+					.type = T_SupportRequestGroupBy,
+					.root = root,
+					.funcid = expr->funcid,
+					.node = groupexpr,
+					.var = var,
+					.ndistinct = stats->stadistinct >= 0 ? stats->stadistinct :
+						-stats->stadistinct * vardata.rel->tuples,
+					.minmax = minmax,
+					.factor = 1,	/* just for sanity */
+				};
+
+				fprintf(stderr, "HERE %s %d %s.%s tuples=%f, stadistinct=%f ndistinct=%f ret=%f\n", __FUNCTION__, __LINE__,
 						reln?reln:"null",
 						coln?coln:"null",
-						stats->stadistinct, ret);
+						vardata.rel->tuples,
+						stats->stadistinct, req.ndistinct, ret);
+
+				tce = lookup_type_cache(v->vartype, TYPECACHE_LT_OPR);
+				if (get_variable_range(root, &vardata, tce->lt_opr, minmax, minmax+1))
+					ret = get_function_groupby(root, expr->funcid, groupexpr, var, req.ndistinct, minmax);
+				else
+					ret = get_function_groupby(root, expr->funcid, groupexpr, var, req.ndistinct, NULL);
 
 				Assert(ret>=0);
 				Assert(ret<=1);
diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c
index 359ad32..0314ef9 100644
--- a/src/backend/utils/adt/timestamp.c
+++ b/src/backend/utils/adt/timestamp.c
@@ -5563,13 +5563,44 @@ generate_series_timestamptz(PG_FUNCTION_ARGS)
 Datum
 date_trunc_support(PG_FUNCTION_ARGS)
 {
-	Node	   *rawreq;
+	Node		*rawreq;
 	SupportRequestGroupBy *req;
-	List	   *args;
-	Node	   *arg1, *arg2;
-	Node		 	*var;
+	List		*args;
+	Node		*arg1, *arg2;
+	Node	 	*var;
 	char		*start;
-	int			typmod;
+	int			typmod, i;
+	long unsigned int	diff;
+
+	const struct {
+		const char		*name;	/* Granularity name */
+		const int		factor;	/* Multiplier */
+		const double	minsecsec; /* Minimum number of distinct second values per second in range of the histogram */
+	} grans[] = {
+		/* XXX: these factors not applied unless...typmod>1 ? */
+		{ "microseconds", 1 },
+		{ "milliseconds", 1 },
+
+		/*
+		 * These factors not applied unless minmax indicates the incoming
+		 * timestamp is at fine enough granularity that truncating to courser
+		 * granularity would affect result.
+		 */
+		{ "second", 1, 1.0 },
+		{ "minute", 60, 1.0/60 },
+		{ "hour", 60, 1.0/60/60 },
+		{ "day", 24, 1.0/60/60/24 }, // XXX: should not handle for DATEOID
+
+		/* These factors applied up to the specified granularity */
+		{ "week", 7 },
+		{ "month", 30 },
+		{ "quarter", 3 },
+		{ "year", 4 },
+		{ "decade", 10 },
+		{ "century", 10 },
+		{ "millennium", 10 },
+
+	};
 
 	rawreq = (Node *) PG_GETARG_POINTER(0);
 	if (!IsA(rawreq, SupportRequestGroupBy))
@@ -5584,13 +5615,45 @@ date_trunc_support(PG_FUNCTION_ARGS)
 	arg2 = lsecond(args);
 	/* arg3 may be the timezone */
 
-	var = req->var;
+	// XXX: handle if these are null ?
+	diff = req->minmax ? timestamptz_to_time_t(DatumGetTimestamp(req->minmax[1])) -
+			timestamptz_to_time_t(DatumGetTimestamp(req->minmax[0]))
+			: 0;
 
-	/* XXX Assumes the input has 1-second granularity */
+	fprintf(stderr, "got distinct %f diff=%ld\n", req->ndistinct, diff);
 
 	// XXX: only work on const?
 	start = TextDatumGetCString(((Const*)arg1)->constvalue);
 
+	for (i=0; ; ++i) {
+		if (i >= sizeof(grans)/sizeof(*grans))
+			/* Unhandled truncation granularity */
+			PG_RETURN_POINTER(NULL);
+
+		fprintf(stderr, "applying factor %s %d: cur=%f %f -gt %f\n",
+				grans[i].name, grans[i].factor, req->factor,
+				req->ndistinct/diff, grans[i].minsecsec );
+
+		if (req->ndistinct / diff >= grans[i].minsecsec) {
+			if (req->ndistinct / diff > grans[i-1].minsecsec)
+				/* Apply the factor in full strength */
+				req->factor /= grans[i].factor;
+			else {
+				/* interpolate: if at (say) 15 minute granularity, then apply a 4x hourly correction, not 60x */
+				// req->factor /= grans[i].factor / (req->ndistinct / diff / grans[i].minsecsec);
+				req->factor /= req->ndistinct / diff / grans[i].minsecsec; // XXX: is this right
+				fprintf(stderr, "applying partial factor %f\n", req->ndistinct / diff / grans[i].minsecsec);
+			}
+		}
+
+		if (strcmp(start, grans[i].name) == 0)
+			break;
+	}
+
+	PG_RETURN_POINTER(req);
+
+#if 0
+	// var = req->var;
 	// XXX: not working due to promotion ?
 	// exprType(var) is still not right, since date_trunc(a, b::date) uses b's type and not date..
 	// ...but date_trunc('', x::date) is weird
@@ -5604,50 +5667,16 @@ date_trunc_support(PG_FUNCTION_ARGS)
 		/* If the input has decimal digits, the grouping effect is stronger */
 		if (typmod != -1) {
 			req->factor /= 2<<typmod;
-			if (strcmp(start, "microseconds")==0) {
-				/* do nothing? */
-			} else if (strcmp(start, "milliseconds")==0) {
-				/* do nothing? */
-			}
 		}
 
-		if (strcmp(start, "second")==0) {
-			/* do nothing */
-		} else if (strcmp(start, "minute")==0) {
-			req->factor /= 60;
-		} else if (strcmp(start, "hour")==0) {
-			req->factor /= 60*60;
-		}
 	}
 
 	// else { elog(ERROR, "unknown type %u", exprType(var)); }
 
-	if (strcmp(start, "day")==0) {
-		req->factor /= 60*60*24;
-	} else if (strcmp(start, "week")==0) {
-		req->factor /= 60*60*24*7;
-	} else if (strcmp(start, "month")==0) {
-		/* 30 days */
-		req->factor /= 60*60*24*30;
-	} else if (strcmp(start, "quarter")==0) {
-		req->factor /= 60*60*24*30*3;
-	} else if (strcmp(start, "year")==0) {
-		req->factor /= 60*60*24*365.24;
-	} else if (strcmp(start, "decade")==0) {
-		req->factor /= 60*60*24*365.25*10;
-	} else if (strcmp(start, "century")==0) {
-		req->factor /= 60*60*24*365.25*100;
-	} else if (strcmp(start, "millennium")==0) {
-		req->factor /= 60*60*24*365.25*1000;
-	} else if (req->factor > 1) {
 		/* Maybe a DATE with finer graularity trunc */
 		req->factor = 1;
 	}
-	// else { elog(ERROR, "", ); }
 
-	/* Fudge Factor, since the input may be already "grouped", say at multiples of 15min, */
-	/* or otherwise have course granularity to begin with */
-	// factor/=4;
+#endif
 
-	PG_RETURN_POINTER(req);
 }
diff --git a/src/include/nodes/supportnodes.h b/src/include/nodes/supportnodes.h
index cb4ea44..f5f33bf 100644
--- a/src/include/nodes/supportnodes.h
+++ b/src/include/nodes/supportnodes.h
@@ -178,6 +178,8 @@ typedef struct SupportRequestGroupBy
 	Oid			funcid;			/* function we are inquiring about */
 	Node		*var;			/* original (2nd) argument */
 	Node		*node;			/* parse node invoking function */
+	double		ndistinct;		/* Initial estimate of ndistinct of the variable */
+	Datum		*minmax;		/* Array of (min,max) values for variable */
 
 	/* Output fields: */
 	double		factor;			/* [0..1] fraction of rows in GROUP BY f(x)
diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h
index cbe5386..c13d40d 100644
--- a/src/include/optimizer/plancat.h
+++ b/src/include/optimizer/plancat.h
@@ -70,7 +70,7 @@ extern void add_function_cost(PlannerInfo *root, Oid funcid, Node *node,
 
 extern double get_function_rows(PlannerInfo *root, Oid funcid, Node *node);
 
-extern double get_function_groupby(PlannerInfo *root, Oid funcid, Node *node, Node *var);
+extern double get_function_groupby(PlannerInfo *root, Oid funcid, Node *node, Node *var, double ndistinct, Datum *minmax);
 
 extern bool has_row_triggers(PlannerInfo *root, Index rti, CmdType event);
 
-- 
2.7.4

