From 136f0d0b04a8592220f0975a3529f4e34a227f48 Mon Sep 17 00:00:00 2001
From: Corey Huinker <corey.huinker@gmail.com>
Date: Tue, 17 Dec 2024 03:30:55 -0500
Subject: [PATCH v4 1/4] Add working input function for pg_ndistinct.

This is needed to import extended statistics.
---
 .../statistics/extended_stats_internal.h      |   8 +
 src/backend/statistics/mvdistinct.c           | 359 +++++++++++++++++-
 src/test/regress/expected/stats_ext.out       |   7 +
 src/test/regress/sql/stats_ext.sql            |   3 +
 4 files changed, 371 insertions(+), 6 deletions(-)

diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index efcb7dc3546..396915a8a97 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -127,4 +127,12 @@ extern Selectivity mcv_clause_selectivity_or(PlannerInfo *root,
 											 Selectivity *overlap_basesel,
 											 Selectivity *totalsel);
 
+extern Datum import_mcvlist(HeapTuple tup, int elevel, int numattrs,
+							Oid *atttypids, int32 *atttypmods, Oid *atttypcolls,
+							int nitems, Datum *mcv_elems, bool *mcv_nulls,
+							bool *mcv_elem_nulls, float8 *freqs, float8 *base_freqs);
+extern bool pg_ndistinct_validate_items(MVNDistinct *ndistinct, int2vector *stxkeys,
+										int numexprs, int elevel);
+extern void free_pg_ndistinct(MVNDistinct *ndistinct);
+
 #endif							/* EXTENDED_STATS_INTERNAL_H */
diff --git a/src/backend/statistics/mvdistinct.c b/src/backend/statistics/mvdistinct.c
index 7e7a63405c8..e9c02aaa63e 100644
--- a/src/backend/statistics/mvdistinct.c
+++ b/src/backend/statistics/mvdistinct.c
@@ -27,10 +27,19 @@
 
 #include "catalog/pg_statistic_ext.h"
 #include "catalog/pg_statistic_ext_data.h"
+#include "common/int.h"
+#include "common/jsonapi.h"
+#include "fmgr.h"
 #include "lib/stringinfo.h"
+#include "mb/pg_wchar.h"
+#include "nodes/miscnodes.h"
+#include "nodes/pg_list.h"
 #include "statistics/extended_stats_internal.h"
 #include "statistics/statistics.h"
+#include "utils/builtins.h"
+#include "utils/float.h"
 #include "utils/fmgrprotos.h"
+#include "utils/palloc.h"
 #include "utils/syscache.h"
 #include "utils/typcache.h"
 #include "varatt.h"
@@ -328,23 +337,361 @@ statext_ndistinct_deserialize(bytea *data)
 	return ndistinct;
 }
 
+typedef struct
+{
+	const char *str;			/* raw input string, echoed in error messages */
+	bool		found_only_object;	/* set once the top-level object started */
+	List	   *distinct_items; /* MVNDistinctItem pointers collected so far */
+	Node	   *escontext;		/* error-reporting context used by callbacks */
+
+	MVNDistinctItem *current_item;	/* item still waiting for its ndistinct */
+}			ndistinctParseState;
+
+/*
+ * Invoked at the start of each object in the JSON document.  The document
+ * must be exactly one object with no sub-objects, so seeing a second object
+ * start is an error.  Errors are raised with errsave() rather than ereturn(),
+ * whose built-in "return (Datum) 0" would be read as JSON_SUCCESS.
+ */
+static JsonParseErrorType
+ndistinct_object_start(void *state)
+{
+	ndistinctParseState *parse = state;
+
+	if (parse->found_only_object)
+	{
+		errsave(parse->escontext,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+				 errdetail("Must not contain nested objects.")));
+		return JSON_SEM_ACTION_FAILED;
+	}
+
+	parse->found_only_object = true;
+	return JSON_SUCCESS;
+}
+
+/*
+ * The ndistinct input format has no arrays, so any array start is an error.
+ * Reported via errsave() so that JSON_SEM_ACTION_FAILED is actually returned.
+ */
+static JsonParseErrorType
+ndistinct_array_start(void *state)
+{
+	ndistinctParseState *parse = state;
+
+	errsave(parse->escontext,
+			(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+			 errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+			 errdetail("All ndistinct count values are scalar doubles.")));
+	return JSON_SEM_ACTION_FAILED;
+}
+
+static int
+attnum_compare(const void *aptr, const void *bptr)
+{
+	/* three-way comparison of two AttrNumbers, for use with qsort() */
+	const AttrNumber *lhs = aptr;
+	const AttrNumber *rhs = bptr;
+	return pg_cmp_s16(*lhs, *rhs);
+}
+
+/*
+ * Invoked for each object key.  A key is a comma-separated list of attnums,
+ * with negative attnums representing one of the expressions defined in the
+ * extended statistics object.  Builds the item that the next scalar fills in.
+ */
+static JsonParseErrorType
+ndistinct_object_field_start(void *state, char *fname, bool isnull)
+{
+	ndistinctParseState *parse = state;
+	char	   *token;
+	char	   *saveptr;
+	const char *delim = ", ";
+	char	   *scratch;
+	List	   *attnum_list = NIL;
+	int			natts = 0;
+	MVNDistinctItem *item;
+	AttrNumber *attrsort;
+
+	if (isnull || fname == NULL)
+	{
+		/* errsave, not ereturn: the latter would return 0, i.e. JSON_SUCCESS */
+		errsave(parse->escontext,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+				 errdetail("All ndistinct attnum lists must be a comma separated list of attnums.")));
+		return JSON_SEM_ACTION_FAILED;
+	}
+
+	scratch = pstrdup(fname);
+
+	token = strtok_r(scratch, delim, &saveptr);
+
+	while (token != NULL)
+	{
+		attnum_list = lappend(attnum_list, (void *) token);
+		token = strtok_r(NULL, delim, &saveptr);
+	}
+	/* list_length() copes with NIL, which a key without any token yields */
+	natts = list_length(attnum_list);
+
+	/*
+	 * We need at least 2 attnums for a ndistinct item, anything less is
+	 * malformed.
+	 */
+	if (natts < 2)
+	{
+		errsave(parse->escontext,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+				 errdetail("All ndistinct attnum lists must be a comma separated list of attnums.")));
+
+		return JSON_SEM_ACTION_FAILED;
+	}
+
+	item = palloc(sizeof(MVNDistinctItem));
+	item->nattributes = natts;
+	item->attributes = palloc0(natts * sizeof(AttrNumber));
+	attrsort = palloc0(natts * sizeof(AttrNumber));
+
+	for (int i = 0; i < natts; i++)
+	{
+		char	   *s = (char *) list_nth(attnum_list, i);
+
+		attrsort[i] = pg_strtoint16_safe(s, parse->escontext);
+		item->attributes[i] = attrsort[i];
+
+		if (SOFT_ERROR_OCCURRED(parse->escontext))
+			return JSON_SEM_ACTION_FAILED;
+	}
+
+	list_free(attnum_list);
+	pfree(scratch);
+
+	/* sort a scratch copy so that duplicate attnums end up adjacent */
+	qsort(attrsort, natts, sizeof(AttrNumber), attnum_compare);
+	for (int i = 1; i < natts; i++)
+		if (attrsort[i] == attrsort[i - 1])
+		{
+			errsave(parse->escontext,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+					 errdetail("attnum list duplicate value found: %d", attrsort[i])));
+			return JSON_SEM_ACTION_FAILED;
+		}
+
+	pfree(attrsort);
+
+	/* add ndistinct-less MVNDistinctItem to the list */
+	parse->current_item = item;
+	parse->distinct_items = lappend(parse->distinct_items, (void *) item);
+	return JSON_SUCCESS;
+}
+
+/*
+ * The ndistinct input format has no arrays, so any array element is an error.
+ * Reported via errsave() so that JSON_SEM_ACTION_FAILED is actually returned.
+ */
+static JsonParseErrorType
+ndistinct_array_element_start(void *state, bool isnull)
+{
+	ndistinctParseState *parse = state;
+
+	errsave(parse->escontext,
+			(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+			 errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+			 errdetail("Cannot contain array elements.")));
+
+	return JSON_SEM_ACTION_FAILED;
+}
+
+/*
+ * Handle scalar events from the ndistinct input parser.
+ *
+ * There is only one case where we will encounter a scalar, and that is the
+ * ndistinct value for the previous object key.
+ */
+static JsonParseErrorType
+ndistinct_scalar(void *state, char *token, JsonTokenType tokentype)
+{
+	ndistinctParseState *parse = state;
+
+	/* if the entire json is just one scalar, that's wrong */
+	if (!parse->found_only_object)
+	{
+		/* errsave, not ereturn: the latter would return 0, i.e. JSON_SUCCESS */
+		errsave(parse->escontext,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("malformed pg_ndistinct: \"%s\"", parse->str),
+				 errdetail("Must begin with \"{\"")));
+		return JSON_SEM_ACTION_FAILED;
+	}
+
+	Assert(parse->current_item != NULL);
+
+	parse->current_item->ndistinct = float8in_internal(token, NULL, "double",
+													   token, parse->escontext);
+
+	if (SOFT_ERROR_OCCURRED(parse->escontext))
+		return JSON_SEM_ACTION_FAILED;
+
+	/* mark us done with this item */
+	parse->current_item = NULL;
+	return JSON_SUCCESS;
+}
+
 /*
  * pg_ndistinct_in
  *		input routine for type pg_ndistinct
  *
- * pg_ndistinct is real enough to be a table column, but it has no
- * operations of its own, and disallows input (just like pg_node_tree).
+ * example input: {"6, -1": 14, "6, -2": 9143, "-1, -2": 13454, "6, -1, -2": 14549}
+ *
+ * The format is a subset of JSON, so the JSON parser does the lexing while the
+ * callbacks validate it; an object without any items ("{}") is rejected.
  */
 Datum
 pg_ndistinct_in(PG_FUNCTION_ARGS)
 {
-	ereport(ERROR,
-			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-			 errmsg("cannot accept a value of type %s", "pg_ndistinct")));
+	char	   *str = PG_GETARG_CSTRING(0);
 
-	PG_RETURN_VOID();			/* keep compiler quiet */
+	ndistinctParseState parse_state;
+	JsonParseErrorType result;
+	JsonLexContext *lex;
+	JsonSemAction sem_action;
+
+	/* initialize semantic state */
+	parse_state.str = str;
+	parse_state.found_only_object = false;
+	parse_state.distinct_items = NIL;
+	parse_state.escontext = fcinfo->context;
+	parse_state.current_item = NULL;
+
+	/* set callbacks */
+	sem_action.semstate = (void *) &parse_state;
+	sem_action.object_start = ndistinct_object_start;
+	sem_action.object_end = NULL;
+	sem_action.array_start = ndistinct_array_start;
+	sem_action.array_end = NULL;
+	sem_action.object_field_start = ndistinct_object_field_start;
+	sem_action.object_field_end = NULL;
+	sem_action.array_element_start = ndistinct_array_element_start;
+	sem_action.array_element_end = NULL;
+	sem_action.scalar = ndistinct_scalar;
+
+	lex = makeJsonLexContextCstringLen(NULL, str, strlen(str), PG_UTF8, true);
+	result = pg_parse_json(lex, &sem_action);
+	freeJsonLexContext(lex);
+	if (result == JSON_SUCCESS)
+	{
+		MVNDistinct *ndistinct;
+		int			nitems = list_length(parse_state.distinct_items);
+		bytea	   *bytes;
+
+		/* "{}" is valid JSON but yields no items; the list is still NIL */
+		if (nitems == 0)
+			ereturn(parse_state.escontext, (Datum) 0,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("malformed pg_ndistinct: \"%s\"", str),
+					 errdetail("Must contain at least one ndistinct item.")));
+
+		ndistinct = palloc(offsetof(MVNDistinct, items) +
+						   nitems * sizeof(MVNDistinctItem));
+		ndistinct->magic = STATS_NDISTINCT_MAGIC;
+		ndistinct->type = STATS_NDISTINCT_TYPE_BASIC;
+		ndistinct->nitems = nitems;
+
+		for (int i = 0; i < nitems; i++)
+		{
+			MVNDistinctItem *item = list_nth(parse_state.distinct_items, i);
+
+			ndistinct->items[i].ndistinct = item->ndistinct;
+			ndistinct->items[i].nattributes = item->nattributes;
+			ndistinct->items[i].attributes = item->attributes;
+
+			/* free the item itself, but not the attributes still in use */
+			pfree(item);
+		}
+		bytes = statext_ndistinct_serialize(ndistinct);
+
+		list_free(parse_state.distinct_items);
+		free_pg_ndistinct(ndistinct);
+
+		PG_RETURN_BYTEA_P(bytes);
+	}
+	else if (result == JSON_SEM_ACTION_FAILED)
+		PG_RETURN_NULL();		/* escontext already set */
+
+	/* Anything else is a generic JSON parse error */
+	ereturn(parse_state.escontext, (Datum) 0,
+			(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+			 errmsg("malformed pg_ndistinct: \"%s\"", str),
+			 errdetail("Must be valid JSON.")));
+	PG_RETURN_NULL();
 }
 
+/*
+ * Release an MVNDistinct and the attribute array owned by each of its items.
+ */
+void
+free_pg_ndistinct(MVNDistinct *ndistinct)
+{
+	for (MVNDistinctItem *item = ndistinct->items;
+		 item < ndistinct->items + ndistinct->nitems; item++)
+		pfree(item->attributes);
+	pfree(ndistinct);
+}
+
+/*
+ * Check an MVNDistinct against the definition of its statistics object.
+ *
+ * Each attnum in every MVNDistinctItem has to refer to something the
+ * extended statistics object actually covers:
+ *   - a positive attnum must appear in stxkeys;
+ *   - a negative attnum denotes an expression, so it must not be below
+ *     (0 - numexprs).
+ * Anything else, zero included, is reported at elevel; if that returns,
+ * the result is false.  Returns true when every attnum checks out.
+ */
+bool
+pg_ndistinct_validate_items(MVNDistinct *ndistinct, int2vector *stxkeys, int numexprs, int elevel)
+{
+	int		lowest_expr_attnum = -numexprs;
+
+	for (int itemno = 0; itemno < ndistinct->nitems; itemno++)
+	{
+		MVNDistinctItem cur = ndistinct->items[itemno];
+
+		for (int attidx = 0; attidx < cur.nattributes; attidx++)
+		{
+			AttrNumber	attnum = cur.attributes[attidx];
+			bool		valid;
+
+			if (attnum > 0)
+			{
+				int			keyno = 0;
+
+				/* linear search of the statistics object's column list */
+				while (keyno < stxkeys->dim1 && stxkeys->values[keyno] != attnum)
+					keyno++;
+				valid = (keyno < stxkeys->dim1);
+			}
+			else
+				valid = (attnum < 0 && attnum >= lowest_expr_attnum);
+
+			if (valid)
+				continue;
+
+			ereport(elevel,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("pg_ndistinct: invalid attnum for this statistics object: %d", attnum)));
+			return false;
+		}
+	}
+	return true;
+}
+
+
 /*
  * pg_ndistinct
  *		output routine for type pg_ndistinct
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 904d3e623f5..20333667e5f 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -3358,6 +3358,13 @@ SELECT statistics_name, most_common_vals FROM pg_stats_ext_exprs x
  s_expr          | {1}
 (2 rows)
 
+-- new input functions
+SELECT '{"6, -1": 14, "6, -2": 9143, "-1, -2": 13454, "6, -1, -2": 14549}'::pg_ndistinct;
+                           pg_ndistinct                            
+-------------------------------------------------------------------
+ {"6, -1": 14, "6, -2": 9143, "-1, -2": 13454, "6, -1, -2": 14549}
+(1 row)
+
 -- Tidy up
 DROP OPERATOR <<< (int, int);
 DROP FUNCTION op_leak(int, int);
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 88b33ccaef8..3539d7b5cd2 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1700,6 +1700,9 @@ SELECT statistics_name, most_common_vals FROM pg_stats_ext x
 SELECT statistics_name, most_common_vals FROM pg_stats_ext_exprs x
     WHERE tablename = 'stats_ext_tbl' ORDER BY ROW(x.*);
 
+-- new input functions
+SELECT '{"6, -1": 14, "6, -2": 9143, "-1, -2": 13454, "6, -1, -2": 14549}'::pg_ndistinct;
+
 -- Tidy up
 DROP OPERATOR <<< (int, int);
 DROP FUNCTION op_leak(int, int);

base-commit: b229c10164770769c3b5033785917ca7a43a2471
-- 
2.48.1

