From 03c8334c4504afe60fcd2a7f1230aaa750900ec3 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Tue, 31 Dec 2019 18:49:41 -0600
Subject: [PATCH v9 2/8] explain: show tuplehash bucket and memory stats

Note that hashed SubPlan and RecursiveUnion nodes are not affected in EXPLAIN
output, probably because their hash tables are not yet allocated at that point.

Discussion: https://www.postgresql.org/message-id/flat/20200103161925.GM12066@telsasoft.com
---
 src/backend/commands/explain.c            | 173 +++++++++++++++++-----
 src/backend/executor/execGrouping.c       |  33 +++++
 src/backend/executor/nodeAgg.c            |  17 +--
 src/backend/executor/nodeRecursiveunion.c |   3 +
 src/backend/executor/nodeSetOp.c          |   1 +
 src/backend/executor/nodeSubplan.c        |   3 +
 src/include/executor/executor.h           |   1 +
 src/include/nodes/execnodes.h             |  11 +-
 8 files changed, 194 insertions(+), 48 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 455f54ef83..ecc0469d35 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -18,6 +18,7 @@
 #include "commands/createas.h"
 #include "commands/defrem.h"
 #include "commands/prepare.h"
+#include "executor/nodeAgg.h"
 #include "executor/nodeHash.h"
 #include "foreign/fdwapi.h"
 #include "jit/jit.h"
@@ -88,12 +89,14 @@ static void show_merge_append_keys(MergeAppendState *mstate, List *ancestors,
 								   ExplainState *es);
 static void show_agg_keys(AggState *astate, List *ancestors,
 						  ExplainState *es);
-static void show_grouping_sets(PlanState *planstate, Agg *agg,
+static void show_grouping_sets(AggState *aggstate, Agg *agg,
 							   List *ancestors, ExplainState *es);
-static void show_grouping_set_keys(PlanState *planstate,
+static void show_grouping_set_info(AggState *aggstate,
 								   Agg *aggnode, Sort *sortnode,
 								   List *context, bool useprefix,
-								   List *ancestors, ExplainState *es);
+								   List *ancestors,
+								   HashTableInstrumentation *inst,
+								   ExplainState *es);
 static void show_group_keys(GroupState *gstate, List *ancestors,
 							ExplainState *es);
 static void show_sort_group_keys(PlanState *planstate, const char *qlabel,
@@ -108,7 +111,8 @@ static void show_sort_info(SortState *sortstate, ExplainState *es);
 static void show_incremental_sort_info(IncrementalSortState *incrsortstate,
 									   ExplainState *es);
 static void show_hash_info(HashState *hashstate, ExplainState *es);
-static void show_hashagg_info(AggState *hashstate, ExplainState *es);
+static void show_tuplehash_info(HashTableInstrumentation *inst, AggState *as,
+		ExplainState *es);
 static void show_tidbitmap_info(BitmapHeapScanState *planstate,
 								ExplainState *es);
 static void show_instrumentation_count(const char *qlabel, int which,
@@ -1535,6 +1539,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
 					appendStringInfo(es->str, " %s", setopcmd);
 				else
 					ExplainPropertyText("Command", setopcmd, es);
+				/* XXX: should the strategy also be shown in text mode? */
 			}
 			break;
 		default:
@@ -1928,11 +1933,24 @@ ExplainNode(PlanState *planstate, List *ancestors,
 		case T_Agg:
 			show_agg_keys(castNode(AggState, planstate), ancestors, es);
 			show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
-			show_hashagg_info((AggState *) planstate, es);
 			if (plan->qual)
 				show_instrumentation_count("Rows Removed by Filter", 1,
 										   planstate, es);
 			break;
+		case T_SetOp:
+			{
+				SetOpState *sos = castNode(SetOpState, planstate);
+				if (sos->hashtable)
+					show_tuplehash_info(&sos->hashtable->instrument, NULL, es);
+			}
+			break;
+		case T_RecursiveUnion:
+			{
+				RecursiveUnionState *rus = (RecursiveUnionState *)planstate;
+				if (rus->hashtable)
+					show_tuplehash_info(&rus->hashtable->instrument, NULL, es);
+			}
+			break;
 		case T_Group:
 			show_group_keys(castNode(GroupState, planstate), ancestors, es);
 			show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
@@ -2337,24 +2355,31 @@ show_agg_keys(AggState *astate, List *ancestors,
 		ancestors = lcons(plan, ancestors);
 
 		if (plan->groupingSets)
-			show_grouping_sets(outerPlanState(astate), plan, ancestors, es);
+			show_grouping_sets(astate, plan, ancestors, es);
 		else
+		{
 			show_sort_group_keys(outerPlanState(astate), "Group Key",
 								 plan->numCols, 0, plan->grpColIdx,
 								 NULL, NULL, NULL,
 								 ancestors, es);
+			Assert(astate->num_hashes <= 1);
+			if (astate->num_hashes)
+				show_tuplehash_info(&astate->perhash[0].hashtable->instrument, astate, es);
+		}
 
 		ancestors = list_delete_first(ancestors);
 	}
 }
 
 static void
-show_grouping_sets(PlanState *planstate, Agg *agg,
+show_grouping_sets(AggState *aggstate, Agg *agg,
 				   List *ancestors, ExplainState *es)
 {
+	PlanState	*planstate = outerPlanState(aggstate);
 	List	   *context;
 	bool		useprefix;
 	ListCell   *lc;
+	int			setno = 0;
 
 	/* Set up deparsing context */
 	context = set_deparse_context_plan(es->deparse_cxt,
@@ -2364,27 +2389,41 @@ show_grouping_sets(PlanState *planstate, Agg *agg,
 
 	ExplainOpenGroup("Grouping Sets", "Grouping Sets", false, es);
 
-	show_grouping_set_keys(planstate, agg, NULL,
-						   context, useprefix, ancestors, es);
+	show_grouping_set_info(aggstate, agg, NULL, context, useprefix, ancestors,
+			aggstate->num_hashes ?
+			&aggstate->perhash[setno++].hashtable->instrument : NULL,
+			es);
 
 	foreach(lc, agg->chain)
 	{
 		Agg		   *aggnode = lfirst(lc);
 		Sort	   *sortnode = (Sort *) aggnode->plan.lefttree;
+		HashTableInstrumentation *inst = NULL;
 
-		show_grouping_set_keys(planstate, aggnode, sortnode,
-							   context, useprefix, ancestors, es);
+		if (aggnode->aggstrategy == AGG_HASHED ||
+				aggnode->aggstrategy == AGG_MIXED)
+		{
+			Assert(setno < aggstate->num_hashes);
+			inst = &aggstate->perhash[setno++].hashtable->instrument;
+		}
+
+		show_grouping_set_info(aggstate, aggnode, sortnode,
+							   context, useprefix, ancestors,
+							   inst, es);
 	}
 
 	ExplainCloseGroup("Grouping Sets", "Grouping Sets", false, es);
 }
 
+/* Show keys and any hash instrumentation for a grouping set */
 static void
-show_grouping_set_keys(PlanState *planstate,
+show_grouping_set_info(AggState *aggstate,
 					   Agg *aggnode, Sort *sortnode,
 					   List *context, bool useprefix,
-					   List *ancestors, ExplainState *es)
+					   List *ancestors, HashTableInstrumentation *inst,
+					   ExplainState *es)
 {
+	PlanState	*planstate = outerPlanState(aggstate);
 	Plan	   *plan = planstate->plan;
 	char	   *exprstr;
 	ListCell   *lc;
@@ -2448,6 +2487,10 @@ show_grouping_set_keys(PlanState *planstate,
 
 	ExplainCloseGroup(keysetname, keysetname, false, es);
 
+	if (aggnode->aggstrategy == AGG_HASHED ||
+			aggnode->aggstrategy == AGG_MIXED)
+		show_tuplehash_info(inst, NULL, es);
+
 	if (sortnode && es->format == EXPLAIN_FORMAT_TEXT)
 		es->indent--;
 
@@ -3059,37 +3102,78 @@ show_hash_info(HashState *hashstate, ExplainState *es)
 }
 
 /*
- * Show information on hash aggregate memory usage and batches.
+ * Show hash bucket stats and (optionally) memory.
  */
 static void
-show_hashagg_info(AggState *aggstate, ExplainState *es)
+show_tuplehash_info(HashTableInstrumentation *inst, AggState *aggstate, ExplainState *es)
 {
-	Agg		*agg	   = (Agg *)aggstate->ss.ps.plan;
-	int64	 memPeakKb = (aggstate->hash_mem_peak + 1023) / 1024;
-
-	Assert(IsA(aggstate, AggState));
-
-	if (agg->aggstrategy != AGG_HASHED &&
-		agg->aggstrategy != AGG_MIXED)
-		return;
+	int64	 spacePeakKb_tuples = (inst->space_peak_tuples + 1023) / 1024,
+		 spacePeakKb_hash = (inst->space_peak_hash + 1023) / 1024;
 
-	if (es->costs && aggstate->hash_planned_partitions > 0)
-	{
+	if (es->costs && aggstate!=NULL && aggstate->hash_planned_partitions > 0)
 		ExplainPropertyInteger("Planned Partitions", NULL,
 							   aggstate->hash_planned_partitions, es);
-	}
 
 	if (!es->analyze)
 		return;
 
-	/* EXPLAIN ANALYZE */
-	ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es);
-	if (aggstate->hash_batches_used > 0)
+	if (es->format != EXPLAIN_FORMAT_TEXT)
+	{
+		ExplainPropertyInteger("Hash Buckets", NULL,
+							   inst->nbuckets, es);
+		ExplainPropertyInteger("Original Hash Buckets", NULL,
+							   inst->nbuckets_original, es);
+		ExplainPropertyInteger("Peak Memory Usage (hashtable)", "kB",
+							   spacePeakKb_hash, es);
+		ExplainPropertyInteger("Peak Memory Usage (tuples)", "kB",
+							   spacePeakKb_tuples, es);
+		if (aggstate != NULL)
+		{
+			Agg		*agg = (Agg *)aggstate->ss.ps.plan;
+			if (agg->aggstrategy == AGG_HASHED ||
+					agg->aggstrategy == AGG_MIXED)
+			{
+				ExplainPropertyInteger("Disk Usage", "kB",
+									   aggstate->hash_disk_used, es);
+				ExplainPropertyInteger("HashAgg Batches", NULL,
+									   aggstate->hash_batches_used, es);
+			}
+		}
+	}
+	else if (!inst->nbuckets)
+		; /* Do nothing */
+	else
 	{
-		ExplainPropertyInteger("Disk Usage", "kB",
-							   aggstate->hash_disk_used, es);
-		ExplainPropertyInteger("HashAgg Batches", NULL,
-							   aggstate->hash_batches_used, es);
+		if (inst->nbuckets_original != inst->nbuckets)
+		{
+			ExplainIndentText(es);
+			appendStringInfo(es->str,
+						"Buckets: %lld (originally %lld)",
+						(long long)inst->nbuckets,
+						(long long)inst->nbuckets_original);
+		}
+		else
+		{
+			ExplainIndentText(es);
+			appendStringInfo(es->str,
+						"Buckets: %lld",
+						(long long)inst->nbuckets);
+		}
+
+		appendStringInfoChar(es->str, '\n');
+		ExplainIndentText(es);
+		appendStringInfo(es->str,
+				"Peak Memory Usage: hashtable: %lldkB, tuples: %lldkB",
+				(long long)spacePeakKb_hash, (long long)spacePeakKb_tuples);
+		appendStringInfoChar(es->str, '\n');
+
+		if (aggstate != NULL && aggstate->hash_batches_used > 0)
+		{
+			ExplainPropertyInteger("Disk Usage", "kB",
+								   aggstate->hash_disk_used, es);
+			ExplainPropertyInteger("HashAgg Batches", NULL,
+								   aggstate->hash_batches_used, es);
+		}
 	}
 }
 
@@ -3798,6 +3882,29 @@ ExplainSubPlans(List *plans, List *ancestors,
 
 		ExplainNode(sps->planstate, ancestors,
 					relationship, sp->plan_name, es);
+		if (sps->hashtable)
+		{
+			ExplainOpenGroup("Hashtable", "Hashtable", true, es);
+			if (es->format == EXPLAIN_FORMAT_TEXT)
+			{
+				ExplainIndentText(es);
+				appendStringInfoString(es->str, "Hashtable: ");
+			}
+			show_tuplehash_info(&sps->hashtable->instrument, NULL, es);
+			ExplainCloseGroup("Hashtable", "Hashtable", true, es);
+		}
+
+		if (sps->hashnulls)
+		{
+			ExplainOpenGroup("Null Hashtable", "Null Hashtable", true, es);
+			if (es->format == EXPLAIN_FORMAT_TEXT)
+			{
+				ExplainIndentText(es);
+				appendStringInfoString(es->str, "Null Hashtable: ");
+			}
+			show_tuplehash_info(&sps->hashnulls->instrument, NULL, es);
+			ExplainCloseGroup("Null Hashtable", "Null Hashtable", true, es);
+		}
 
 		ancestors = list_delete_first(ancestors);
 	}
diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index 009d27b9a8..10276d3f58 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -188,6 +188,7 @@ BuildTupleHashTableExt(PlanState *parent,
 	hashtable->inputslot = NULL;
 	hashtable->in_hash_funcs = NULL;
 	hashtable->cur_eq_func = NULL;
+	memset(&hashtable->instrument, 0, sizeof(hashtable->instrument));
 
 	/*
 	 * If parallelism is in use, even if the master backend is performing the
@@ -203,6 +204,7 @@ BuildTupleHashTableExt(PlanState *parent,
 		hashtable->hash_iv = 0;
 
 	hashtable->hashtab = tuplehash_create(metacxt, nbuckets, hashtable);
+	UpdateTupleHashTableStats(hashtable, true);
 
 	/*
 	 * We copy the input tuple descriptor just for safety --- we assume all
@@ -281,9 +283,40 @@ BuildTupleHashTable(PlanState *parent,
 void
 ResetTupleHashTable(TupleHashTable hashtable)
 {
+	UpdateTupleHashTableStats(hashtable, false);
 	tuplehash_reset(hashtable->hashtab);
 }
 
+/* Update instrumentation stats */
+void
+UpdateTupleHashTableStats(TupleHashTable hashtable, bool initial)
+{
+	hashtable->instrument.nbuckets = hashtable->hashtab->size;
+	if (initial)
+	{
+		hashtable->instrument.nbuckets_original = hashtable->hashtab->size;
+		/* Use the allocator-reported size of the hashtable's context
+		 * rather than estimating nbuckets * sizeof(TupleHashEntryData). */
+		hashtable->instrument.space_peak_hash =
+			MemoryContextMemAllocated(hashtable->hashtab->ctx, true);
+		hashtable->instrument.space_peak_tuples = 0;
+	}
+	else
+	{
+		/* hashtable->entrysize includes additionalsize */
+		size_t hash_size = MemoryContextMemAllocated(hashtable->hashtab->ctx, true);
+		size_t tuple_size = MemoryContextMemAllocated(hashtable->tablecxt, true);
+
+		hashtable->instrument.space_peak_hash = Max(
+			hashtable->instrument.space_peak_hash,
+			hash_size);
+
+		hashtable->instrument.space_peak_tuples = Max(
+			hashtable->instrument.space_peak_tuples, tuple_size);
+				/* (replaces the old members * entrysize estimate) */
+	}
+}
+
 /*
  * Find or create a hashtable entry for the tuple group containing the
  * given tuple.  The tuple must be the same type as the hashtable entries.
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 48b0274b2e..2d6783843a 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1867,36 +1867,25 @@ hash_agg_enter_spill_mode(AggState *aggstate)
 static void
 hash_agg_update_metrics(AggState *aggstate, bool from_tape, int npartitions)
 {
-	Size	meta_mem = 0;
 	Size	hash_mem = 0;
 	Size	buffer_mem;
-	Size	total_mem;
 
 	if (aggstate->aggstrategy != AGG_MIXED &&
 		aggstate->aggstrategy != AGG_HASHED)
 		return;
 
-
 	for (int i = 0; i < aggstate->num_hashes; ++i)
 	{
-		/* memory for the hash table itself */
-		meta_mem += MemoryContextMemAllocated(
-			aggstate->perhash[i].hash_metacxt, true);
-		/* memory for the group keys and transition states */
 		hash_mem += MemoryContextMemAllocated(
 			aggstate->perhash[i].hashcontext->ecxt_per_tuple_memory, true);
+		UpdateTupleHashTableStats(aggstate->perhash[i].hashtable, false);
 	}
 
-	/* memory for read/write tape buffers, if spilled */
+	/* memory for read/write tape buffers, if spilled (XXX: now unused here?) */
 	buffer_mem = npartitions * HASHAGG_WRITE_BUFFER_SIZE;
 	if (from_tape)
 		buffer_mem += HASHAGG_READ_BUFFER_SIZE;
 
-	/* update peak mem */
-	total_mem = meta_mem + hash_mem + buffer_mem;
-	if (total_mem > aggstate->hash_mem_peak)
-		aggstate->hash_mem_peak = total_mem;
-
 	/* update disk usage */
 	if (aggstate->hash_tapeinfo != NULL)
 	{
@@ -3269,7 +3258,7 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 	/*
 	 * Create expression contexts.  We need three or more, one for
 	 * per-input-tuple processing, one for per-output-tuple processing, one
-	 * for all the hashtables, and one for each grouping set.  The per-tuple
+	 * for each hashtable, and one for each grouping set.  The per-tuple
 	 * memory context of the per-grouping-set ExprContexts (aggcontexts)
 	 * replaces the standalone memory context formerly used to hold transition
 	 * values.  We cheat a little by using ExecAssignExprContext() to build
diff --git a/src/backend/executor/nodeRecursiveunion.c b/src/backend/executor/nodeRecursiveunion.c
index 620414a1ed..93272c28b1 100644
--- a/src/backend/executor/nodeRecursiveunion.c
+++ b/src/backend/executor/nodeRecursiveunion.c
@@ -156,6 +156,9 @@ ExecRecursiveUnion(PlanState *pstate)
 		return slot;
 	}
 
+	if (node->hashtable)
+		UpdateTupleHashTableStats(node->hashtable, false);
+
 	return NULL;
 }
 
diff --git a/src/backend/executor/nodeSetOp.c b/src/backend/executor/nodeSetOp.c
index bfd148a41a..9c0e0ab96e 100644
--- a/src/backend/executor/nodeSetOp.c
+++ b/src/backend/executor/nodeSetOp.c
@@ -415,6 +415,7 @@ setop_fill_hash_table(SetOpState *setopstate)
 
 	setopstate->table_filled = true;
 	/* Initialize to walk the hash table */
+	UpdateTupleHashTableStats(setopstate->hashtable, false);
 	ResetTupleHashIterator(setopstate->hashtable, &setopstate->hashiter);
 }
 
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c
index 298b7757f5..22c32612ba 100644
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -621,6 +621,9 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext)
 	ExecClearTuple(node->projRight->pi_state.resultslot);
 
 	MemoryContextSwitchTo(oldcontext);
+	UpdateTupleHashTableStats(node->hashtable, false);
+	if (node->hashnulls)
+		UpdateTupleHashTableStats(node->hashnulls, false);
 }
 
 /*
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index c7deeac662..f71cc03ad5 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -150,6 +150,7 @@ extern TupleHashEntry FindTupleHashEntry(TupleHashTable hashtable,
 										 ExprState *eqcomp,
 										 FmgrInfo *hashfunctions);
 extern void ResetTupleHashTable(TupleHashTable hashtable);
+extern void UpdateTupleHashTableStats(TupleHashTable hashtable, bool initial);
 
 /*
  * prototypes from functions in execJunk.c
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 3fc5989bf7..cdcd825c1e 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -693,6 +693,14 @@ typedef struct TupleHashEntryData
 #define SH_DECLARE
 #include "lib/simplehash.h"
 
+typedef struct HashTableInstrumentation
+{
+	size_t	nbuckets;				/* number of buckets at end of execution */
+	size_t	nbuckets_original;		/* planned number of buckets */
+	size_t	space_peak_hash;		/* peak memory usage in bytes */
+	size_t	space_peak_tuples;		/* peak memory usage in bytes */
+} HashTableInstrumentation;
+
 typedef struct TupleHashTableData
 {
 	tuplehash_hash *hashtab;	/* underlying hash table */
@@ -711,6 +719,7 @@ typedef struct TupleHashTableData
 	ExprState  *cur_eq_func;	/* comparator for input vs. table */
 	uint32		hash_iv;		/* hash-function IV */
 	ExprContext *exprcontext;	/* expression context */
+	HashTableInstrumentation instrument;
 }			TupleHashTableData;
 
 typedef tuplehash_iterator TupleHashIterator;
@@ -2173,9 +2182,9 @@ typedef struct AggState
 	int			hash_planned_partitions; /* number of partitions planned
 											for first pass */
 	double		hashentrysize;	/* estimate revised during execution */
-	Size		hash_mem_peak;	/* peak hash table memory usage */
 	uint64		hash_ngroups_current;	/* number of groups currently in
 										   memory in all hash tables */
+	/* XXX: move these fields into HashTableInstrumentation? */
 	uint64		hash_disk_used; /* kB of disk space used */
 	int			hash_batches_used;	/* batches used during entire execution */
 
-- 
2.17.0

