From a712705ca2cb2f37dc776a9bb44a98130fd0f1b7 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Thu, 2 May 2024 15:21:43 +0200
Subject: [PATCH v20240624 5/7] Collect and print compression stats

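Track the raw (uncompressed) and compressed sizes of the GIN tuples built
during a parallel index build, and log both sizes together with the
compression ratio. This allows evaluating the benefits of compressing the
TID lists.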
---
 src/backend/access/gin/gininsert.c | 42 +++++++++++++++++++++++-------
 src/include/access/gin.h           |  2 ++
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index 5b75e04d951..bb993dfdf80 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -190,7 +190,8 @@ static void _gin_parallel_scan_and_build(GinBuildState *buildstate,
 static ItemPointer _gin_parse_tuple_items(GinTuple *a);
 static Datum _gin_parse_tuple_key(GinTuple *a);
 
-static GinTuple *_gin_build_tuple(OffsetNumber attrnum, unsigned char category,
+static GinTuple *_gin_build_tuple(GinBuildState *state,
+								  OffsetNumber attrnum, unsigned char category,
 								  Datum key, int16 typlen, bool typbyval,
 								  ItemPointerData *items, uint32 nitems,
 								  Size *len);
@@ -553,7 +554,7 @@ ginBuildCallbackParallel(Relation index, ItemPointer tid, Datum *values,
 			/* there could be many entries, so be willing to abort here */
 			CHECK_FOR_INTERRUPTS();
 
-			tup = _gin_build_tuple(attnum, category,
+			tup = _gin_build_tuple(buildstate, attnum, category,
 								   key, attr->attlen, attr->attbyval,
 								   list, nlist, &tuplen);
 
@@ -1198,9 +1199,9 @@ AssertCheckGinBuffer(GinBuffer *buffer)
 	/*
 	 * we don't know if the TID array is expected to be sorted or not
 	 *
-	 * XXX maybe we can pass that to AssertCheckGinBuffer() call?
-	 * XXX actually with the mergesort in GinBufferStoreTuple, we
-	 * should not need 'false' here. See AssertCheckItemPointers.
+	 * XXX Maybe we can pass that to the AssertCheckGinBuffer() call?
+	 * XXX Actually, with the mergesort in GinBufferStoreTuple, we should
+	 * not need 'false' here. See AssertCheckItemPointers.
 	 */
 	AssertCheckItemPointers(buffer, false);
 #endif
@@ -1614,6 +1615,15 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort)
 	/* sort the raw per-worker data */
 	tuplesort_performsort(state->bs_worker_sort);
 
+	/* log raw vs. compressed tuple sizes accumulated before the second phase */
+	elog(LOG, "_gin_parallel_scan_and_build raw %zu compressed %zu ratio %.2f%%",
+		 state->buildStats.sizeRaw, state->buildStats.sizeCompressed,
+		 (100.0 * state->buildStats.sizeCompressed) / state->buildStats.sizeRaw);
+
+	/* reset before the second phase */
+	state->buildStats.sizeCompressed = 0;
+	state->buildStats.sizeRaw = 0;
+
 	/*
 	 * Read the GIN tuples from the shared tuplesort, sorted by the key, and
 	 * merge them into larger chunks for the leader to combine.
@@ -1640,7 +1650,7 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort)
 			 */
 			AssertCheckItemPointers(buffer, true);
 
-			ntup = _gin_build_tuple(buffer->attnum, buffer->category,
+			ntup = _gin_build_tuple(state, buffer->attnum, buffer->category,
 									buffer->key, buffer->typlen, buffer->typbyval,
 									buffer->items, buffer->nitems, &ntuplen);
 
@@ -1667,7 +1677,7 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort)
 
 		AssertCheckItemPointers(buffer, true);
 
-		ntup = _gin_build_tuple(buffer->attnum, buffer->category,
+		ntup = _gin_build_tuple(state, buffer->attnum, buffer->category,
 								buffer->key, buffer->typlen, buffer->typbyval,
 								buffer->items, buffer->nitems, &ntuplen);
 
@@ -1682,6 +1692,11 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort)
 	/* relase all the memory */
 	GinBufferFree(buffer);
 
+	/* log raw vs. compressed sizes of the tuples built in the second phase */
+	elog(LOG, "_gin_process_worker_data raw %zu compressed %zu ratio %.2f%%",
+		 state->buildStats.sizeRaw, state->buildStats.sizeCompressed,
+		 (100.0 * state->buildStats.sizeCompressed) / state->buildStats.sizeRaw);
+
 	tuplesort_end(worker_sort);
 }
 
@@ -1754,7 +1769,7 @@ _gin_parallel_scan_and_build(GinBuildState *state,
 			/* there could be many entries, so be willing to abort here */
 			CHECK_FOR_INTERRUPTS();
 
-			tup = _gin_build_tuple(attnum, category,
+			tup = _gin_build_tuple(state, attnum, category,
 								   key, attr->attlen, attr->attbyval,
 								   list, nlist, &len);
 
@@ -1848,6 +1863,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
 	/* initialize the GIN build state */
 	initGinState(&buildstate.ginstate, indexRel);
 	buildstate.indtuples = 0;
+	/* XXX Shouldn't this initialize the other fields too, like ginbuild()? */
 	memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
 
 	/*
@@ -1925,7 +1941,8 @@ typedef struct
  * of that into the GIN tuple.
  */
 static GinTuple *
-_gin_build_tuple(OffsetNumber attrnum, unsigned char category,
+_gin_build_tuple(GinBuildState *state,
+				 OffsetNumber attrnum, unsigned char category,
 				 Datum key, int16 typlen, bool typbyval,
 				 ItemPointerData *items, uint32 nitems,
 				 Size *len)
@@ -2059,6 +2076,13 @@ _gin_build_tuple(OffsetNumber attrnum, unsigned char category,
 		pfree(seginfo);
 	}
 
+	/* how large would the tuple be without compression? */
+	state->buildStats.sizeRaw += MAXALIGN(offsetof(GinTuple, data) + keylen) +
+		nitems * sizeof(ItemPointerData);
+
+	/* compressed size */
+	state->buildStats.sizeCompressed += tuplen;
+
 	return tuple;
 }
 
diff --git a/src/include/access/gin.h b/src/include/access/gin.h
index be76d8446f4..2b6633d068a 100644
--- a/src/include/access/gin.h
+++ b/src/include/access/gin.h
@@ -49,6 +49,8 @@ typedef struct GinStatsData
 	BlockNumber nDataPages;
 	int64		nEntries;
 	int32		ginVersion;
+	Size		sizeRaw;
+	Size		sizeCompressed;
 } GinStatsData;
 
 /*
-- 
2.45.2

