From 20d56eb519d5785160b333bbbd1bb9d909529a5c Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Sun, 5 Jan 2025 21:24:23 +0100
Subject: [PATCH vadjust-size 1/2] hashjoin sizing balance

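The number of batches was chosen only to keep the hash table within the
memory limit, ignoring the memory used by the batch files themselves
(two files per batch, each with a BLCKSZ buffer). With many batches the
file buffers can use far more memory than the hash table. So after
computing nbatch, keep halving it (and doubling the hash table allowance
and the bucket count) while that reduces the total memory used by the
hash table plus the file buffers. ExecHashIncreaseNumBatches applies a
similar adjustment at execution time.

To make it safe to grow the number of buckets independently of the
number of batches, the batch number is now computed from the
bit-reversed hash value instead of the bits just above the bucket bits.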
---
 src/backend/executor/nodeHash.c | 113 +++++++++++++++++++++++++++++++-
 1 file changed, 111 insertions(+), 2 deletions(-)

diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 6f8a379e3b9..271ffeb2228 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -848,6 +848,37 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 		nbatch = pg_nextpower2_32(Max(2, minbatch));
 	}
 
+	/*
+	 * Optimize the total amount of memory consumed by the hash join, i.e.
+	 * the hash table plus the per-batch file buffers. With many batches the
+	 * file buffers can dominate, and it may be more efficient to use fewer,
+	 * larger batches.
+	 *
+	 * Halving nbatch doubles the hash table allowance but halves the memory
+	 * used by batch files (each batch needs two files, with a BLCKSZ buffer
+	 * each). So keep halving nbatch while that reduces the total, and adjust
+	 * the bucket count accordingly. We only move in the direction of fewer
+	 * batches, because that is the part of the parameter space with the
+	 * worst memory consumption issues.
+	 *
+	 * We might try the other direction (more, smaller batches) too, but
+	 * only while the memory consumption exceeds the work_mem value. It can
+	 * be at most 2x worse than the ideal allowance, and a single step
+	 * should get us to that boundary.
+	 */
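+	/*
+	 * For example, with hash_table_bytes = 4MB, nbatch = 1024 and the
+	 * default BLCKSZ of 8kB, the current total is 4MB + 2 * 1024 * 8kB =
+	 * 20MB, while halving the batches needs 2 * 4MB + 1024 * 8kB = 16MB,
+	 * so fewer batches win. On the next step 8MB + 8MB = 16MB beats
+	 * 16MB + 4MB = 20MB, so we stop at nbatch = 512.
+	 */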
+	while (nbatch > 0)
+	{
+		/* how much memory we use with the current number of batches */
+		size_t	current_space = hash_table_bytes + (2 * nbatch * BLCKSZ);
+
+		/* how much memory we would use with half the batches */
+		size_t	new_space = hash_table_bytes * 2 + (nbatch * BLCKSZ);
+
+		/* if halving the batches would not reduce the total, we're done */
+		if (current_space < new_space)
+			break;
+
+		/* better to use fewer, larger batches - double the allowance */
+		*space_allowed = *space_allowed * 2;
+
+		/* keep the local value in sync for the next iteration */
+		hash_table_bytes *= 2;
+
+		nbatch /= 2;
+		nbuckets *= 2;
+	}
+
 	Assert(nbuckets > 0);
 	Assert(nbatch > 0);
 
@@ -943,6 +974,29 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 
 	hashtable->nbatch = nbatch;
 
+	/*
+	 * Consider adjusting the allowed batch size, depending on the number of
+	 * batches, to minimize the overall amount of memory consumed - by both
+	 * the hash table and the batch files.
+	 *
+	 * If we increase the batch size, we also adjust the optimal number of
+	 * buckets. With the old batchno calculation the hash value was split
+	 * like [...|batch|bucket], so adding bits to the bucket part would
+	 * change the batchno in a way that breaks spilling (the batchno could
+	 * move "backwards", which we assume cannot happen). With the batchno
+	 * now taken from the reversed hash value, the bucket and batch bits no
+	 * longer overlap, so growing the bucket count is safe.
+	 */
+	{
+		/* memory needed by the batch file buffers (two files per batch) */
+		size_t	batchSpace = (nbatch * 2 * BLCKSZ);
+
+		/*
+		 * Grow the allowance (and the optimal number of buckets) until the
+		 * hash table is allowed at least as much memory as the file buffers.
+		 */
+		while (hashtable->spaceAllowed < batchSpace)
+		{
+			hashtable->spaceAllowed *= 2;
+
+			hashtable->nbuckets_optimal *= 2;
+			hashtable->log2_nbuckets_optimal++;
+		}
+	}
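+	/*
+	 * For example, if nbatch has grown to 2048 with the default 8kB BLCKSZ,
+	 * the batch file buffers alone need 2 * 2048 * 8kB = 32MB, so the loop
+	 * above doubles spaceAllowed (and the optimal bucket count) until it
+	 * reaches at least that.
+	 */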
+
 	/*
 	 * Scan through the existing hash table entries and dump out any that are
 	 * no longer of the current batch.
@@ -1793,6 +1847,62 @@ ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
 		heap_free_minimal_tuple(tuple);
 }
 
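+/*
+ * reverse_byte
+ *		Reverse the order of bits in a single byte, using a lookup table.
+ */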
+static inline unsigned char
+reverse_byte(unsigned char x)
+{
+	static const unsigned char table[] = {
+		0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
+		0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
+		0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
+		0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
+		0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
+		0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
+		0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
+		0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
+		0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
+		0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
+		0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
+		0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
+		0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
+		0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
+		0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
+		0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
+		0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
+		0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
+		0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
+		0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
+		0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
+		0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
+		0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
+		0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
+		0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
+		0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
+		0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
+		0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
+		0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
+		0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
+		0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
+		0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
+	};
+
+	return table[x];
+}
+
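+/*
+ * reverse_uint32
+ *		Reverse the order of bits in a 32-bit value, by swapping the bytes
+ *		and reversing the bits within each byte.
+ */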
+static inline uint32
+reverse_uint32(uint32 x)
+{
+	uint32	ret;
+
+	unsigned char *src = (unsigned char *) &x;
+	unsigned char *dst = (unsigned char *) &ret;
+
+	dst[0] = reverse_byte(src[3]);
+	dst[1] = reverse_byte(src[2]);
+	dst[2] = reverse_byte(src[1]);
+	dst[3] = reverse_byte(src[0]);
+
+	return ret;
+}
 
 /*
  * ExecHashGetBucketAndBatch
@@ -1833,8 +1943,7 @@ ExecHashGetBucketAndBatch(HashJoinTable hashtable,
 	if (nbatch > 1)
 	{
 		*bucketno = hashvalue & (nbuckets - 1);
-		*batchno = pg_rotate_right32(hashvalue,
-									 hashtable->log2_nbuckets) & (nbatch - 1);
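+		/*
+		 * The batchno comes from the high end of the hash value (bits
+		 * reversed), the bucketno from the low end. Growing nbuckets thus
+		 * does not affect batchno, and doubling nbatch can only set a
+		 * higher bit, so a tuple never moves to a lower-numbered batch.
+		 */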
+		*batchno = reverse_uint32(hashvalue) & (nbatch - 1);
 	}
 	else
 	{
-- 
2.47.1

