optimize hashjoin

Started by bucooover 1 year ago10 messages

bucoo

bucoo@sohu.com

over 1 year ago

1 attachment(s)

Avoid unnecessary form and deform tuple.

In the TPCH test, HashJoin speed up to ~2x.

Attachments:

optimize-hashjoin.patchtext/plain; name=optimize-hashjoin.patchDownload

diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 61480733a1..2dad0c8a55 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -1627,6 +1627,23 @@ ExecHashTableInsert(HashJoinTable hashtable,
 {
 	bool		shouldFree;
 	MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+
+	ExecHashTableInsertTuple(hashtable, tuple, hashvalue);
+
+	if (shouldFree)
+		heap_free_minimal_tuple(tuple);
+}
+
+/*
+ * ExecHashTableInsert
+ *		insert a tuple into the hash table depending on the hash value
+ *		it may just go to a temp file for later batches
+ */
+void
+ExecHashTableInsertTuple(HashJoinTable hashtable,
+						 MinimalTuple tuple,
+						 uint32 hashvalue)
+{
 	int			bucketno;
 	int			batchno;
 
@@ -1701,9 +1718,6 @@ ExecHashTableInsert(HashJoinTable hashtable,
 							  &hashtable->innerBatchFile[batchno],
 							  hashtable);
 	}
-
-	if (shouldFree)
-		heap_free_minimal_tuple(tuple);
 }
 
 /*
@@ -1777,12 +1791,10 @@ retry:
  * tuples that belong in the current batch once growth has been disabled.
  */
 void
-ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
-										TupleTableSlot *slot,
-										uint32 hashvalue)
+ExecParallelHashTableInsertCurrentBatchTuple(HashJoinTable hashtable,
+											 MinimalTuple tuple,
+											 uint32 hashvalue)
 {
-	bool		shouldFree;
-	MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
 	HashJoinTuple hashTuple;
 	dsa_pointer shared;
 	int			batchno;
@@ -1798,6 +1810,21 @@ ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
 	HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
 	ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
 							  hashTuple, shared);
+}
+
+/*
+ * like ExecParallelHashTableInsertCurrentBatchTuple,
+ * but this function accept a TupleTableSlot
+ */
+void
+ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
+										TupleTableSlot *slot,
+										uint32 hashvalue)
+{
+	bool		shouldFree;
+	MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+
+	ExecParallelHashTableInsertCurrentBatchTuple(hashtable, tuple, hashvalue);
 
 	if (shouldFree)
 		heap_free_minimal_tuple(tuple);
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index 5f4073eabd..002098f129 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -194,10 +194,10 @@ static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
 static TupleTableSlot *ExecParallelHashJoinOuterGetTuple(PlanState *outerNode,
 														 HashJoinState *hjstate,
 														 uint32 *hashvalue);
-static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
-												 BufFile *file,
-												 uint32 *hashvalue,
-												 TupleTableSlot *tupleSlot);
+static MinimalTuple ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
+											  BufFile *file,
+											  uint32 *hashvalue,
+											  StringInfo buf);
 static bool ExecHashJoinNewBatch(HashJoinState *hjstate);
 static bool ExecParallelHashJoinNewBatch(HashJoinState *hjstate);
 static void ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate);
@@ -831,6 +831,7 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
 	 */
 	hjstate->hj_HashTable = NULL;
 	hjstate->hj_FirstOuterTupleSlot = NULL;
+	hjstate->hj_outerTupleBuffer = NULL;
 
 	hjstate->hj_CurHashValue = 0;
 	hjstate->hj_CurBucketNo = 0;
@@ -936,6 +937,7 @@ ExecHashJoinOuterGetTuple(PlanState *outerNode,
 	}
 	else if (curbatch < hashtable->nbatch)
 	{
+		MinimalTuple mtup;
 		BufFile    *file = hashtable->outerBatchFile[curbatch];
 
 		/*
@@ -945,12 +947,23 @@ ExecHashJoinOuterGetTuple(PlanState *outerNode,
 		if (file == NULL)
 			return NULL;
 
-		slot = ExecHashJoinGetSavedTuple(hjstate,
+		if (unlikely(hjstate->hj_outerTupleBuffer == NULL))
+		{
+			MemoryContext oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(hjstate));
+			hjstate->hj_outerTupleBuffer = makeStringInfo();
+			MemoryContextSwitchTo(oldcontext);
+		}
+
+		mtup = ExecHashJoinGetSavedTuple(hjstate,
 										 file,
 										 hashvalue,
-										 hjstate->hj_OuterTupleSlot);
-		if (!TupIsNull(slot))
+										 hjstate->hj_outerTupleBuffer);
+		if (likely(mtup != NULL))
+		{
+			slot = hjstate->hj_OuterTupleSlot;
+			ExecForceStoreMinimalTuple(mtup, slot, false);
 			return slot;
+		}
 	}
 
 	/* End of this batch */
@@ -1034,7 +1047,6 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
 	int			nbatch;
 	int			curbatch;
 	BufFile    *innerFile;
-	TupleTableSlot *slot;
 	uint32		hashvalue;
 
 	nbatch = hashtable->nbatch;
@@ -1125,21 +1137,25 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
 
 	if (innerFile != NULL)
 	{
+		StringInfoData buf;
+		MinimalTuple tuple;
+
 		if (BufFileSeek(innerFile, 0, 0, SEEK_SET))
 			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not rewind hash-join temporary file")));
 
-		while ((slot = ExecHashJoinGetSavedTuple(hjstate,
-												 innerFile,
-												 &hashvalue,
-												 hjstate->hj_HashTupleSlot)))
+		initStringInfo(&buf);
+		while ((tuple = ExecHashJoinGetSavedTuple(hjstate,
+												  innerFile,
+												  &hashvalue,
+												  &buf)))
 		{
 			/*
 			 * NOTE: some tuples may be sent to future batches.  Also, it is
 			 * possible for hashtable->nbatch to be increased here!
 			 */
-			ExecHashTableInsert(hashtable, slot, hashvalue);
+			ExecHashTableInsertTuple(hashtable, tuple, hashvalue);
 		}
 
 		/*
@@ -1148,6 +1164,7 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
 		 */
 		BufFileClose(innerFile);
 		hashtable->innerBatchFile[curbatch] = NULL;
+		pfree(buf.data);
 	}
 
 	/*
@@ -1198,7 +1215,6 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
 	{
 		uint32		hashvalue;
 		MinimalTuple tuple;
-		TupleTableSlot *slot;
 
 		if (!hashtable->batches[batchno].done)
 		{
@@ -1230,12 +1246,9 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
 					while ((tuple = sts_parallel_scan_next(inner_tuples,
 														   &hashvalue)))
 					{
-						ExecForceStoreMinimalTuple(tuple,
-												   hjstate->hj_HashTupleSlot,
-												   false);
-						slot = hjstate->hj_HashTupleSlot;
-						ExecParallelHashTableInsertCurrentBatch(hashtable, slot,
-																hashvalue);
+						ExecParallelHashTableInsertCurrentBatchTuple(hashtable,
+																	 tuple,
+																	 hashvalue);
 					}
 					sts_end_parallel_scan(inner_tuples);
 					BarrierArriveAndWait(batch_barrier,
@@ -1349,14 +1362,14 @@ ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
  * ExecHashJoinGetSavedTuple
  *		read the next tuple from a batch file.  Return NULL if no more.
  *
- * On success, *hashvalue is set to the tuple's hash value, and the tuple
- * itself is stored in the given slot.
+ * On success, *hashvalue is set to the tuple's hash value, and return
+ * the tuple(stored in the given buf) itself.
  */
-static TupleTableSlot *
+static MinimalTuple
 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
 						  BufFile *file,
 						  uint32 *hashvalue,
-						  TupleTableSlot *tupleSlot)
+						  StringInfo buf)
 {
 	uint32		header[2];
 	size_t		nread;
@@ -1375,19 +1388,19 @@ ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
 	 * cheating.
 	 */
 	nread = BufFileReadMaybeEOF(file, header, sizeof(header), true);
-	if (nread == 0)				/* end of file */
-	{
-		ExecClearTuple(tupleSlot);
+	if (unlikely(nread == 0))				/* end of file */
 		return NULL;
-	}
+
+	enlargeStringInfo(buf, header[1]);
 	*hashvalue = header[0];
-	tuple = (MinimalTuple) palloc(header[1]);
+	buf->len = header[1];
+	tuple = (MinimalTuple) buf->data;
 	tuple->t_len = header[1];
 	BufFileReadExact(file,
 					 (char *) tuple + sizeof(uint32),
 					 header[1] - sizeof(uint32));
-	ExecForceStoreMinimalTuple(tuple, tupleSlot, true);
-	return tupleSlot;
+
+	return tuple;
 }
 
 
diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h
index a95911c2fe..543f64cdf7 100644
--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -37,12 +37,18 @@ extern void ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable,
 extern void ExecHashTableInsert(HashJoinTable hashtable,
 								TupleTableSlot *slot,
 								uint32 hashvalue);
+extern void ExecHashTableInsertTuple(HashJoinTable hashtable,
+									 MinimalTuple tuple,
+									 uint32 hashvalue);
 extern void ExecParallelHashTableInsert(HashJoinTable hashtable,
 										TupleTableSlot *slot,
 										uint32 hashvalue);
 extern void ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
 													TupleTableSlot *slot,
 													uint32 hashvalue);
+extern void ExecParallelHashTableInsertCurrentBatchTuple(HashJoinTable hashtable,
+														 MinimalTuple tuple,
+														 uint32 hashvalue);
 extern bool ExecHashGetHashValue(HashJoinTable hashtable,
 								 ExprContext *econtext,
 								 List *hashkeys,
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index b62c96f206..e5e3a0a155 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2200,6 +2200,7 @@ typedef struct HashJoinState
 	TupleTableSlot *hj_NullOuterTupleSlot;
 	TupleTableSlot *hj_NullInnerTupleSlot;
 	TupleTableSlot *hj_FirstOuterTupleSlot;
+	StringInfo	hj_outerTupleBuffer;
 	int			hj_JoinState;
 	bool		hj_MatchedOuter;
 	bool		hj_OuterNotEmpty;

Tomas Vondra

tomas@vondra.me

over 1 year ago

In reply to: bucoo (#1)

1 attachment(s)

Re: optimize hashjoin

Hi,

On 8/21/24 03:49, bucoo wrote:

Avoid unnecessary form and deform tuple.

Thanks for the patch. Generally speaking, it's usually a good idea to
briefly explain what the patch does, why is that beneficial, in what
circumstances, etc. It's sometimes not quite obvious from the patch
itself (even if the patch is simple). Plus it really helps new
contributors who want to review the patch, but even for experienced
people it's a huge time saver ...

Anyway, I took a look and the basic idea is simple - when shuffling
tuples between batches in a hash join, we're currently deforming the
tuple (->slot) we just read from a batch, only to immediately form it
(slot->) again and write it to the "correct" batch.

I think the idea to skip this unnecessary deform/form step is sound, and
I don't see any particular argument against doing that. The code looks
reasonable too, I think.

A couple minor comments:

0) The patch does not apply anymore, thanks to David committing a patch
yesterday. Attached is a patch rebased on top of current master.

1) Wouldn't it be easier (and just as efficient) to use slots with
TTSOpsMinimalTuple, i.e. backed by a minimal tuple?

2) I find the naming of the new functions a bit confusing. We now have
the "original" functions working with slots, and then also functions
with "Tuple" working with tuples. Seems asymmetric.

3) The code in ExecHashJoinOuterGetTuple is hard to understand, it'd
very much benefit from some comments. I'm a bit unsure if the likely()
and unlikely() hints really help.

4) Is the hj_outerTupleBuffer buffer really needed / effective? I'd bet
just using palloc() will work just as well, thanks to the AllocSet
caching etc.

5) You might want to add the patch to the 2024-09 CF, to keep track of
it: https://commitfest.postgresql.org/49/

In the TPCH test, HashJoin speed up to ~2x.

Can you provide more information about the benchmark you did? What
hardware, what scale, PostgreSQL configuration, which of the 22 queries
are improved, etc.

I ran TPC-H with 1GB and 10GB scales on two machines, and I see pretty
much no difference compared to master. However, it occurred to me the
patch only ever helps if we increase the number of batches during
execution, in which case we need to move tuples to the right batch.

Perhaps that's not happening in my testing, but it was happening in your
runs? Which just means we really need to know more about how you did the
testing.

regards

--
Tomas Vondra

Attachments:

optimize-hashjoin-rebased-tomas.patchtext/x-patch; charset=UTF-8; name=optimize-hashjoin-rebased-tomas.patchDownload

diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 570a90ebe15..6f5cd16aa2b 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -1611,6 +1611,23 @@ ExecHashTableInsert(HashJoinTable hashtable,
 {
 	bool		shouldFree;
 	MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+
+	ExecHashTableInsertTuple(hashtable, tuple, hashvalue);
+
+	if (shouldFree)
+		heap_free_minimal_tuple(tuple);
+}
+
+/*
+ * ExecHashTableInsert
+ *		insert a tuple into the hash table depending on the hash value
+ *		it may just go to a temp file for later batches
+ */
+void
+ExecHashTableInsertTuple(HashJoinTable hashtable,
+						 MinimalTuple tuple,
+						 uint32 hashvalue)
+{
 	int			bucketno;
 	int			batchno;
 
@@ -1685,9 +1702,6 @@ ExecHashTableInsert(HashJoinTable hashtable,
 							  &hashtable->innerBatchFile[batchno],
 							  hashtable);
 	}
-
-	if (shouldFree)
-		heap_free_minimal_tuple(tuple);
 }
 
 /*
@@ -1761,12 +1775,10 @@ retry:
  * tuples that belong in the current batch once growth has been disabled.
  */
 void
-ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
-										TupleTableSlot *slot,
-										uint32 hashvalue)
+ExecParallelHashTableInsertCurrentBatchTuple(HashJoinTable hashtable,
+											 MinimalTuple tuple,
+											 uint32 hashvalue)
 {
-	bool		shouldFree;
-	MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
 	HashJoinTuple hashTuple;
 	dsa_pointer shared;
 	int			batchno;
@@ -1782,6 +1794,21 @@ ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
 	HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
 	ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
 							  hashTuple, shared);
+}
+
+/*
+ * like ExecParallelHashTableInsertCurrentBatchTuple,
+ * but this function accept a TupleTableSlot
+ */
+void
+ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
+										TupleTableSlot *slot,
+										uint32 hashvalue)
+{
+	bool		shouldFree;
+	MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+
+	ExecParallelHashTableInsertCurrentBatchTuple(hashtable, tuple, hashvalue);
 
 	if (shouldFree)
 		heap_free_minimal_tuple(tuple);
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index 2f7170604d6..d52e4e14366 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -195,10 +195,10 @@ static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
 static TupleTableSlot *ExecParallelHashJoinOuterGetTuple(PlanState *outerNode,
 														 HashJoinState *hjstate,
 														 uint32 *hashvalue);
-static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
-												 BufFile *file,
-												 uint32 *hashvalue,
-												 TupleTableSlot *tupleSlot);
+static MinimalTuple ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
+											  BufFile *file,
+											  uint32 *hashvalue,
+											  StringInfo buf);
 static bool ExecHashJoinNewBatch(HashJoinState *hjstate);
 static bool ExecParallelHashJoinNewBatch(HashJoinState *hjstate);
 static void ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate);
@@ -925,6 +925,7 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
 	 */
 	hjstate->hj_HashTable = NULL;
 	hjstate->hj_FirstOuterTupleSlot = NULL;
+	hjstate->hj_outerTupleBuffer = NULL;
 
 	hjstate->hj_CurHashValue = 0;
 	hjstate->hj_CurBucketNo = 0;
@@ -1030,6 +1031,7 @@ ExecHashJoinOuterGetTuple(PlanState *outerNode,
 	}
 	else if (curbatch < hashtable->nbatch)
 	{
+		MinimalTuple mtup;
 		BufFile    *file = hashtable->outerBatchFile[curbatch];
 
 		/*
@@ -1039,12 +1041,23 @@ ExecHashJoinOuterGetTuple(PlanState *outerNode,
 		if (file == NULL)
 			return NULL;
 
-		slot = ExecHashJoinGetSavedTuple(hjstate,
+		if (unlikely(hjstate->hj_outerTupleBuffer == NULL))
+		{
+			MemoryContext oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(hjstate));
+			hjstate->hj_outerTupleBuffer = makeStringInfo();
+			MemoryContextSwitchTo(oldcontext);
+		}
+
+		mtup = ExecHashJoinGetSavedTuple(hjstate,
 										 file,
 										 hashvalue,
-										 hjstate->hj_OuterTupleSlot);
-		if (!TupIsNull(slot))
+										 hjstate->hj_outerTupleBuffer);
+		if (likely(mtup != NULL))
+		{
+			slot = hjstate->hj_OuterTupleSlot;
+			ExecForceStoreMinimalTuple(mtup, slot, false);
 			return slot;
+		}
 	}
 
 	/* End of this batch */
@@ -1133,7 +1146,6 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
 	int			nbatch;
 	int			curbatch;
 	BufFile    *innerFile;
-	TupleTableSlot *slot;
 	uint32		hashvalue;
 
 	nbatch = hashtable->nbatch;
@@ -1224,21 +1236,25 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
 
 	if (innerFile != NULL)
 	{
+		StringInfoData buf;
+		MinimalTuple tuple;
+
 		if (BufFileSeek(innerFile, 0, 0, SEEK_SET))
 			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not rewind hash-join temporary file")));
 
-		while ((slot = ExecHashJoinGetSavedTuple(hjstate,
-												 innerFile,
-												 &hashvalue,
-												 hjstate->hj_HashTupleSlot)))
+		initStringInfo(&buf);
+		while ((tuple = ExecHashJoinGetSavedTuple(hjstate,
+												  innerFile,
+												  &hashvalue,
+												  &buf)))
 		{
 			/*
 			 * NOTE: some tuples may be sent to future batches.  Also, it is
 			 * possible for hashtable->nbatch to be increased here!
 			 */
-			ExecHashTableInsert(hashtable, slot, hashvalue);
+			ExecHashTableInsertTuple(hashtable, tuple, hashvalue);
 		}
 
 		/*
@@ -1247,6 +1263,7 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
 		 */
 		BufFileClose(innerFile);
 		hashtable->innerBatchFile[curbatch] = NULL;
+		pfree(buf.data);
 	}
 
 	/*
@@ -1297,7 +1314,6 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
 	{
 		uint32		hashvalue;
 		MinimalTuple tuple;
-		TupleTableSlot *slot;
 
 		if (!hashtable->batches[batchno].done)
 		{
@@ -1329,12 +1345,9 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
 					while ((tuple = sts_parallel_scan_next(inner_tuples,
 														   &hashvalue)))
 					{
-						ExecForceStoreMinimalTuple(tuple,
-												   hjstate->hj_HashTupleSlot,
-												   false);
-						slot = hjstate->hj_HashTupleSlot;
-						ExecParallelHashTableInsertCurrentBatch(hashtable, slot,
-																hashvalue);
+						ExecParallelHashTableInsertCurrentBatchTuple(hashtable,
+																	 tuple,
+																	 hashvalue);
 					}
 					sts_end_parallel_scan(inner_tuples);
 					BarrierArriveAndWait(batch_barrier,
@@ -1448,14 +1461,14 @@ ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
  * ExecHashJoinGetSavedTuple
  *		read the next tuple from a batch file.  Return NULL if no more.
  *
- * On success, *hashvalue is set to the tuple's hash value, and the tuple
- * itself is stored in the given slot.
+ * On success, *hashvalue is set to the tuple's hash value, and return
+ * the tuple(stored in the given buf) itself.
  */
-static TupleTableSlot *
+static MinimalTuple
 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
 						  BufFile *file,
 						  uint32 *hashvalue,
-						  TupleTableSlot *tupleSlot)
+						  StringInfo buf)
 {
 	uint32		header[2];
 	size_t		nread;
@@ -1474,19 +1487,19 @@ ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
 	 * cheating.
 	 */
 	nread = BufFileReadMaybeEOF(file, header, sizeof(header), true);
-	if (nread == 0)				/* end of file */
-	{
-		ExecClearTuple(tupleSlot);
+	if (unlikely(nread == 0))				/* end of file */
 		return NULL;
-	}
+
+	enlargeStringInfo(buf, header[1]);
 	*hashvalue = header[0];
-	tuple = (MinimalTuple) palloc(header[1]);
+	buf->len = header[1];
+	tuple = (MinimalTuple) buf->data;
 	tuple->t_len = header[1];
 	BufFileReadExact(file,
 					 (char *) tuple + sizeof(uint32),
 					 header[1] - sizeof(uint32));
-	ExecForceStoreMinimalTuple(tuple, tupleSlot, true);
-	return tupleSlot;
+
+	return tuple;
 }
 
 
diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h
index e4eb7bc6359..37585035ade 100644
--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -36,12 +36,18 @@ extern void ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable,
 extern void ExecHashTableInsert(HashJoinTable hashtable,
 								TupleTableSlot *slot,
 								uint32 hashvalue);
+extern void ExecHashTableInsertTuple(HashJoinTable hashtable,
+									 MinimalTuple tuple,
+									 uint32 hashvalue);
 extern void ExecParallelHashTableInsert(HashJoinTable hashtable,
 										TupleTableSlot *slot,
 										uint32 hashvalue);
 extern void ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
 													TupleTableSlot *slot,
 													uint32 hashvalue);
+extern void ExecParallelHashTableInsertCurrentBatchTuple(HashJoinTable hashtable,
+														 MinimalTuple tuple,
+														 uint32 hashvalue);
 extern void ExecHashGetBucketAndBatch(HashJoinTable hashtable,
 									  uint32 hashvalue,
 									  int *bucketno,
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index af7d8fd1e72..299f66af135 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2225,6 +2225,7 @@ typedef struct HashJoinState
 	TupleTableSlot *hj_NullOuterTupleSlot;
 	TupleTableSlot *hj_NullInnerTupleSlot;
 	TupleTableSlot *hj_FirstOuterTupleSlot;
+	StringInfo	hj_outerTupleBuffer;
 	int			hj_JoinState;
 	bool		hj_MatchedOuter;
 	bool		hj_OuterNotEmpty;
diff --git a/src/test/modules/unsafe_tests/Makefile b/src/test/modules/unsafe_tests/Makefile
index 90d19791871..a868a6ff34f 100644
--- a/src/test/modules/unsafe_tests/Makefile
+++ b/src/test/modules/unsafe_tests/Makefile
@@ -1,6 +1,6 @@
 # src/test/modules/unsafe_tests/Makefile
 
-REGRESS = rolenames alter_system_table guc_privs
+REGRESS = alter_system_table guc_privs
 
 # the whole point of these tests is to not run installcheck
 NO_INSTALLCHECK = 1

Robert Haas

robertmhaas@gmail.com

over 1 year ago

In reply to: Tomas Vondra (#2)

Re: optimize hashjoin

On Wed, Aug 21, 2024 at 12:31 PM Tomas Vondra <tomas@vondra.me> wrote:

Anyway, I took a look and the basic idea is simple - when shuffling
tuples between batches in a hash join, we're currently deforming the
tuple (->slot) we just read from a batch, only to immediately form it
(slot->) again and write it to the "correct" batch.

Does skipping this cause any problem if some attributes are toasted?

I suppose not, just something to think about.

--
Robert Haas
EDB: http://www.enterprisedb.com

Tomas Vondra

tomas@vondra.me

over 1 year ago

In reply to: Robert Haas (#3)

Re: optimize hashjoin

<div dir='auto'><div><div><div class="elided-text">On Aug 21, 2024 19:17, Robert Haas <robertmhaas@gmail.com> wrote:<br type="attribution"><blockquote style="margin:0 0 0 0.8ex;border-left:1px #ccc solid;padding-left:1ex"><p dir="ltr">On Wed, Aug 21, 2024 at 12:31 PM Tomas Vondra <tomas@vondra.me> wrote:
<br>
> Anyway, I took a look and the basic idea is simple - when shuffling
<br>
> tuples between batches in a hash join, we're currently deforming the
<br>
> tuple (->slot) we just read from a batch, only to immediately form it
<br>
> (slot->) again and write it to the "correct" batch.
<br>

<br>
Does skipping this cause any problem if some attributes are toasted?
<br>

<br>
I suppose not, just something to think about.
<br>

</p></blockquote></div></div></div><div dir="auto">I don't see why would this cause any such problems - if anything has to be done when forming the tuples, it had to be done the first time. Shuffling tuples to a different batch may happen, but AFAIK it's really just a copy.</div><div dir="auto"><br></div><div dir="auto"><div><div dir="auto">--</div><div dir="auto">Tomas Vondra</div></div></div></div>

bucoo

bucoo@sohu.com

over 1 year ago

In reply to: Tomas Vondra (#4)

1 attachment(s)

Re: optimize hashjoin

Attachments:

optimize-hashjoin-master.patchapplication/octet-stream; name=optimize-hashjoin-master.patchDownload

diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 570a90ebe1..5e702a37ff 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -190,7 +190,7 @@ MultiExecPrivateHash(HashState *node)
 			else
 			{
 				/* Not subject to skew optimization, so insert normally */
-				ExecHashTableInsert(hashtable, slot, hashvalue);
+				ExecHashTableInsertSlot(hashtable, slot, hashvalue);
 			}
 			hashtable->totalTuples += 1;
 		}
@@ -1594,7 +1594,7 @@ ExecParallelHashIncreaseNumBuckets(HashJoinTable hashtable)
 }
 
 /*
- * ExecHashTableInsert
+ * ExecHashTableInsertSlot
  *		insert a tuple into the hash table depending on the hash value
  *		it may just go to a temp file for later batches
  *
@@ -1605,12 +1605,29 @@ ExecParallelHashIncreaseNumBuckets(HashJoinTable hashtable)
  * worth the messiness required.
  */
 void
-ExecHashTableInsert(HashJoinTable hashtable,
-					TupleTableSlot *slot,
-					uint32 hashvalue)
+ExecHashTableInsertSlot(HashJoinTable hashtable,
+						TupleTableSlot *slot,
+						uint32 hashvalue)
 {
 	bool		shouldFree;
 	MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+
+	ExecHashTableInsertTuple(hashtable, tuple, hashvalue);
+
+	if (shouldFree)
+		heap_free_minimal_tuple(tuple);
+}
+
+/*
+ * ExecHashTableInsertTuple
+ *		insert a tuple into the hash table depending on the hash value
+ *		it may just go to a temp file for later batches
+ */
+void
+ExecHashTableInsertTuple(HashJoinTable hashtable,
+						 MinimalTuple tuple,
+						 uint32 hashvalue)
+{
 	int			bucketno;
 	int			batchno;
 
@@ -1685,9 +1702,6 @@ ExecHashTableInsert(HashJoinTable hashtable,
 							  &hashtable->innerBatchFile[batchno],
 							  hashtable);
 	}
-
-	if (shouldFree)
-		heap_free_minimal_tuple(tuple);
 }
 
 /*
@@ -1761,12 +1775,10 @@ retry:
  * tuples that belong in the current batch once growth has been disabled.
  */
 void
-ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
-										TupleTableSlot *slot,
-										uint32 hashvalue)
+ExecParallelHashTableInsertTupleCurrentBatch(HashJoinTable hashtable,
+											 MinimalTuple tuple,
+											 uint32 hashvalue)
 {
-	bool		shouldFree;
-	MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
 	HashJoinTuple hashTuple;
 	dsa_pointer shared;
 	int			batchno;
@@ -1782,6 +1794,21 @@ ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
 	HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));
 	ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],
 							  hashTuple, shared);
+}
+
+/*
+ * like ExecParallelHashTableInsertTupleCurrentBatch,
+ * but this function accept a TupleTableSlot
+ */
+void
+ExecParallelHashTableInsertSlotCurrentBatch(HashJoinTable hashtable,
+										TupleTableSlot *slot,
+										uint32 hashvalue)
+{
+	bool		shouldFree;
+	MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);
+
+	ExecParallelHashTableInsertTupleCurrentBatch(hashtable, tuple, hashvalue);
 
 	if (shouldFree)
 		heap_free_minimal_tuple(tuple);
@@ -2454,7 +2481,7 @@ ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue)
  *		Insert a tuple into the skew hashtable.
  *
  * This should generally match up with the current-batch case in
- * ExecHashTableInsert.
+ * ExecHashTableInsertSlot.
  */
 static void
 ExecHashSkewTableInsert(HashJoinTable hashtable,
@@ -2534,8 +2561,8 @@ ExecHashRemoveNextSkewBucket(HashJoinTable hashtable)
 		Size		tupleSize;
 
 		/*
-		 * This code must agree with ExecHashTableInsert.  We do not use
-		 * ExecHashTableInsert directly as ExecHashTableInsert expects a
+		 * This code must agree with ExecHashTableInsertSlot.  We do not use
+		 * ExecHashTableInsertSlot directly as ExecHashTableInsertSlot expects a
 		 * TupleTableSlot while we already have HashJoinTuples.
 		 */
 		tuple = HJTUPLE_MINTUPLE(hashTuple);
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index 2f7170604d..ba5c1d4b49 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -195,10 +195,10 @@ static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
 static TupleTableSlot *ExecParallelHashJoinOuterGetTuple(PlanState *outerNode,
 														 HashJoinState *hjstate,
 														 uint32 *hashvalue);
-static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
-												 BufFile *file,
-												 uint32 *hashvalue,
-												 TupleTableSlot *tupleSlot);
+static MinimalTuple ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
+											  BufFile *file,
+											  uint32 *hashvalue,
+											  StringInfo buf);
 static bool ExecHashJoinNewBatch(HashJoinState *hjstate);
 static bool ExecParallelHashJoinNewBatch(HashJoinState *hjstate);
 static void ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate);
@@ -925,6 +925,7 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
 	 */
 	hjstate->hj_HashTable = NULL;
 	hjstate->hj_FirstOuterTupleSlot = NULL;
+	hjstate->hj_outerTupleBuffer = NULL;
 
 	hjstate->hj_CurHashValue = 0;
 	hjstate->hj_CurBucketNo = 0;
@@ -1030,6 +1031,7 @@ ExecHashJoinOuterGetTuple(PlanState *outerNode,
 	}
 	else if (curbatch < hashtable->nbatch)
 	{
+		MinimalTuple mtup;
 		BufFile    *file = hashtable->outerBatchFile[curbatch];
 
 		/*
@@ -1039,12 +1041,38 @@ ExecHashJoinOuterGetTuple(PlanState *outerNode,
 		if (file == NULL)
 			return NULL;
 
-		slot = ExecHashJoinGetSavedTuple(hjstate,
+		if (unlikely(hjstate->hj_outerTupleBuffer == NULL))
+		{
+			/*
+			 * Avoid realloc memory for MinimalTuple, we alloc a buffer
+			 * to reuse store MinimalTuple. MemoryContext same as hjstate.
+			 */
+			MemoryContext oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(hjstate));
+			hjstate->hj_outerTupleBuffer = makeStringInfo();
+			MemoryContextSwitchTo(oldcontext);
+		}
+
+		mtup = ExecHashJoinGetSavedTuple(hjstate,
 										 file,
 										 hashvalue,
-										 hjstate->hj_OuterTupleSlot);
-		if (!TupIsNull(slot))
+										 hjstate->hj_outerTupleBuffer);
+		if (likely(mtup != NULL))
+		{
+			slot = hjstate->hj_OuterTupleSlot;
+
+			/*
+			 * mtup is hold in hjstate->hj_outerTupleBuffer, so we can using
+			 * shouldFree as false to call ExecForceStoreMinimalTuple().
+			 * 
+			 * When slot is TTSOpsMinimalTuple we can avoid realloc memory for
+			 * new MinimalTuple(reuse StringInfo to call ExecHashJoinGetSavedTuple).
+			 * 
+			 * More importantly, in non-TTSOpsMinimalTuple scenarios, it can avoid
+			 * reform(materialize) tuple(see ExecForceStoreMinimalTuple).
+			 */
+			ExecForceStoreMinimalTuple(mtup, slot, false);
 			return slot;
+		}
 	}
 
 	/* End of this batch */
@@ -1133,7 +1161,6 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
 	int			nbatch;
 	int			curbatch;
 	BufFile    *innerFile;
-	TupleTableSlot *slot;
 	uint32		hashvalue;
 
 	nbatch = hashtable->nbatch;
@@ -1224,21 +1251,25 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
 
 	if (innerFile != NULL)
 	{
+		StringInfoData buf;
+		MinimalTuple tuple;
+
 		if (BufFileSeek(innerFile, 0, 0, SEEK_SET))
 			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not rewind hash-join temporary file")));
 
-		while ((slot = ExecHashJoinGetSavedTuple(hjstate,
-												 innerFile,
-												 &hashvalue,
-												 hjstate->hj_HashTupleSlot)))
+		initStringInfo(&buf);
+		while ((tuple = ExecHashJoinGetSavedTuple(hjstate,
+												  innerFile,
+												  &hashvalue,
+												  &buf)))
 		{
 			/*
 			 * NOTE: some tuples may be sent to future batches.  Also, it is
 			 * possible for hashtable->nbatch to be increased here!
 			 */
-			ExecHashTableInsert(hashtable, slot, hashvalue);
+			ExecHashTableInsertTuple(hashtable, tuple, hashvalue);
 		}
 
 		/*
@@ -1247,6 +1278,7 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
 		 */
 		BufFileClose(innerFile);
 		hashtable->innerBatchFile[curbatch] = NULL;
+		pfree(buf.data);
 	}
 
 	/*
@@ -1297,7 +1329,6 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
 	{
 		uint32		hashvalue;
 		MinimalTuple tuple;
-		TupleTableSlot *slot;
 
 		if (!hashtable->batches[batchno].done)
 		{
@@ -1329,12 +1360,9 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
 					while ((tuple = sts_parallel_scan_next(inner_tuples,
 														   &hashvalue)))
 					{
-						ExecForceStoreMinimalTuple(tuple,
-												   hjstate->hj_HashTupleSlot,
-												   false);
-						slot = hjstate->hj_HashTupleSlot;
-						ExecParallelHashTableInsertCurrentBatch(hashtable, slot,
-																hashvalue);
+						ExecParallelHashTableInsertTupleCurrentBatch(hashtable,
+																	 tuple,
+																	 hashvalue);
 					}
 					sts_end_parallel_scan(inner_tuples);
 					BarrierArriveAndWait(batch_barrier,
@@ -1448,14 +1476,14 @@ ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
  * ExecHashJoinGetSavedTuple
  *		read the next tuple from a batch file.  Return NULL if no more.
  *
- * On success, *hashvalue is set to the tuple's hash value, and the tuple
- * itself is stored in the given slot.
+ * On success, *hashvalue is set to the tuple's hash value, and return
+ * the tuple(stored in the given buf) itself.
  */
-static TupleTableSlot *
+static MinimalTuple
 ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
 						  BufFile *file,
 						  uint32 *hashvalue,
-						  TupleTableSlot *tupleSlot)
+						  StringInfo buf)
 {
 	uint32		header[2];
 	size_t		nread;
@@ -1474,19 +1502,20 @@ ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
 	 * cheating.
 	 */
 	nread = BufFileReadMaybeEOF(file, header, sizeof(header), true);
-	if (nread == 0)				/* end of file */
-	{
-		ExecClearTuple(tupleSlot);
+	if (unlikely(nread == 0))				/* end of file */
 		return NULL;
-	}
+
+	buf->len = 0;	/* inline resetStringInfo(buf); */
+	enlargeStringInfo(buf, header[1]);
+	buf->len = header[1];
 	*hashvalue = header[0];
-	tuple = (MinimalTuple) palloc(header[1]);
+	tuple = (MinimalTuple) buf->data;
 	tuple->t_len = header[1];
 	BufFileReadExact(file,
 					 (char *) tuple + sizeof(uint32),
 					 header[1] - sizeof(uint32));
-	ExecForceStoreMinimalTuple(tuple, tupleSlot, true);
-	return tupleSlot;
+
+	return tuple;
 }
 
 
diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h
index e4eb7bc635..6cc2264692 100644
--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -33,15 +33,21 @@ extern void ExecHashTableDetachBatch(HashJoinTable hashtable);
 extern void ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable,
 												 int batchno);
 
-extern void ExecHashTableInsert(HashJoinTable hashtable,
-								TupleTableSlot *slot,
-								uint32 hashvalue);
+extern void ExecHashTableInsertSlot(HashJoinTable hashtable,
+									TupleTableSlot *slot,
+									uint32 hashvalue);
+extern void ExecHashTableInsertTuple(HashJoinTable hashtable,
+									 MinimalTuple tuple,
+									 uint32 hashvalue);
 extern void ExecParallelHashTableInsert(HashJoinTable hashtable,
 										TupleTableSlot *slot,
 										uint32 hashvalue);
-extern void ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable,
-													TupleTableSlot *slot,
-													uint32 hashvalue);
+extern void ExecParallelHashTableInsertSlotCurrentBatch(HashJoinTable hashtable,
+														TupleTableSlot *slot,
+														uint32 hashvalue);
+extern void ExecParallelHashTableInsertTupleCurrentBatch(HashJoinTable hashtable,
+														 MinimalTuple tuple,
+														 uint32 hashvalue);
 extern void ExecHashGetBucketAndBatch(HashJoinTable hashtable,
 									  uint32 hashvalue,
 									  int *bucketno,
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index af7d8fd1e7..299f66af13 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2225,6 +2225,7 @@ typedef struct HashJoinState
 	TupleTableSlot *hj_NullOuterTupleSlot;
 	TupleTableSlot *hj_NullInnerTupleSlot;
 	TupleTableSlot *hj_FirstOuterTupleSlot;
+	StringInfo	hj_outerTupleBuffer;
 	int			hj_JoinState;
 	bool		hj_MatchedOuter;
 	bool		hj_OuterNotEmpty;

Import Notes

Resolved by subject fallback

Kirill Reshke

reshkekirill@gmail.com

over 1 year ago

In reply to: bucoo (#5)

Re: optimize hashjoin

On Thu, 22 Aug 2024 at 17:08, bucoo <bucoo@sohu.com> wrote:

0) The patch does not apply anymore, thanks to David committing a patch

yesterday. Attached is a patch rebased on top of current master.

That patch is based on PG17. I have now rewritten it based on the master
branch and added some comments.

1) Wouldn't it be easier (and just as efficient) to use slots with

TTSOpsMinimalTuple, i.e. backed by a minimal tuple?

Use diffent kind of slot, the ExecEvalExpr function will report an error.

2) I find the naming of the new functions a bit confusing. We now have
the "original" functions working with slots, and then also functions
with "Tuple" working with tuples. Seems asymmetric.

In net patch function name renamed to ExecHashTableInsertSlot and
ExecHashTableInsertTuple,

also ExecParallelHashTableInsertSlotCurrentBatch and
ExecParallelHashTableInsertTupleCurrentBatch.

3) The code in ExecHashJoinOuterGetTuple is hard to understand, it'd
very much benefit from some comments. I'm a bit unsure if the likely()
and unlikely() hints really help.

In new patch added some comments.

"Likely" and "unlikely" might offer only marginal help on some CPUs and
might not be beneficial at all on other platforms (I think).

4) Is the hj_outerTupleBuffer buffer really needed / effective? I'd bet
just using palloc() will work just as well, thanks to the AllocSet
caching etc.

The hj_outerTupleBuffer avoid reform(materialize) tuple in
non-TTSOpsMinimalTuple scenarios,

see ExecForceStoreMinimalTuple. I added some comments in new patch.

Can you provide more information about the benchmark you did? What
hardware, what scale, PostgreSQL configuration, which of the 22 queries
are improved, etc.

I ran TPC-H with 1GB and 10GB scales on two machines, and I see pretty
much no difference compared to master. However, it occurred to me the
patch only ever helps if we increase the number of batches during
execution, in which case we need to move tuples to the right batch.

Only parallel HashJoin speed up to ~2x(all data cached in memory),

not full query, include non-parallel HashJoin.

non-parallel HashJoin only when batchs large then one will speed up,

because this patch only optimize for read batchs tuples to memory.

Hi

likely/unlikely usage can be justified via benchmark. Parallel HashJoin
speed up still also can be verified via benchmark. Either benchmark script
or benchmark result, and it is better to provide both.

--
Best regards,
Kirill Reshke

Tomas Vondra

tomas@vondra.me

over 1 year ago

In reply to: bucoo (#5)

Re: optimize hashjoin

Hi,

It seems you responded by writing a new message and just copying the
subject, which unfortunately doesn't set the headers used for threading
(e.g. in archives). Please just respond to the message.

Or maybe your client does not set the References/In-Reply-To headers
correctly. Not sure which mail client you're using.

On 8/22/24 14:08, bucoo wrote:

0) The patch does not apply anymore, thanks to David committing a patch

yesterday. Attached is a patch rebased on top of current master.

That patch is based on PG17. I have now rewritten it based on the master
branch and added some comments.

Thanks. Yeah, patches should be based on "master".

1) Wouldn't it be easier (and just as efficient) to use slots with

TTSOpsMinimalTuple, i.e. backed by a minimal tuple?

Use diffent kind of slot, the ExecEvalExpr function will report an error.

Hmm, I haven't tried so it's possible it wouldn't work.

2) I find the naming of the new functions a bit confusing. We now have
the "original" functions working with slots, and then also functions
with "Tuple" working with tuples. Seems asymmetric.

In net patch function name renamed to ExecHashTableInsertSlot and
ExecHashTableInsertTuple,

also ExecParallelHashTableInsertSlotCurrentBatch and
ExecParallelHashTableInsertTupleCurrentBatch.

3) The code in ExecHashJoinOuterGetTuple is hard to understand, it'd
very much benefit from some comments. I'm a bit unsure if the likely()
and unlikely() hints really help.

In new patch added some comments.

"Likely" and "unlikely" might offer only marginal help on some CPUs and
might not be beneficial at all on other platforms (I think).

Having such hints is an implicit suggestion it's beneficial. Otherwise
we'd just use them everywhere, but we don't - only a tiny fraction of
condition has those.

4) Is the hj_outerTupleBuffer buffer really needed / effective? I'd bet
just using palloc() will work just as well, thanks to the AllocSet
caching etc.

The hj_outerTupleBuffer avoid reform(materialize) tuple in
non-TTSOpsMinimalTuple scenarios,

see ExecForceStoreMinimalTuple. I added some comments in new patch.

AFAIK you mean this comment:

* mtup is hold in hjstate->hj_outerTupleBuffer, so we can using
* shouldFree as false to call ExecForceStoreMinimalTuple().
*
* When slot is TTSOpsMinimalTuple we can avoid realloc memory for
* new MinimalTuple(reuse StringInfo to call ExecHashJoinGetSavedTuple).

But my point was that I don't think the palloc/repalloc should be very
expensive, once the AllocSet warms up a bit.

* More importantly, in non-TTSOpsMinimalTuple scenarios, it can avoid
* reform(materialize) tuple(see ExecForceStoreMinimalTuple).

Yeah, but doesn't that conflate two things - materialization and freeing
the memory? Only because materialization is expensive, is that a good
reason to abandon the memory management too?

Can you provide more information about the benchmark you did? What
hardware, what scale, PostgreSQL configuration, which of the 22 queries
are improved, etc.

I ran TPC-H with 1GB and 10GB scales on two machines, and I see pretty
much no difference compared to master. However, it occurred to me the
patch only ever helps if we increase the number of batches during
execution, in which case we need to move tuples to the right batch.

Only parallel HashJoin speed up to ~2x(all data cached in memory),

not full query, include non-parallel HashJoin.

non-parallel HashJoin only when batchs large then one will speed up,

because this patch only optimize for read batchs tuples to memory.

I'm sorry, but this does not answer *any* of the questions I asked.

Please provide enough info to reproduce the benefit - benchmark scale,
which query, which parameters, etc. Show explain / explain analyze of
the query without / with the patch, stuff like that.

I ran a number of TPC-H benchmarks with the patch and I never a benefit
of this scale.

regards

--
Tomas Vondra

bucoo

bucoo@sohu.com

over 1 year ago

In reply to: Tomas Vondra (#7)

答复: optimize hashjoin

* mtup is hold in hjstate->hj_outerTupleBuffer, so we can using
* shouldFree as false to call ExecForceStoreMinimalTuple().
*
* When slot is TTSOpsMinimalTuple we can avoid realloc memory for
* new MinimalTuple(reuse StringInfo to call ExecHashJoinGetSavedTuple).

But my point was that I don't think the palloc/repalloc should be very expensive, once the AllocSet warms up a bit.

Avoiding memory palloc/repalloc is just a side effect of avoiding reform tuple.

* More importantly, in non-TTSOpsMinimalTuple scenarios, it can avoid
* reform(materialize) tuple(see ExecForceStoreMinimalTuple).

Yeah, but doesn't that conflate two things - materialization and freeing the memory? Only because materialization is expensive, is that a good reason to abandon the memory management too?

Currently, I haven't thought of a better way to avoid reform.

Can you provide more information about the benchmark you did? What
hardware, what scale, PostgreSQL configuration, which of the 22
queries are improved, etc.

I ran TPC-H with 1GB and 10GB scales on two machines, and I see
pretty much no difference compared to master. However, it occurred to
me the patch only ever helps if we increase the number of batches
during execution, in which case we need to move tuples to the right batch.

Only parallel HashJoin speed up to ~2x(all data cached in memory),

not full query, include non-parallel HashJoin.

non-parallel HashJoin only when batchs large then one will speed up,

because this patch only optimize for read batchs tuples to memory.

I'm sorry, but this does not answer *any* of the questions I asked.

Please provide enough info to reproduce the benefit - benchmark scale, which query, which > parameters, etc. Show explain / explain analyze of the query without / with the patch, stuff > like that.

I ran a number of TPC-H benchmarks with the patch and I never a benefit of this scale.

After further testing, it turns out that the parallel hashjoin did not improve performance. I might have compared it with a debug version at the time. I apologize for that.

Howerver, the non-parallel hashjoin indeed showed about a 10% performance improvement.
Here is the testing information:

CPU: 13th Gen Intel(R) Core(TM) i7-13700
Memory: 32GB
SSD: UMIS REPEYJ512MKN1QWQ
Windows version: win11 23H2 22631.4037
WSL version: 2.2.4.0
Kernel version: 5.15.153.1-2
OS version: rocky linux 9.4
TPCH: SF=8

SQL:
set max_parallel_workers_per_gather = 0;
set enable_mergejoin = off;
explain (verbose,analyze)
select count(*)
from lineitem, orders
where lineitem.l_orderkey = orders.o_orderkey;

patch before:
Aggregate (cost=2422401.83..2422401.84 rows=1 width=8) (actual time=10591.679..10591.681 rows=1 loops=1)
Output: count(*)
-> Hash Join (cost=508496.00..2302429.31 rows=47989008 width=0) (actual time=1075.213..9503.727 rows=47989007 loops=1)
Inner Unique: true
Hash Cond: (lineitem.l_orderkey = orders.o_orderkey)
-> Index Only Scan using lineitem_pkey on public.lineitem (cost=0.56..1246171.69 rows=47989008 width=4) (actual time=0.023..1974.365 rows=47989007 loops=1)
Output: lineitem.l_orderkey
Heap Fetches: 0
-> Hash (cost=311620.43..311620.43 rows=12000000 width=4) (actual time=1074.155..1074.156 rows=12000000 loops=1)
Output: orders.o_orderkey
Buckets: 262144 Batches: 128 Memory Usage: 5335kB
-> Index Only Scan using orders_pkey on public.orders (cost=0.43..311620.43 rows=12000000 width=4) (actual time=0.014..464.346 rows=12000000 loops=1)
Output: orders.o_orderkey
Heap Fetches: 0
Planning Time: 0.141 ms
Execution Time: 10591.730 ms
(16 rows)

Patch after:
Aggregate (cost=2422401.83..2422401.84 rows=1 width=8) (actual time=9826.105..9826.106 rows=1 loops=1)
Output: count(*)
-> Hash Join (cost=508496.00..2302429.31 rows=47989008 width=0) (actual time=1087.588..8726.441 rows=47989007 loops=1)
Inner Unique: true
Hash Cond: (lineitem.l_orderkey = orders.o_orderkey)
-> Index Only Scan using lineitem_pkey on public.lineitem (cost=0.56..1246171.69 rows=47989008 width=4) (actual time=0.015..1989.389 rows=47989007 loops=1)
Output: lineitem.l_orderkey
Heap Fetches: 0
-> Hash (cost=311620.43..311620.43 rows=12000000 width=4) (actual time=1086.357..1086.358 rows=12000000 loops=1)
Output: orders.o_orderkey
Buckets: 262144 Batches: 128 Memory Usage: 5335kB
-> Index Only Scan using orders_pkey on public.orders (cost=0.43..311620.43 rows=12000000 width=4) (actual time=0.011..470.225 rows=12000000 loops=1)
Output: orders.o_orderkey
Heap Fetches: 0
Planning Time: 0.065 ms
Execution Time: 9826.135 ms

Robert Haas

robertmhaas@gmail.com

over 1 year ago

In reply to: bucoo (#8)

Re: optimize hashjoin

On Fri, Aug 23, 2024 at 7:02 AM bucoo <bucoo@sohu.com> wrote:

Howerver, the non-parallel hashjoin indeed showed about a 10% performance improvement.
-> Hash Join (cost=508496.00..2302429.31 rows=47989008 width=0) (actual time=1075.213..9503.727 rows=47989007 loops=1)
-> Hash Join (cost=508496.00..2302429.31 rows=47989008 width=0) (actual time=1087.588..8726.441 rows=47989007 loops=1)

It's not a good idea to test performance with EXPLAIN ANALYZE,
generally speaking. And you usually need to test a few times and
average or something, rather than just a single test. But also, this
doesn't show the hash join being 10% faster. It shows the hash join
being essentially the same speed (1075ms unpatched, 1087ms patched),
and the aggregate node on top of it being faster.

Now, it does seem possible to me that changing one node could cause a
performance improvement for the node above it, but I don't quite see
why that would happen in this case.

--
Robert Haas
EDB: http://www.enterprisedb.com

#10

Bruce Momjian

bruce@momjian.us

about 1 year ago

In reply to: Robert Haas (#9)

Re: optimize hashjoin

On Fri, Aug 23, 2024 at 08:17:26AM -0400, Robert Haas wrote:

On Fri, Aug 23, 2024 at 7:02 AM bucoo <bucoo@sohu.com> wrote:

Howerver, the non-parallel hashjoin indeed showed about a 10% performance improvement.
-> Hash Join (cost=508496.00..2302429.31 rows=47989008 width=0) (actual time=1075.213..9503.727 rows=47989007 loops=1)
-> Hash Join (cost=508496.00..2302429.31 rows=47989008 width=0) (actual time=1087.588..8726.441 rows=47989007 loops=1)

It's not a good idea to test performance with EXPLAIN ANALYZE,
generally speaking. And you usually need to test a few times and
average or something, rather than just a single test. But also, this
doesn't show the hash join being 10% faster. It shows the hash join
being essentially the same speed (1075ms unpatched, 1087ms patched),
and the aggregate node on top of it being faster.

Now, it does seem possible to me that changing one node could cause a
performance improvement for the node above it, but I don't quite see
why that would happen in this case.

Where are we on this patch?

--
Bruce Momjian <bruce@momjian.us> https://momjian.us
EDB https://enterprisedb.com

When a patient asks the doctor, "Am I going to die?", he means
"Am I going to die soon?"