diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 570a90ebe1..5e702a37ff 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -190,7 +190,7 @@ MultiExecPrivateHash(HashState *node) else { /* Not subject to skew optimization, so insert normally */ - ExecHashTableInsert(hashtable, slot, hashvalue); + ExecHashTableInsertSlot(hashtable, slot, hashvalue); } hashtable->totalTuples += 1; } @@ -1594,7 +1594,7 @@ ExecParallelHashIncreaseNumBuckets(HashJoinTable hashtable) } /* - * ExecHashTableInsert + * ExecHashTableInsertSlot * insert a tuple into the hash table depending on the hash value * it may just go to a temp file for later batches * @@ -1605,12 +1605,29 @@ ExecParallelHashIncreaseNumBuckets(HashJoinTable hashtable) * worth the messiness required. */ void -ExecHashTableInsert(HashJoinTable hashtable, - TupleTableSlot *slot, - uint32 hashvalue) +ExecHashTableInsertSlot(HashJoinTable hashtable, + TupleTableSlot *slot, + uint32 hashvalue) { bool shouldFree; MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree); + + ExecHashTableInsertTuple(hashtable, tuple, hashvalue); + + if (shouldFree) + heap_free_minimal_tuple(tuple); +} + +/* + * ExecHashTableInsertTuple + * insert a tuple into the hash table depending on the hash value + * it may just go to a temp file for later batches + */ +void +ExecHashTableInsertTuple(HashJoinTable hashtable, + MinimalTuple tuple, + uint32 hashvalue) +{ int bucketno; int batchno; @@ -1685,9 +1702,6 @@ ExecHashTableInsert(HashJoinTable hashtable, &hashtable->innerBatchFile[batchno], hashtable); } - - if (shouldFree) - heap_free_minimal_tuple(tuple); } /* @@ -1761,12 +1775,10 @@ retry: * tuples that belong in the current batch once growth has been disabled. 
*/ void -ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable, - TupleTableSlot *slot, - uint32 hashvalue) +ExecParallelHashTableInsertTupleCurrentBatch(HashJoinTable hashtable, + MinimalTuple tuple, + uint32 hashvalue) { - bool shouldFree; - MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree); HashJoinTuple hashTuple; dsa_pointer shared; int batchno; @@ -1782,6 +1794,21 @@ ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable, HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple)); ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno], hashTuple, shared); +} + +/* + * Like ExecParallelHashTableInsertTupleCurrentBatch, + * but this function accepts a TupleTableSlot + */ +void +ExecParallelHashTableInsertSlotCurrentBatch(HashJoinTable hashtable, + TupleTableSlot *slot, + uint32 hashvalue) +{ + bool shouldFree; + MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree); + + ExecParallelHashTableInsertTupleCurrentBatch(hashtable, tuple, hashvalue); if (shouldFree) heap_free_minimal_tuple(tuple); @@ -2454,7 +2481,7 @@ ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue) * Insert a tuple into the skew hashtable. * * This should generally match up with the current-batch case in - * ExecHashTableInsert. + * ExecHashTableInsertTuple. */ static void ExecHashSkewTableInsert(HashJoinTable hashtable, @@ -2534,8 +2561,8 @@ ExecHashRemoveNextSkewBucket(HashJoinTable hashtable) Size tupleSize; /* - * This code must agree with ExecHashTableInsert. We do not use - * ExecHashTableInsert directly as ExecHashTableInsert expects a + * This code must agree with ExecHashTableInsertTuple. We do not use + * ExecHashTableInsertSlot directly as ExecHashTableInsertSlot expects a * TupleTableSlot while we already have HashJoinTuples. 
*/ tuple = HJTUPLE_MINTUPLE(hashTuple); diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c index 2f7170604d..ba5c1d4b49 100644 --- a/src/backend/executor/nodeHashjoin.c +++ b/src/backend/executor/nodeHashjoin.c @@ -195,10 +195,10 @@ static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode, static TupleTableSlot *ExecParallelHashJoinOuterGetTuple(PlanState *outerNode, HashJoinState *hjstate, uint32 *hashvalue); -static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate, - BufFile *file, - uint32 *hashvalue, - TupleTableSlot *tupleSlot); +static MinimalTuple ExecHashJoinGetSavedTuple(HashJoinState *hjstate, + BufFile *file, + uint32 *hashvalue, + StringInfo buf); static bool ExecHashJoinNewBatch(HashJoinState *hjstate); static bool ExecParallelHashJoinNewBatch(HashJoinState *hjstate); static void ExecParallelHashJoinPartitionOuter(HashJoinState *hjstate); @@ -925,6 +925,7 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags) */ hjstate->hj_HashTable = NULL; hjstate->hj_FirstOuterTupleSlot = NULL; + hjstate->hj_outerTupleBuffer = NULL; hjstate->hj_CurHashValue = 0; hjstate->hj_CurBucketNo = 0; @@ -1030,6 +1031,7 @@ ExecHashJoinOuterGetTuple(PlanState *outerNode, } else if (curbatch < hashtable->nbatch) { + MinimalTuple mtup; BufFile *file = hashtable->outerBatchFile[curbatch]; /* @@ -1039,12 +1041,38 @@ ExecHashJoinOuterGetTuple(PlanState *outerNode, if (file == NULL) return NULL; - slot = ExecHashJoinGetSavedTuple(hjstate, + if (unlikely(hjstate->hj_outerTupleBuffer == NULL)) + { + /* + * To avoid a fresh allocation per MinimalTuple, allocate one reusable + * buffer to hold it, in the same MemoryContext as hjstate. 
+ */ + MemoryContext oldcontext = MemoryContextSwitchTo(GetMemoryChunkContext(hjstate)); + hjstate->hj_outerTupleBuffer = makeStringInfo(); + MemoryContextSwitchTo(oldcontext); + } + + mtup = ExecHashJoinGetSavedTuple(hjstate, file, hashvalue, - hjstate->hj_OuterTupleSlot); - if (!TupIsNull(slot)) + hjstate->hj_outerTupleBuffer); + if (likely(mtup != NULL)) + { + slot = hjstate->hj_OuterTupleSlot; + + /* + * mtup is held in hjstate->hj_outerTupleBuffer, so we can pass + * shouldFree as false when calling ExecForceStoreMinimalTuple(). + * + * When the slot is TTSOpsMinimalTuple, we avoid allocating memory for + * each new MinimalTuple (ExecHashJoinGetSavedTuple reuses the StringInfo). + * + * More importantly, in non-TTSOpsMinimalTuple scenarios, it can avoid + * re-forming (materializing) the tuple (see ExecForceStoreMinimalTuple). + */ + ExecForceStoreMinimalTuple(mtup, slot, false); return slot; + } } /* End of this batch */ @@ -1133,7 +1161,6 @@ ExecHashJoinNewBatch(HashJoinState *hjstate) int nbatch; int curbatch; BufFile *innerFile; - TupleTableSlot *slot; uint32 hashvalue; nbatch = hashtable->nbatch; @@ -1224,21 +1251,25 @@ ExecHashJoinNewBatch(HashJoinState *hjstate) if (innerFile != NULL) { + StringInfoData buf; + MinimalTuple tuple; + if (BufFileSeek(innerFile, 0, 0, SEEK_SET)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not rewind hash-join temporary file"))); - while ((slot = ExecHashJoinGetSavedTuple(hjstate, - innerFile, - &hashvalue, - hjstate->hj_HashTupleSlot))) + initStringInfo(&buf); + while ((tuple = ExecHashJoinGetSavedTuple(hjstate, + innerFile, + &hashvalue, + &buf))) { /* * NOTE: some tuples may be sent to future batches. Also, it is * possible for hashtable->nbatch to be increased here! 
*/ - ExecHashTableInsert(hashtable, slot, hashvalue); + ExecHashTableInsertTuple(hashtable, tuple, hashvalue); } /* @@ -1247,6 +1278,7 @@ ExecHashJoinNewBatch(HashJoinState *hjstate) */ BufFileClose(innerFile); hashtable->innerBatchFile[curbatch] = NULL; + pfree(buf.data); } /* @@ -1297,7 +1329,6 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate) { uint32 hashvalue; MinimalTuple tuple; - TupleTableSlot *slot; if (!hashtable->batches[batchno].done) { @@ -1329,12 +1360,9 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate) while ((tuple = sts_parallel_scan_next(inner_tuples, &hashvalue))) { - ExecForceStoreMinimalTuple(tuple, - hjstate->hj_HashTupleSlot, - false); - slot = hjstate->hj_HashTupleSlot; - ExecParallelHashTableInsertCurrentBatch(hashtable, slot, - hashvalue); + ExecParallelHashTableInsertTupleCurrentBatch(hashtable, + tuple, + hashvalue); } sts_end_parallel_scan(inner_tuples); BarrierArriveAndWait(batch_barrier, @@ -1448,14 +1476,14 @@ ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue, * ExecHashJoinGetSavedTuple * read the next tuple from a batch file. Return NULL if no more. * - * On success, *hashvalue is set to the tuple's hash value, and the tuple - * itself is stored in the given slot. + * On success, *hashvalue is set to the tuple's hash value, and the tuple + * itself is returned; it lives in buf and is valid only until buf is reused. */ -static TupleTableSlot * +static MinimalTuple ExecHashJoinGetSavedTuple(HashJoinState *hjstate, BufFile *file, uint32 *hashvalue, - TupleTableSlot *tupleSlot) + StringInfo buf) { uint32 header[2]; size_t nread; @@ -1474,19 +1502,20 @@ ExecHashJoinGetSavedTuple(HashJoinState *hjstate, * cheating. 
*/ nread = BufFileReadMaybeEOF(file, header, sizeof(header), true); - if (nread == 0) /* end of file */ - { - ExecClearTuple(tupleSlot); + if (unlikely(nread == 0)) /* end of file */ return NULL; - } + + buf->len = 0; /* inline resetStringInfo(buf); */ + enlargeStringInfo(buf, header[1]); + buf->len = header[1]; *hashvalue = header[0]; - tuple = (MinimalTuple) palloc(header[1]); + tuple = (MinimalTuple) buf->data; tuple->t_len = header[1]; BufFileReadExact(file, (char *) tuple + sizeof(uint32), header[1] - sizeof(uint32)); - ExecForceStoreMinimalTuple(tuple, tupleSlot, true); - return tupleSlot; + + return tuple; } diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h index e4eb7bc635..6cc2264692 100644 --- a/src/include/executor/nodeHash.h +++ b/src/include/executor/nodeHash.h @@ -33,15 +33,21 @@ extern void ExecHashTableDetachBatch(HashJoinTable hashtable); extern void ExecParallelHashTableSetCurrentBatch(HashJoinTable hashtable, int batchno); -extern void ExecHashTableInsert(HashJoinTable hashtable, - TupleTableSlot *slot, - uint32 hashvalue); +extern void ExecHashTableInsertSlot(HashJoinTable hashtable, + TupleTableSlot *slot, + uint32 hashvalue); +extern void ExecHashTableInsertTuple(HashJoinTable hashtable, + MinimalTuple tuple, + uint32 hashvalue); extern void ExecParallelHashTableInsert(HashJoinTable hashtable, TupleTableSlot *slot, uint32 hashvalue); -extern void ExecParallelHashTableInsertCurrentBatch(HashJoinTable hashtable, - TupleTableSlot *slot, - uint32 hashvalue); +extern void ExecParallelHashTableInsertSlotCurrentBatch(HashJoinTable hashtable, + TupleTableSlot *slot, + uint32 hashvalue); +extern void ExecParallelHashTableInsertTupleCurrentBatch(HashJoinTable hashtable, + MinimalTuple tuple, + uint32 hashvalue); extern void ExecHashGetBucketAndBatch(HashJoinTable hashtable, uint32 hashvalue, int *bucketno, diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index af7d8fd1e7..299f66af13 100644 --- 
a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -2225,6 +2225,7 @@ typedef struct HashJoinState TupleTableSlot *hj_NullOuterTupleSlot; TupleTableSlot *hj_NullInnerTupleSlot; TupleTableSlot *hj_FirstOuterTupleSlot; + StringInfo hj_outerTupleBuffer; int hj_JoinState; bool hj_MatchedOuter; bool hj_OuterNotEmpty;