Help with Join Performance Testing
Started by Lawrence, Ramonalmost 17 years ago1 messages
A hash join modification patch is under review for 8.4 that needs
performance testing. We would appreciate help with this testing.
A testing version of the patch is attached in addition to testing
instructions and where to retrieve a sample data set. The basic idea
of the patch is that it reduces disk operations for large multi-batch
hash joins where there is skew in the probe relation. The patch
collects statistics on performance benefits when using the optimization.
--
Ramon Lawrence and Bryce Cutt
Attachments:
histojoin_testing.patchapplication/octet-stream; name=histojoin_testing.patchDownload
Index: src/backend/executor/nodeHash.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeHash.c,v
retrieving revision 1.117
diff -c -r1.117 nodeHash.c
*** src/backend/executor/nodeHash.c 1 Jan 2009 17:23:41 -0000 1.117
--- src/backend/executor/nodeHash.c 14 Jan 2009 06:36:51 -0000
***************
*** 53,58 ****
--- 53,224 ----
return NULL;
}
+ /*
+ * ----------------------------------------------------------------
+ * ExecHashGetIMBucket
+ *
+ * Returns the index of the in-memory bucket for this
+ * hashvalue, or IM_INVALID_BUCKET if the hashvalue is not
+ * associated with any unfrozen bucket (or if skew
+ * optimization is not being used).
+ *
+ * It is possible for a tuple whose join attribute value is
+ * not a MCV to hash to an in-memory bucket due to the limited
+ * number of hash values but it is unlikely and everything
+ * continues to work even if it does happen. We would
+ * accidentally cache some less optimal tuples in memory
+ * but the join result would still be accurate.
+ *
+ * hashtable->imBucket is an open addressing hashtable of
+ * in-memory buckets (HashJoinIMBucket).
+ * ----------------------------------------------------------------
+ */
+ int
+ ExecHashGetIMBucket(HashJoinTable hashtable, uint32 hashvalue)
+ {
+ int bucket;
+
+ if (!hashtable->enableSkewOptimization)
+ return IM_INVALID_BUCKET;
+
+ /* Modulo the hashvalue (using bitmask) to find the IM bucket. */
+ bucket = hashvalue & (hashtable->nIMBuckets - 1);
+
+ /*
+ * While we have not hit a hole in the hashtable and have not hit the
+ * actual bucket we have collided in the hashtable so try the next
+ * bucket location.
+ */
+ while (hashtable->imBucket[bucket] != NULL
+ && hashtable->imBucket[bucket]->hashvalue != hashvalue)
+ bucket = (bucket + 1) & (hashtable->nIMBuckets - 1);
+
+ /*
+ * If the bucket exists and has been correctly determined return
+ * the bucket index.
+ */
+ if (hashtable->imBucket[bucket] != NULL
+ && hashtable->imBucket[bucket]->hashvalue == hashvalue)
+ return bucket;
+
+ /*
+ * Must have run into an empty location or a frozen bucket which means the
+ * tuple with this hashvalue is not to be handled as if it matches with an
+ * in-memory bucket.
+ */
+ return IM_INVALID_BUCKET;
+ }
+
+ /*
+ * ----------------------------------------------------------------
+ * ExecHashFreezeNextIMBucket
+ *
+ * Freeze the tuples of the next in-memory bucket by pushing
+ * them into the main hashtable. Buckets are frozen in order
+ * so that the best tuples are kept in memory the longest.
+ * ----------------------------------------------------------------
+ */
+ static bool
+ ExecHashFreezeNextIMBucket(HashJoinTable hashtable)
+ {
+ int bucketToFreeze;
+ int bucketno;
+ int batchno;
+ uint32 hashvalue;
+ HashJoinTuple hashTuple;
+ HashJoinTuple nextHashTuple;
+ HashJoinIMBucket *bucket;
+ MinimalTuple mintuple;
+
+ /* Calculate the imBucket index of the bucket to freeze. */
+ bucketToFreeze = hashtable->imBucketFreezeOrder
+ [hashtable->nUsedIMBuckets - 1 - hashtable->nIMBucketsFrozen];
+
+ /* Grab a pointer to the actual IM bucket. */
+ bucket = hashtable->imBucket[bucketToFreeze];
+ hashvalue = bucket->hashvalue;
+
+ /*
+ * Grab a pointer to the first tuple in the soon to be frozen IM bucket.
+ */
+ hashTuple = bucket->tuples;
+
+ /*
+ * Calculate which bucket and batch the tuples belong to in the main
+ * non-IM hashtable.
+ */
+ ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);
+
+ /* until we have read all tuples from this bucket */
+ while (hashTuple != NULL)
+ {
+ /*
+ * Some of this code is very similar to that of ExecHashTableInsert.
+ * We do not call ExecHashTableInsert directly as
+ * ExecHashTableInsert expects a TupleTableSlot and we already have
+ * HashJoinTuples.
+ */
+ mintuple = HJTUPLE_MINTUPLE(hashTuple);
+
+ hashtable->nInnerIMTupFrozen++;
+
+ /* Decide whether to put the tuple in the hash table or a temp file. */
+ if (batchno == hashtable->curbatch)
+ {
+ /* Put the tuple in hash table. */
+ nextHashTuple = hashTuple->next;
+ hashTuple->next = hashtable->buckets[bucketno];
+ hashtable->buckets[bucketno] = hashTuple;
+ hashTuple = nextHashTuple;
+ hashtable->spaceUsedIM -= HJTUPLE_OVERHEAD + mintuple->t_len;
+ }
+ else
+ {
+ /* Put the tuples into a temp file for later batches. */
+ Assert(batchno > hashtable->curbatch);
+ ExecHashJoinSaveTuple(mintuple, hashvalue,
+ &hashtable->innerBatchFile[batchno]);
+ /*
+ * Some memory has been freed up. This must be done before we
+ * pfree the hashTuple of we lose access to the tuple size.
+ */
+ hashtable->spaceUsed -= HJTUPLE_OVERHEAD + mintuple->t_len;
+ hashtable->spaceUsedIM -= HJTUPLE_OVERHEAD + mintuple->t_len;
+ nextHashTuple = hashTuple->next;
+ pfree(hashTuple);
+ hashTuple = nextHashTuple;
+ }
+ }
+
+ /*
+ * Free the memory the bucket struct was using as it is not necessary
+ * any more. All code treats a frozen in-memory bucket the same as one
+ * that did not exist; by setting the pointer to null the rest of the code
+ * will function as if we had not created this bucket at all.
+ */
+ pfree(bucket);
+ hashtable->imBucket[bucketToFreeze] = NULL;
+ hashtable->spaceUsed -= IM_BUCKET_OVERHEAD;
+ hashtable->spaceUsedIM -= IM_BUCKET_OVERHEAD;
+ hashtable->nIMBucketsFrozen++;
+
+ /*
+ * All buckets have been frozen and deleted from memory so turn off
+ * skew aware partitioning and remove the structs from memory as they are
+ * just wasting space from now on.
+ */
+ if (hashtable->nUsedIMBuckets == hashtable->nIMBucketsFrozen)
+ {
+ hashtable->enableSkewOptimization = false;
+ pfree(hashtable->imBucket);
+ pfree(hashtable->imBucketFreezeOrder);
+ hashtable->spaceUsed -= hashtable->spaceUsedIM;
+ hashtable->spaceUsedIM = 0;
+ }
+
+ return true;
+ }
+
/* ----------------------------------------------------------------
* MultiExecHash
*
***************
*** 69,74 ****
--- 235,242 ----
TupleTableSlot *slot;
ExprContext *econtext;
uint32 hashvalue;
+ MinimalTuple mintuple;
+ int bucketNumber;
/* must provide our own instrumentation support */
if (node->ps.instrument)
***************
*** 99,106 ****
if (ExecHashGetHashValue(hashtable, econtext, hashkeys, false, false,
&hashvalue))
{
! ExecHashTableInsert(hashtable, slot, hashvalue);
! hashtable->totalTuples += 1;
}
}
--- 267,322 ----
if (ExecHashGetHashValue(hashtable, econtext, hashkeys, false, false,
&hashvalue))
{
! int bucketno;
! int batchno;
!
! bucketNumber = ExecHashGetIMBucket(hashtable, hashvalue);
!
! ExecHashGetBucketAndBatch(hashtable, hashvalue,
! &bucketno, &batchno);
! if (batchno == 0)
! {
! hashtable->nInnerPotentialBatchZeroTup++;
! if (bucketNumber == IM_INVALID_BUCKET)
! hashtable->nInnerBatchZeroTup++;
! }
!
! /* handle tuples not destined for an in-memory bucket normally */
! if (bucketNumber == IM_INVALID_BUCKET)
! ExecHashTableInsert(hashtable, slot, hashvalue);
! else
! {
! HashJoinTuple hashTuple;
! int hashTupleSize;
!
! /* get the HashJoinTuple */
! mintuple = ExecFetchSlotMinimalTuple(slot);
! hashTupleSize = HJTUPLE_OVERHEAD + mintuple->t_len;
! hashTuple = (HashJoinTuple)
! MemoryContextAlloc(hashtable->batchCxt, hashTupleSize);
! hashTuple->hashvalue = hashvalue;
! memcpy(HJTUPLE_MINTUPLE(hashTuple), mintuple, mintuple->t_len);
!
! /* Push the HashJoinTuple onto the front of the IM bucket. */
! hashTuple->next = hashtable->imBucket[bucketNumber]->tuples;
! hashtable->imBucket[bucketNumber]->tuples = hashTuple;
!
! /*
! * More memory is now in use so make sure we are not over
! * spaceAllowedIM.
! */
! hashtable->spaceUsed += hashTupleSize;
! hashtable->spaceUsedIM += hashTupleSize;
! hashtable->nInnerIMTup++;
! while (hashtable->spaceUsedIM > hashtable->spaceAllowedIM
! && ExecHashFreezeNextIMBucket(hashtable))
! ;
! /* Guarantee we are not over the spaceAllowed. */
! if (hashtable->spaceUsed > hashtable->spaceAllowed)
! ExecHashIncreaseNumBatches(hashtable);
! }
! hashtable->totalTuples++;
! hashtable->nInnerTup++;
}
}
***************
*** 269,274 ****
--- 485,512 ----
hashtable->outerBatchFile = NULL;
hashtable->spaceUsed = 0;
hashtable->spaceAllowed = work_mem * 1024L;
+ /* Initialize skew optimization related hashtable variables. */
+ hashtable->spaceUsedIM = 0;
+ hashtable->spaceAllowedIM
+ = hashtable->spaceAllowed * IM_WORK_MEM_PERCENT / 100;
+ hashtable->enableSkewOptimization = false;
+ hashtable->nUsedIMBuckets = 0;
+ hashtable->nIMBuckets = 0;
+ hashtable->imBucket = NULL;
+ hashtable->nIMBucketsFrozen = 0;
+ hashtable->nInnerTup = 0;
+ hashtable->nOuterTup = 0;
+ hashtable->nInnerIMTup = 0;
+ hashtable->nOuterIMTup = 0;
+ hashtable->nOutputTup = 0;
+ hashtable->nOutputIMTup = 0;
+ hashtable->nInnerIMTupFrozen = 0;
+ hashtable->nOuterBatchZeroTup = 0;
+ hashtable->nOuterPotentialBatchZeroTup = 0;
+ hashtable->nOutputBatchZeroTup = 0;
+ hashtable->nOutputPotentialBatchZeroTup = 0;
+ hashtable->nInnerBatchZeroTup = 0;
+ hashtable->nInnerPotentialBatchZeroTup = 0;
/*
* Get info about the hash functions to be used for each hash key. Also
***************
*** 448,453 ****
--- 686,754 ----
{
int i;
+ /* the total # of inner tuples received by join */
+ ereport(NOTICE, (errmsg("Total Inner Tuples: %d", hashtable->nInnerTup)));
+ /* # inner tuples in the IM buckets */
+ ereport(NOTICE, (errmsg("IM Inner Tuples: %d", hashtable->nInnerIMTup)));
+ /* # inner tuples that fell in batch 0 */
+ ereport(NOTICE, (errmsg("Batch Zero Inner Tuples: %d",
+ hashtable->nInnerBatchZeroTup)));
+ /*
+ * # inner tuples that would have fallen in batch 0 if IM buckets were
+ * not in use at all
+ */
+ ereport(NOTICE, (errmsg("Batch Zero Potential Inner Tuples: %d",
+ hashtable->nInnerPotentialBatchZeroTup)));
+ /* total # of outer tuples received by join */
+ ereport(NOTICE, (errmsg("Total Outer Tuples: %d", hashtable->nOuterTup)));
+ /* # outer tuples that matched with the IM buckets */
+ ereport(NOTICE, (errmsg("IM Outer Tuples: %d", hashtable->nOuterIMTup)));
+ /* # outer tuples that matched with batch 0 */
+ ereport(NOTICE, (errmsg("Batch Zero Outer Tuples: %d",
+ hashtable->nOuterBatchZeroTup)));
+ /*
+ * # outer tuples that would have fallen in batch 0 if IM buckets were
+ * not in use at all
+ */
+ ereport(NOTICE, (errmsg("Batch Zero Potential Outer Tuples: %d",
+ hashtable->nOuterPotentialBatchZeroTup)));
+ /* total # output tuples produced by join */
+ ereport(NOTICE, (errmsg("Total Output Tuples: %d",
+ hashtable->nOutputTup)));
+ /* # output tuples that came from matches with IM bucket inner tuples */
+ ereport(NOTICE, (errmsg("IM Output Tuples: %d",
+ hashtable->nOutputIMTup)));
+ /* # output tuples that came from matches with batch 0 inner tuples */
+ ereport(NOTICE, (errmsg("Batch Zero Output Tuples: %d",
+ hashtable->nOutputBatchZeroTup)));
+ /*
+ * # output tuples that would have came from matches with batch 0 if IM
+ * buckets were not in use at all
+ */
+ ereport(NOTICE, (errmsg("Batch Zero Potential Output Tuples: %d",
+ hashtable->nOutputPotentialBatchZeroTup)));
+ /*
+ * # of inner IM tuples that were frozen back to the main hashtable when
+ * an IM bucket was frozen
+ */
+ ereport(NOTICE, (errmsg("IM Tuples Frozen: %d",
+ hashtable->nInnerIMTupFrozen)));
+ /* percentage less tuple IOs compared to HHJ due to skew optimization */
+ if (hashtable->nInnerTup - hashtable->nInnerPotentialBatchZeroTup
+ + hashtable->nOuterTup - hashtable->nOuterPotentialBatchZeroTup
+ != 0)
+ ereport(NOTICE, (errmsg("Percentage less tuple IOs than HHJ: %4.2f",
+ (1 - (
+ (float)(hashtable->nInnerTup - hashtable->nInnerIMTup
+ - hashtable->nInnerBatchZeroTup
+ + hashtable->nOuterTup - hashtable->nOuterIMTup
+ - hashtable->nOuterBatchZeroTup)
+ /
+ (hashtable->nInnerTup - hashtable->nInnerPotentialBatchZeroTup
+ + hashtable->nOuterTup
+ - hashtable->nOuterPotentialBatchZeroTup))
+ ) * 100)));
+
/*
* Make sure all the temp files are closed. We skip batch 0, since it
* can't have any temp files (and the arrays might not even exist if
***************
*** 815,825 ****
/*
* hj_CurTuple is NULL to start scanning a new bucket, or the address of
* the last tuple returned from the current bucket.
*/
! if (hashTuple == NULL)
! hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];
! else
hashTuple = hashTuple->next;
while (hashTuple != NULL)
{
--- 1116,1132 ----
/*
* hj_CurTuple is NULL to start scanning a new bucket, or the address of
* the last tuple returned from the current bucket.
+ *
+ * If the tuple hashed to an IM bucket then scan the IM bucket
+ * otherwise scan the standard hashtable bucket.
*/
! if (hashTuple != NULL)
hashTuple = hashTuple->next;
+ else if (hjstate->hj_OuterTupleIMBucketNo != IM_INVALID_BUCKET)
+ hashTuple = hashtable->imBucket[hjstate->hj_OuterTupleIMBucketNo]
+ ->tuples;
+ else
+ hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];
while (hashTuple != NULL)
{
Index: src/backend/executor/nodeHashjoin.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeHashjoin.c,v
retrieving revision 1.97
diff -c -r1.97 nodeHashjoin.c
*** src/backend/executor/nodeHashjoin.c 1 Jan 2009 17:23:41 -0000 1.97
--- src/backend/executor/nodeHashjoin.c 13 Jan 2009 21:28:41 -0000
***************
*** 20,25 ****
--- 20,30 ----
#include "executor/nodeHash.h"
#include "executor/nodeHashjoin.h"
#include "utils/memutils.h"
+ #include "utils/syscache.h"
+ #include "utils/lsyscache.h"
+ #include "parser/parsetree.h"
+ #include "catalog/pg_statistic.h"
+ #include "optimizer/cost.h"
/* Returns true for JOIN_LEFT and JOIN_ANTI jointypes */
***************
*** 34,39 ****
--- 39,246 ----
TupleTableSlot *tupleSlot);
static int ExecHashJoinNewBatch(HashJoinState *hjstate);
+ /*
+ * ----------------------------------------------------------------
+ * ExecHashJoinDetectSkew
+ *
+ * If MCV statistics can be found for the join attribute of
+ * this hashjoin then create a hash table of buckets. Each
+ * bucket will correspond to an MCV hashvalue and will be
+ * filled with inner relation tuples whose join attribute
+ * hashes to the same value as that MCV. If a join
+ * attribute value is a MCV for the join attribute in the
+ * outer (probe) relation, tuples with this value in the
+ * inner (build) relation are more likely to join with outer
+ * relation tuples and a benefit can be gained by keeping
+ * them in memory while joining the first batch of tuples.
+ * ----------------------------------------------------------------
+ */
+ static void
+ ExecHashJoinDetectSkew(EState *estate, HashJoinState *hjstate, int tupwidth)
+ {
+ HeapTupleData *statsTuple;
+ FuncExprState *clause;
+ ExprState *argstate;
+ Var *variable;
+ HashJoinTable hashtable;
+ Datum *values;
+ int nvalues;
+ float4 *numbers;
+ int nnumbers;
+ Oid relid;
+ Oid atttype;
+ int i;
+ int mcvsToUse;
+
+ PlanState *outerNode = outerPlanState(hjstate);
+
+ /* Only use statistics if there is a single join attribute. */
+ if (hjstate->hashclauses->length != 1)
+ return; /* Histojoin is not defined for more than one join key */
+
+ hashtable = hjstate->hj_HashTable;
+
+ /*
+ * Estimate the number of IM buckets that will fit in
+ * the memory allowed for IM buckets.
+ *
+ * hashtable->imBucket will have up to 8 times as many HashJoinIMBucket
+ * pointers as the number of MCV hashvalues. A uint16 index in
+ * hashtable->imBucketFreezeOrder will be created for each IM bucket. One
+ * actual HashJoinIMBucket struct will be created for each
+ * unique MCV hashvalue so up to one struct per MCV.
+ *
+ * It is also estimated that each IM bucket will have a single build
+ * tuple stored in it after partitioning the build relation input. This
+ * estimate could be high if tuples are filtered out before this join but
+ * in that case the extra memory is used by the regular hashjoin batch.
+ * This estimate could be low if it is a many to many join but in that
+ * case IM buckets will be frozen to free up memory as needed
+ * during the inner relation partitioning phase.
+ */
+ mcvsToUse = hashtable->spaceAllowedIM / (
+ /* size of a hash tuple */
+ HJTUPLE_OVERHEAD + MAXALIGN(sizeof(MinimalTupleData))
+ + MAXALIGN(tupwidth)
+ /* max size of hashtable pointers per MCV */
+ + (8 * sizeof(HashJoinIMBucket*))
+ + sizeof(uint16) /* size of imBucketFreezeOrder entry */
+ + IM_BUCKET_OVERHEAD /* size of IM bucket struct */
+ );
+ if (mcvsToUse == 0)
+ return; /* No point in considering this any further. */
+
+ /*
+ * Determine the relation id and attribute id of the single join
+ * attribute of the probe relation.
+ */
+ clause = (FuncExprState *) lfirst(list_head(hjstate->hashclauses));
+ argstate = (ExprState *) lfirst(list_head(clause->args));
+
+ /*
+ * Do not try to exploit stats if the join attribute is an expression
+ * instead of just a simple attribute.
+ */
+ if (argstate->expr->type != T_Var)
+ return;
+
+ variable = (Var *) argstate->expr;
+ relid = getrelid(variable->varnoold, estate->es_range_table);
+ atttype = variable->vartype;
+
+ statsTuple = SearchSysCache(STATRELATT, ObjectIdGetDatum(relid),
+ Int16GetDatum(variable->varoattno), 0, 0);
+ if (!HeapTupleIsValid(statsTuple))
+ return;
+
+ /* Look for MCV statistics for the attribute. */
+ if (get_attstatsslot(statsTuple, atttype, variable->vartypmod,
+ STATISTIC_KIND_MCV, InvalidOid, &values, &nvalues,
+ &numbers, &nnumbers))
+ {
+ FmgrInfo *hashfunctions;
+ int nbuckets = 2;
+ double frac = 0;
+
+ /*
+ * IM buckets (imBucket) is an open addressing hashtable with a
+ * power of 2 size that is greater than the number of MCV values.
+ */
+ if (mcvsToUse > nvalues)
+ mcvsToUse = nvalues;
+
+ for (i = 0; i < mcvsToUse; i++)
+ frac += numbers[i];
+
+ ereport(NOTICE, (errmsg("Values: %d Skew: %4.2f Est. tuples: %4.2f Batches: %d Est. Save: %4.2f",
+ nvalues, frac, outerNode->plan->plan_rows, hashtable->nbatch,
+ (frac*(1-1.0/hashtable->nbatch)*outerNode->plan->plan_rows))));
+
+ if (frac < IM_MIN_BENEFIT_PERCENT)
+ {
+ free_attstatsslot(atttype, values, nvalues, numbers, nnumbers);
+ ReleaseSysCache(statsTuple);
+ return;
+ }
+
+ while (nbuckets <= mcvsToUse)
+ nbuckets <<= 1;
+ /* use two more bits just to help avoid collisions */
+ nbuckets <<= 2;
+ hashtable->nIMBuckets = nbuckets;
+ hashtable->enableSkewOptimization = true;
+
+ /*
+ * Allocate the bucket memory in the hashtable's batch context
+ * because it is only relevant and necessary during the first batch
+ * and will be nicely removed once the first batch is done.
+ */
+ hashtable->imBucket =
+ MemoryContextAllocZero(hashtable->batchCxt,
+ nbuckets * sizeof(HashJoinIMBucket*));
+ hashtable->imBucketFreezeOrder =
+ MemoryContextAllocZero(hashtable->batchCxt,
+ mcvsToUse * sizeof(uint16));
+ /* Count the overhead of the IM pointers immediately. */
+ hashtable->spaceUsed += nbuckets * sizeof(HashJoinIMBucket*)
+ + mcvsToUse * sizeof(uint16);
+ hashtable->spaceUsedIM += nbuckets * sizeof(HashJoinIMBucket*)
+ + mcvsToUse * sizeof(uint16);
+
+ /*
+ * Grab the hash functions as we will be generating the hashvalues
+ * in this section.
+ */
+ hashfunctions = hashtable->outer_hashfunctions;
+
+ /* Create the buckets */
+ for (i = 0; i < mcvsToUse; i++)
+ {
+ uint32 hashvalue = DatumGetUInt32(
+ FunctionCall1(&hashfunctions[0], values[i]));
+ int bucket = hashvalue & (nbuckets - 1);
+
+ /*
+ * While we have not hit a hole in the hashtable and have not hit
+ * a bucket with the same hashvalue we have collided in the
+ * hashtable so try the next bucket location (remember it is an
+ * open addressing hashtable).
+ */
+ while (hashtable->imBucket[bucket] != NULL
+ && hashtable->imBucket[bucket]->hashvalue != hashvalue)
+ bucket = (bucket + 1) & (nbuckets - 1);
+
+ /*
+ * Leave bucket alone if it has the same hashvalue as current
+ * MCV. We only want one bucket per hashvalue. Even if two MCV
+ * values hash to the same bucket we are fine.
+ */
+ if (hashtable->imBucket[bucket] == NULL)
+ {
+ /*
+ * Allocate the actual bucket structure in the hashtable's batch
+ * context because it is only relevant and necessary during
+ * the first batch and will be nicely removed once the first
+ * batch is done.
+ */
+ hashtable->imBucket[bucket]
+ = MemoryContextAllocZero(hashtable->batchCxt,
+ sizeof(HashJoinIMBucket));
+ hashtable->imBucket[bucket]->hashvalue = hashvalue;
+ hashtable->imBucketFreezeOrder[hashtable->nUsedIMBuckets]
+ = bucket;
+ hashtable->nUsedIMBuckets++;
+ /* Count the overhead of the IM bucket struct */
+ hashtable->spaceUsed += IM_BUCKET_OVERHEAD;
+ hashtable->spaceUsedIM += IM_BUCKET_OVERHEAD;
+ }
+ }
+
+ free_attstatsslot(atttype, values, nvalues, numbers, nnumbers);
+ }
+
+ ReleaseSysCache(statsTuple);
+ }
/* ----------------------------------------------------------------
* ExecHashJoin
***************
*** 147,152 ****
--- 354,364 ----
node->hj_HashOperators);
node->hj_HashTable = hashtable;
+ /* Use skew optimization only when there is more than one batch. */
+ if (hashtable->nbatch > 1 && enable_hashjoin_usestatmcvs)
+ ExecHashJoinDetectSkew(estate, node,
+ (outerPlan((Hash *) hashNode->ps.plan))->plan_width);
+
/*
* execute the Hash node, to build the hash table
*/
***************
*** 205,216 ****
ExecHashGetBucketAndBatch(hashtable, hashvalue,
&node->hj_CurBucketNo, &batchno);
node->hj_CurTuple = NULL;
/*
* Now we've got an outer tuple and the corresponding hash bucket,
! * but this tuple may not belong to the current batch.
*/
! if (batchno != hashtable->curbatch)
{
/*
* Need to postpone this outer tuple to a later batch. Save it
--- 417,440 ----
ExecHashGetBucketAndBatch(hashtable, hashvalue,
&node->hj_CurBucketNo, &batchno);
node->hj_CurTuple = NULL;
+
+ /* Does the outer tuple match an IM bucket? */
+ node->hj_OuterTupleIMBucketNo =
+ ExecHashGetIMBucket(hashtable, hashvalue);
+ if (node->hj_OuterTupleIMBucketNo != IM_INVALID_BUCKET)
+ hashtable->nOuterIMTup++;
+ else if (batchno == 0)
+ hashtable->nOuterBatchZeroTup++;
+ if (batchno == 0)
+ hashtable->nOuterPotentialBatchZeroTup++;
/*
* Now we've got an outer tuple and the corresponding hash bucket,
! * but in might not belong to the current batch, or it might need
! * to go into an in-memory bucket.
*/
! if (node->hj_OuterTupleIMBucketNo == IM_INVALID_BUCKET
! && batchno != hashtable->curbatch)
{
/*
* Need to postpone this outer tuple to a later batch. Save it
***************
*** 281,286 ****
--- 505,517 ----
{
node->js.ps.ps_TupFromTlist =
(isDone == ExprMultipleResult);
+ hashtable->nOutputTup++;
+ if (node->hj_OuterTupleIMBucketNo != IM_INVALID_BUCKET)
+ hashtable->nOutputIMTup++;
+ else if (batchno == 0)
+ hashtable->nOutputBatchZeroTup++;
+ if (batchno == 0)
+ hashtable->nOutputPotentialBatchZeroTup++;
return result;
}
}
***************
*** 582,587 ****
--- 813,819 ----
{
/* remember outer relation is not empty for possible rescan */
hjstate->hj_OuterNotEmpty = true;
+ hashtable->nOuterTup++;
return slot;
}
***************
*** 641,647 ****
nbatch = hashtable->nbatch;
curbatch = hashtable->curbatch;
! if (curbatch > 0)
{
/*
* We no longer need the previous outer batch file; close it right
--- 873,899 ----
nbatch = hashtable->nbatch;
curbatch = hashtable->curbatch;
! /* if we just finished the first batch */
! if (curbatch == 0)
! {
! /*
! * Reset some of the skew optimization state variables. IM buckets are
! * no longer being used as of this point because they are only
! * necessary while joining the first batch (before the cleanup phase).
! *
! * Especially need to make sure ExecHashGetIMBucket returns
! * IM_INVALID_BUCKET quickly for all subsequent calls.
! *
! * IM buckets are only taking up memory if this is a multi-batch join
! * and in that case ExecHashTableReset is about to be called which
! * will free all memory currently used by IM buckets and tuples when
! * it deletes hashtable->batchCxt. If this is a single batch join
! * then imBucket and imBucketFreezeOrder are already NULL and empty.
! */
! hashtable->enableSkewOptimization = false;
! hashtable->spaceUsedIM = 0;
! }
! else if (curbatch > 0)
{
/*
* We no longer need the previous outer batch file; close it right
Index: src/backend/optimizer/path/costsize.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v
retrieving revision 1.203
diff -c -r1.203 costsize.c
*** src/backend/optimizer/path/costsize.c 1 Jan 2009 17:23:43 -0000 1.203
--- src/backend/optimizer/path/costsize.c 13 Jan 2009 08:02:42 -0000
***************
*** 108,113 ****
--- 108,114 ----
bool enable_nestloop = true;
bool enable_mergejoin = true;
bool enable_hashjoin = true;
+ bool enable_hashjoin_usestatmcvs = true;
typedef struct
{
Index: src/backend/utils/misc/guc.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.493
diff -c -r1.493 guc.c
*** src/backend/utils/misc/guc.c 12 Jan 2009 05:10:44 -0000 1.493
--- src/backend/utils/misc/guc.c 13 Jan 2009 08:03:33 -0000
***************
*** 656,661 ****
--- 656,669 ----
true, NULL, NULL
},
{
+ {"enable_hashjoin_usestatmcvs", PGC_USERSET, QUERY_TUNING_METHOD,
+ gettext_noop("Enables the hash join's use of the MCVs stored in pg_statistic."),
+ NULL
+ },
+ &enable_hashjoin_usestatmcvs,
+ true, NULL, NULL
+ },
+ {
{"geqo", PGC_USERSET, QUERY_TUNING_GEQO,
gettext_noop("Enables genetic query optimization."),
gettext_noop("This algorithm attempts to do planning without "
Index: src/include/executor/hashjoin.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/executor/hashjoin.h,v
retrieving revision 1.49
diff -c -r1.49 hashjoin.h
*** src/include/executor/hashjoin.h 1 Jan 2009 17:23:59 -0000 1.49
--- src/include/executor/hashjoin.h 14 Jan 2009 06:42:53 -0000
***************
*** 72,77 ****
--- 72,97 ----
#define HJTUPLE_MINTUPLE(hjtup) \
((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))
+ /*
+ * Stores a hashvalue and linked list of tuples that share that hashvalue.
+ *
+ * When processing MCVs to detect skew in the probe relation of a hash join
+ * the hashvalue is generated and stored before any tuples have been read
+ * (see ExecHashJoinDetectSkew).
+ *
+ * Build tuples that hash to the same hashvalue are placed in the bucket while
+ * reading the build relation.
+ */
+ typedef struct HashJoinIMBucket
+ {
+ uint32 hashvalue;
+ HashJoinTuple tuples;
+ } HashJoinIMBucket;
+
+ #define IM_INVALID_BUCKET -1
+ #define IM_WORK_MEM_PERCENT 2
+ #define IM_MIN_BENEFIT_PERCENT .01
+ #define IM_BUCKET_OVERHEAD MAXALIGN(sizeof(HashJoinIMBucket))
typedef struct HashJoinTableData
{
***************
*** 113,121 ****
--- 133,201 ----
Size spaceUsed; /* memory space currently used by tuples */
Size spaceAllowed; /* upper limit for space used */
+ /* memory space currently used by IM buckets and tuples */
+ Size spaceUsedIM;
+ /* upper limit for space used by IM buckets and tuples */
+ Size spaceAllowedIM;
MemoryContext hashCxt; /* context for whole-hash-join storage */
MemoryContext batchCxt; /* context for this-batch-only storage */
+
+ /* will the join optimize memory usage when probe relation is skewed */
+ bool enableSkewOptimization;
+ HashJoinIMBucket **imBucket; /* hashtable of IM buckets */
+ /*
+ * array of imBucket indexes to the created IM buckets sorted
+ * in the opposite order that they would be frozen to disk
+ */
+ uint16 *imBucketFreezeOrder;
+ int nIMBuckets; /* # of buckets in the IM buckets hashtable */
+ /*
+ * # of used buckets in the IM buckets hashtable and length of
+ * imBucketFreezeOrder array
+ */
+ int nUsedIMBuckets;
+ /* # of IM buckets that have already been frozen to disk */
+ int nIMBucketsFrozen;
+
+ /* the total # of inner tuples received by join */
+ uint32 nInnerTup;
+ /* total # of outer tuples received by join */
+ uint32 nOuterTup;
+ /* # inner tuples in the IM buckets */
+ uint32 nInnerIMTup;
+ /* # outer tuples that matched with the IM buckets */
+ uint32 nOuterIMTup;
+ /* total # output tuples produced by join */
+ uint32 nOutputTup;
+ /* # output tuples that came from matches with IM bucket inner tuples */
+ uint32 nOutputIMTup;
+ /*
+ * # of inner IM tuples that were frozen back to the main hashtable when
+ * an IM bucket was frozen
+ */
+ uint32 nInnerIMTupFrozen;
+ /* # outer tuples that matched with batch 0 */
+ uint32 nOuterBatchZeroTup;
+ /*
+ * # outer tuples that would have fallen in batch 0 if IM buckets were
+ * not in use at all
+ */
+ uint32 nOuterPotentialBatchZeroTup;
+ /* # output tuples that came from matches with batch 0 inner tuples */
+ uint32 nOutputBatchZeroTup;
+ /*
+ * # output tuples that would have came from matches with batch 0 if IM
+ * buckets were not in use at all
+ */
+ uint32 nOutputPotentialBatchZeroTup;
+ /* # inner tuples that fell in batch 0 */
+ uint32 nInnerBatchZeroTup;
+ /*
+ * # inner tuples that would have fallen in batch 0 if IM buckets were
+ * not in use at all
+ */
+ uint32 nInnerPotentialBatchZeroTup;
} HashJoinTableData;
#endif /* HASHJOIN_H */
Index: src/include/executor/nodeHash.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/executor/nodeHash.h,v
retrieving revision 1.46
diff -c -r1.46 nodeHash.h
*** src/include/executor/nodeHash.h 1 Jan 2009 17:23:59 -0000 1.46
--- src/include/executor/nodeHash.h 6 Jan 2009 23:29:18 -0000
***************
*** 45,48 ****
--- 45,50 ----
int *numbuckets,
int *numbatches);
+ extern int ExecHashGetIMBucket(HashJoinTable hashtable, uint32 hashvalue);
+
#endif /* NODEHASH_H */
Index: src/include/nodes/execnodes.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/nodes/execnodes.h,v
retrieving revision 1.201
diff -c -r1.201 execnodes.h
*** src/include/nodes/execnodes.h 12 Jan 2009 05:10:45 -0000 1.201
--- src/include/nodes/execnodes.h 12 Jan 2009 20:29:58 -0000
***************
*** 1389,1394 ****
--- 1389,1395 ----
* hj_NeedNewOuter true if need new outer tuple on next call
* hj_MatchedOuter true if found a join match for current outer
* hj_OuterNotEmpty true if outer relation known not empty
+ * hj_OuterTupleIMBucketNo IM bucket# for the current outer tuple
* ----------------
*/
***************
*** 1414,1419 ****
--- 1415,1421 ----
bool hj_NeedNewOuter;
bool hj_MatchedOuter;
bool hj_OuterNotEmpty;
+ int hj_OuterTupleIMBucketNo;
} HashJoinState;
Index: src/include/nodes/primnodes.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/nodes/primnodes.h,v
retrieving revision 1.145
diff -c -r1.145 primnodes.h
*** src/include/nodes/primnodes.h 1 Jan 2009 17:24:00 -0000 1.145
--- src/include/nodes/primnodes.h 7 Jan 2009 05:48:16 -0000
***************
*** 121,128 ****
* subplans; for example, in a join node varno becomes INNER or OUTER and
* varattno becomes the index of the proper element of that subplan's target
* list. But varnoold/varoattno continue to hold the original values.
! * The code doesn't really need varnoold/varoattno, but they are very useful
! * for debugging and interpreting completed plans, so we keep them around.
*/
#define INNER 65000
#define OUTER 65001
--- 121,132 ----
* subplans; for example, in a join node varno becomes INNER or OUTER and
* varattno becomes the index of the proper element of that subplan's target
* list. But varnoold/varoattno continue to hold the original values.
! *
! * For the most part, the code doesn't really need varnoold/varoattno, but
! * they are very useful for debugging and interpreting completed plans, so we
! * keep them around. As of PostgreSQL 8.4, these values are also used by
! * ExecHashJoinDetectSkew to fetch MCV statistics when performing multi-batch
! * hash joins.
*/
#define INNER 65000
#define OUTER 65001
***************
*** 142,148 ****
Index varlevelsup; /* for subquery variables referencing outer
* relations; 0 in a normal var, >0 means N
* levels up */
! Index varnoold; /* original value of varno, for debugging */
AttrNumber varoattno; /* original value of varattno */
int location; /* token location, or -1 if unknown */
} Var;
--- 146,152 ----
Index varlevelsup; /* for subquery variables referencing outer
* relations; 0 in a normal var, >0 means N
* levels up */
! Index varnoold; /* original value of varno */
AttrNumber varoattno; /* original value of varattno */
int location; /* token location, or -1 if unknown */
} Var;
Index: src/include/optimizer/cost.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/optimizer/cost.h,v
retrieving revision 1.96
diff -c -r1.96 cost.h
*** src/include/optimizer/cost.h 7 Jan 2009 22:40:49 -0000 1.96
--- src/include/optimizer/cost.h 13 Jan 2009 08:02:06 -0000
***************
*** 59,64 ****
--- 59,65 ----
extern bool enable_nestloop;
extern bool enable_mergejoin;
extern bool enable_hashjoin;
+ extern bool enable_hashjoin_usestatmcvs;
extern int constraint_exclusion;
extern double clamp_row_est(double nrows);