Help with Join Performance Testing

Started by Lawrence, Ramonalmost 17 years ago1 messages
ramon.lawrence@ubc.ca
almost 17 years ago
2 attachment(s)
A hash join modification patch is under review for 8.4 that needs
performance testing. We would appreciate help with this testing.
A testing version of the patch is attached in addition to testing
instructions and where to retrieve a sample data set. The basic idea
of the patch is that it reduces disk operations for large multi-batch
hash joins where there is skew in the probe relation. The patch
collects statistics on performance benefits when using the optimization.
Ramon Lawrence and Bryce Cutt
Attachments:

README_Testing.txttext/plain; name=README_Testing.txtDownload
histojoin_testing.patchapplication/octet-stream; name=histojoin_testing.patchDownload
Index: src/backend/executor/nodeHash.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeHash.c,v
retrieving revision 1.117
diff -c -r1.117 nodeHash.c
*** src/backend/executor/nodeHash.c	1 Jan 2009 17:23:41 -0000	1.117
--- src/backend/executor/nodeHash.c	14 Jan 2009 06:36:51 -0000
***************
*** 53,58 ****
--- 53,224 ----
  	return NULL;
  }
  
+ /*
+  * ----------------------------------------------------------------
+  *		ExecHashGetIMBucket
+  *
+  *  	Returns the index of the in-memory bucket for this
+  *		hashvalue, or IM_INVALID_BUCKET if the hashvalue is not
+  *		associated with any unfrozen bucket (or if skew
+  *		optimization is not being used).
+  *  
+  *		It is possible for a tuple whose join attribute value is
+  *		not a MCV to hash to an in-memory bucket due to the limited
+  * 		number of hash values but it is unlikely and everything
+  *		continues to work even if it does happen. We would
+  *		accidentally cache some less optimal tuples in memory
+  *		but the join result would still be accurate.
+  *
+  *		hashtable->imBucket is an open addressing hashtable of
+  *		in-memory buckets (HashJoinIMBucket).
+  * ----------------------------------------------------------------
+  */
+ int 
+ ExecHashGetIMBucket(HashJoinTable hashtable, uint32 hashvalue)
+ {
+ 	int bucket;
+ 
+ 	if (!hashtable->enableSkewOptimization)
+ 		return IM_INVALID_BUCKET;
+ 	
+ 	/* Modulo the hashvalue (using bitmask) to find the IM bucket. */
+ 	bucket = hashvalue & (hashtable->nIMBuckets - 1);
+ 
+ 	/*
+ 	 * While we have not hit a hole in the hashtable and have not hit the 
+ 	 * actual bucket we have collided in the hashtable so try the next
+ 	 * bucket location.
+ 	 */
+ 	while (hashtable->imBucket[bucket] != NULL
+ 		&& hashtable->imBucket[bucket]->hashvalue != hashvalue)
+ 		bucket = (bucket + 1) & (hashtable->nIMBuckets - 1);
+ 
+ 	/*
+ 	 * If the bucket exists and has been correctly determined return
+ 	 * the bucket index.
+ 	 */
+ 	if (hashtable->imBucket[bucket] != NULL
+ 		&& hashtable->imBucket[bucket]->hashvalue == hashvalue)
+ 		return bucket;
+ 
+ 	/*
+ 	 * Must have run into an empty location or a frozen bucket which means the
+ 	 * tuple with this hashvalue is not to be handled as if it matches with an
+ 	 * in-memory bucket.
+ 	 */
+ 	return IM_INVALID_BUCKET;
+ }
+ 
+ /*
+  * ----------------------------------------------------------------
+  *		ExecHashFreezeNextIMBucket
+  *
+  *		Freeze the tuples of the next in-memory bucket by pushing
+  *		them into the main hashtable.  Buckets are frozen in order
+  *		so that the best tuples are kept in memory the longest.
+  * ----------------------------------------------------------------
+  */
+ static bool 
+ ExecHashFreezeNextIMBucket(HashJoinTable hashtable)
+ {
+ 	int bucketToFreeze;
+ 	int bucketno;
+ 	int batchno;
+ 	uint32 hashvalue;
+ 	HashJoinTuple hashTuple;
+ 	HashJoinTuple nextHashTuple;
+ 	HashJoinIMBucket *bucket;
+ 	MinimalTuple mintuple;
+ 
+ 	/* Calculate the imBucket index of the bucket to freeze. */
+ 	bucketToFreeze = hashtable->imBucketFreezeOrder
+ 		[hashtable->nUsedIMBuckets - 1 - hashtable->nIMBucketsFrozen];
+ 
+ 	/* Grab a pointer to the actual IM bucket. */
+ 	bucket = hashtable->imBucket[bucketToFreeze];
+ 	hashvalue = bucket->hashvalue;
+ 
+ 	/*
+ 	 * Grab a pointer to the first tuple in the soon to be frozen IM bucket.
+ 	 */
+ 	hashTuple = bucket->tuples;
+ 
+ 	/*
+ 	 * Calculate which bucket and batch the tuples belong to in the main
+ 	 * non-IM hashtable.
+ 	 */
+ 	ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);
+ 
+ 	/* until we have read all tuples from this bucket */
+ 	while (hashTuple != NULL)
+ 	{
+ 		/*
+ 		 * Some of this code is very similar to that of ExecHashTableInsert.
+ 		 * We do not call ExecHashTableInsert directly as
+ 		 * ExecHashTableInsert expects a TupleTableSlot and we already have
+ 		 * HashJoinTuples.
+ 		 */
+ 		mintuple = HJTUPLE_MINTUPLE(hashTuple);
+ 
+ 		hashtable->nInnerIMTupFrozen++;
+ 
+ 		/* Decide whether to put the tuple in the hash table or a temp file. */
+ 		if (batchno == hashtable->curbatch)
+ 		{
+ 			/* Put the tuple in hash table. */
+ 			nextHashTuple = hashTuple->next;
+ 			hashTuple->next = hashtable->buckets[bucketno];
+ 			hashtable->buckets[bucketno] = hashTuple;
+ 			hashTuple = nextHashTuple;
+ 			hashtable->spaceUsedIM -= HJTUPLE_OVERHEAD + mintuple->t_len;
+ 		}
+ 		else
+ 		{
+ 			/* Put the tuples into a temp file for later batches. */
+ 			Assert(batchno > hashtable->curbatch);
+ 			ExecHashJoinSaveTuple(mintuple, hashvalue,
+ 								  &hashtable->innerBatchFile[batchno]);
+ 			/*
+ 			 * Some memory has been freed up. This must be done before we
+ 			 * pfree the hashTuple of we lose access to the tuple size.
+ 			 */
+ 			hashtable->spaceUsed -= HJTUPLE_OVERHEAD + mintuple->t_len;
+ 			hashtable->spaceUsedIM -= HJTUPLE_OVERHEAD + mintuple->t_len;
+ 			nextHashTuple = hashTuple->next;
+ 			pfree(hashTuple);
+ 			hashTuple = nextHashTuple;
+ 		}
+ 	}
+ 
+ 	/*
+ 	 * Free the memory the bucket struct was using as it is not necessary
+ 	 * any more.  All code treats a frozen in-memory bucket the same as one
+ 	 * that did not exist; by setting the pointer to null the rest of the code
+ 	 * will function as if we had not created this bucket at all.
+ 	 */
+ 	pfree(bucket);
+ 	hashtable->imBucket[bucketToFreeze] = NULL;
+ 	hashtable->spaceUsed -= IM_BUCKET_OVERHEAD;
+ 	hashtable->spaceUsedIM -= IM_BUCKET_OVERHEAD;
+ 	hashtable->nIMBucketsFrozen++;
+ 
+ 	/*
+ 	 * All buckets have been frozen and deleted from memory so turn off
+ 	 * skew aware partitioning and remove the structs from memory as they are
+ 	 * just wasting space from now on.
+ 	 */
+ 	if (hashtable->nUsedIMBuckets == hashtable->nIMBucketsFrozen)
+ 	{
+ 		hashtable->enableSkewOptimization = false;
+ 		pfree(hashtable->imBucket);
+ 		pfree(hashtable->imBucketFreezeOrder);
+ 		hashtable->spaceUsed -= hashtable->spaceUsedIM;
+ 		hashtable->spaceUsedIM = 0;
+ 	}
+ 
+ 	return true;
+ }
+ 
  /* ----------------------------------------------------------------
   *		MultiExecHash
   *
***************
*** 69,74 ****
--- 235,242 ----
  	TupleTableSlot *slot;
  	ExprContext *econtext;
  	uint32		hashvalue;
+ 	MinimalTuple mintuple;
+ 	int bucketNumber;
  
  	/* must provide our own instrumentation support */
  	if (node->ps.instrument)
***************
*** 99,106 ****
  		if (ExecHashGetHashValue(hashtable, econtext, hashkeys, false, false,
  								 &hashvalue))
  		{
! 			ExecHashTableInsert(hashtable, slot, hashvalue);
! 			hashtable->totalTuples += 1;
  		}
  	}
  
--- 267,322 ----
  		if (ExecHashGetHashValue(hashtable, econtext, hashkeys, false, false,
  								 &hashvalue))
  		{
! 			int bucketno;
! 			int batchno;
! 
! 			bucketNumber = ExecHashGetIMBucket(hashtable, hashvalue);
! 
! 			ExecHashGetBucketAndBatch(hashtable, hashvalue,
! 									  &bucketno, &batchno);
! 			if (batchno == 0)
! 			{
! 				hashtable->nInnerPotentialBatchZeroTup++;
! 				if (bucketNumber == IM_INVALID_BUCKET)
! 					hashtable->nInnerBatchZeroTup++;
! 			}
! 
! 			/* handle tuples not destined for an in-memory bucket normally */
! 			if (bucketNumber == IM_INVALID_BUCKET)
! 				ExecHashTableInsert(hashtable, slot, hashvalue);
! 			else
! 			{
! 				HashJoinTuple hashTuple;
! 				int			hashTupleSize;
! 				
! 				/* get the HashJoinTuple */
! 				mintuple = ExecFetchSlotMinimalTuple(slot);
! 				hashTupleSize = HJTUPLE_OVERHEAD + mintuple->t_len;
! 				hashTuple = (HashJoinTuple)
! 					MemoryContextAlloc(hashtable->batchCxt, hashTupleSize);
! 				hashTuple->hashvalue = hashvalue;
! 				memcpy(HJTUPLE_MINTUPLE(hashTuple), mintuple, mintuple->t_len);
! 
! 				/* Push the HashJoinTuple onto the front of the IM bucket. */
! 				hashTuple->next = hashtable->imBucket[bucketNumber]->tuples;
! 				hashtable->imBucket[bucketNumber]->tuples = hashTuple;
! 				
! 				/*
! 				 * More memory is now in use so make sure we are not over
! 				 * spaceAllowedIM.
! 				 */
! 				hashtable->spaceUsed += hashTupleSize;
! 				hashtable->spaceUsedIM += hashTupleSize;
! 				hashtable->nInnerIMTup++;
! 				while (hashtable->spaceUsedIM > hashtable->spaceAllowedIM
! 					&& ExecHashFreezeNextIMBucket(hashtable))
! 					;
! 				/* Guarantee we are not over the spaceAllowed. */
! 				if (hashtable->spaceUsed > hashtable->spaceAllowed)
! 					ExecHashIncreaseNumBatches(hashtable);
! 			}
! 			hashtable->totalTuples++;
! 			hashtable->nInnerTup++;
  		}
  	}
  
***************
*** 269,274 ****
--- 485,512 ----
  	hashtable->outerBatchFile = NULL;
  	hashtable->spaceUsed = 0;
  	hashtable->spaceAllowed = work_mem * 1024L;
+ 	/* Initialize skew optimization related hashtable variables. */
+ 	hashtable->spaceUsedIM = 0;
+ 	hashtable->spaceAllowedIM
+ 		= hashtable->spaceAllowed * IM_WORK_MEM_PERCENT / 100;
+ 	hashtable->enableSkewOptimization = false;
+ 	hashtable->nUsedIMBuckets = 0;
+ 	hashtable->nIMBuckets = 0;
+ 	hashtable->imBucket = NULL;
+ 	hashtable->nIMBucketsFrozen = 0;
+ 	hashtable->nInnerTup = 0;
+ 	hashtable->nOuterTup = 0;
+ 	hashtable->nInnerIMTup = 0;
+ 	hashtable->nOuterIMTup = 0;
+ 	hashtable->nOutputTup = 0;
+ 	hashtable->nOutputIMTup = 0;
+ 	hashtable->nInnerIMTupFrozen = 0;
+ 	hashtable->nOuterBatchZeroTup = 0;
+ 	hashtable->nOuterPotentialBatchZeroTup = 0;
+ 	hashtable->nOutputBatchZeroTup = 0;
+ 	hashtable->nOutputPotentialBatchZeroTup = 0;
+ 	hashtable->nInnerBatchZeroTup = 0;
+ 	hashtable->nInnerPotentialBatchZeroTup = 0;
  
  	/*
  	 * Get info about the hash functions to be used for each hash key. Also
***************
*** 448,453 ****
--- 686,754 ----
  {
  	int			i;
  
+ 	/* the total # of inner tuples received by join */
+ 	ereport(NOTICE, (errmsg("Total Inner Tuples: %d", hashtable->nInnerTup)));
+ 	/* # inner tuples in the IM buckets */
+ 	ereport(NOTICE, (errmsg("IM Inner Tuples: %d", hashtable->nInnerIMTup)));
+ 	/* # inner tuples that fell in batch 0 */
+ 	ereport(NOTICE, (errmsg("Batch Zero Inner Tuples: %d", 
+ 		hashtable->nInnerBatchZeroTup)));
+ 	/*
+ 	 * # inner tuples that would have fallen in batch 0 if IM buckets were
+ 	 * not in use at all
+ 	 */
+ 	ereport(NOTICE, (errmsg("Batch Zero Potential Inner Tuples: %d", 
+ 		hashtable->nInnerPotentialBatchZeroTup)));
+ 	/* total # of outer tuples received by join */
+ 	ereport(NOTICE, (errmsg("Total Outer Tuples: %d", hashtable->nOuterTup)));
+ 	/* # outer tuples that matched with the IM buckets */
+ 	ereport(NOTICE, (errmsg("IM Outer Tuples: %d", hashtable->nOuterIMTup)));
+ 	/* # outer tuples that matched with batch 0 */
+ 	ereport(NOTICE, (errmsg("Batch Zero Outer Tuples: %d", 
+ 		hashtable->nOuterBatchZeroTup)));
+ 	/*
+ 	 * # outer tuples that would have fallen in batch 0 if IM buckets were
+ 	 * not in use at all
+ 	 */
+ 	ereport(NOTICE, (errmsg("Batch Zero Potential Outer Tuples: %d", 
+ 		hashtable->nOuterPotentialBatchZeroTup)));
+ 	/* total # output tuples produced by join */
+ 	ereport(NOTICE, (errmsg("Total Output Tuples: %d", 
+ 		hashtable->nOutputTup)));
+ 	/* # output tuples that came from matches with IM bucket inner tuples */
+ 	ereport(NOTICE, (errmsg("IM Output Tuples: %d", 
+ 		hashtable->nOutputIMTup)));
+ 	/* # output tuples that came from matches with batch 0 inner tuples */
+ 	ereport(NOTICE, (errmsg("Batch Zero Output Tuples: %d", 
+ 		hashtable->nOutputBatchZeroTup)));
+ 	/*
+ 	 * # output tuples that would have came from matches with batch 0 if IM
+ 	 * buckets were not in use at all
+ 	 */
+ 	ereport(NOTICE, (errmsg("Batch Zero Potential Output Tuples: %d", 
+ 		hashtable->nOutputPotentialBatchZeroTup)));
+ 	/*
+ 	 * # of inner IM tuples that were frozen back to the main hashtable when
+ 	 * an IM bucket was frozen
+ 	 */
+ 	ereport(NOTICE, (errmsg("IM Tuples Frozen: %d", 
+ 		hashtable->nInnerIMTupFrozen)));
+ 	/* percentage less tuple IOs compared to HHJ due to skew optimization */
+ 	if (hashtable->nInnerTup - hashtable->nInnerPotentialBatchZeroTup
+ 			+ hashtable->nOuterTup - hashtable->nOuterPotentialBatchZeroTup
+ 			!= 0)
+ 		ereport(NOTICE, (errmsg("Percentage less tuple IOs than HHJ: %4.2f", 
+ 			(1 - (
+ 			(float)(hashtable->nInnerTup - hashtable->nInnerIMTup
+ 				- hashtable->nInnerBatchZeroTup 
+ 				+ hashtable->nOuterTup - hashtable->nOuterIMTup
+ 				- hashtable->nOuterBatchZeroTup)
+ 			/
+ 			(hashtable->nInnerTup - hashtable->nInnerPotentialBatchZeroTup
+ 				+ hashtable->nOuterTup
+ 				- hashtable->nOuterPotentialBatchZeroTup))
+ 			) * 100)));
+ 
  	/*
  	 * Make sure all the temp files are closed.  We skip batch 0, since it
  	 * can't have any temp files (and the arrays might not even exist if
***************
*** 815,825 ****
  	/*
  	 * hj_CurTuple is NULL to start scanning a new bucket, or the address of
  	 * the last tuple returned from the current bucket.
  	 */
! 	if (hashTuple == NULL)
! 		hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];
! 	else
  		hashTuple = hashTuple->next;
  
  	while (hashTuple != NULL)
  	{
--- 1116,1132 ----
  	/*
  	 * hj_CurTuple is NULL to start scanning a new bucket, or the address of
  	 * the last tuple returned from the current bucket.
+ 	 *
+ 	 * If the tuple hashed to an IM bucket then scan the IM bucket
+ 	 * otherwise scan the standard hashtable bucket.
  	 */
! 	if (hashTuple != NULL)
  		hashTuple = hashTuple->next;
+ 	else if (hjstate->hj_OuterTupleIMBucketNo != IM_INVALID_BUCKET)
+ 		hashTuple = hashtable->imBucket[hjstate->hj_OuterTupleIMBucketNo]
+ 			->tuples;
+ 	else
+ 		hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];
  
  	while (hashTuple != NULL)
  	{
Index: src/backend/executor/nodeHashjoin.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeHashjoin.c,v
retrieving revision 1.97
diff -c -r1.97 nodeHashjoin.c
*** src/backend/executor/nodeHashjoin.c	1 Jan 2009 17:23:41 -0000	1.97
--- src/backend/executor/nodeHashjoin.c	13 Jan 2009 21:28:41 -0000
***************
*** 20,25 ****
--- 20,30 ----
  #include "executor/nodeHash.h"
  #include "executor/nodeHashjoin.h"
  #include "utils/memutils.h"
+ #include "utils/syscache.h"
+ #include "utils/lsyscache.h"
+ #include "parser/parsetree.h"
+ #include "catalog/pg_statistic.h"
+ #include "optimizer/cost.h"
  
  
  /* Returns true for JOIN_LEFT and JOIN_ANTI jointypes */
***************
*** 34,39 ****
--- 39,246 ----
  						  TupleTableSlot *tupleSlot);
  static int	ExecHashJoinNewBatch(HashJoinState *hjstate);
  
+ /*
+  * ----------------------------------------------------------------
+  *		ExecHashJoinDetectSkew
+  *
+  *		If MCV statistics can be found for the join attribute of
+  *		this hashjoin then create a hash table of buckets. Each
+  *		bucket will correspond to an MCV hashvalue and will be
+  *		filled with inner relation tuples whose join attribute
+  *		hashes to the same value as that MCV.  If a join
+  *		attribute value is a MCV for the join attribute in the
+  *		outer (probe) relation, tuples with this value in the
+  *		inner (build) relation are more likely to join with outer
+  *		relation tuples and a benefit can be gained by keeping
+  *		them in memory while joining the first batch of tuples.
+  * ----------------------------------------------------------------
+  */
+ static void 
+ ExecHashJoinDetectSkew(EState *estate, HashJoinState *hjstate, int tupwidth)
+ {
+ 	HeapTupleData	*statsTuple;
+ 	FuncExprState	*clause;
+ 	ExprState		*argstate;
+ 	Var				*variable;
+ 	HashJoinTable	hashtable;
+ 	Datum			*values;
+ 	int				nvalues;
+ 	float4			*numbers;
+ 	int				nnumbers;
+ 	Oid				relid;
+ 	Oid				atttype;
+ 	int				i;
+ 	int				mcvsToUse;
+ 
+ 	PlanState  *outerNode = outerPlanState(hjstate);
+ 
+ 	/* Only use statistics if there is a single join attribute. */
+ 	if (hjstate->hashclauses->length != 1)
+ 		return; /* Histojoin is not defined for more than one join key */
+ 	
+ 	hashtable = hjstate->hj_HashTable;
+ 	
+ 	/*
+ 	 * Estimate the number of IM buckets that will fit in
+ 	 * the memory allowed for IM buckets.
+ 	 *
+ 	 * hashtable->imBucket will have up to 8 times as many HashJoinIMBucket
+ 	 * pointers as the number of MCV hashvalues. A uint16 index in
+ 	 * hashtable->imBucketFreezeOrder will be created for each IM bucket. One
+ 	 * actual HashJoinIMBucket struct will be created for each
+ 	 * unique MCV hashvalue so up to one struct per MCV.
+ 	 *
+ 	 * It is also estimated that each IM bucket will have a single build
+ 	 * tuple stored in it after partitioning the build relation input.  This
+ 	 * estimate could be high if tuples are filtered out before this join but
+ 	 * in that case the extra memory is used by the regular hashjoin batch.
+ 	 * This estimate could be low if it is a many to many join but in that
+ 	 * case IM buckets will be frozen to free up memory as needed
+ 	 * during the inner relation partitioning phase.
+ 	 */
+ 	mcvsToUse = hashtable->spaceAllowedIM / (
+ 		/* size of a hash tuple */
+ 		HJTUPLE_OVERHEAD + MAXALIGN(sizeof(MinimalTupleData))
+ 			+ MAXALIGN(tupwidth)
+ 		/* max size of hashtable pointers per MCV */
+ 		+ (8 * sizeof(HashJoinIMBucket*))
+ 		+ sizeof(uint16) /* size of imBucketFreezeOrder entry */
+ 		+ IM_BUCKET_OVERHEAD /* size of IM bucket struct */
+ 		);
+ 	if (mcvsToUse == 0)
+ 		return;	/* No point in considering this any further. */
+ 
+ 	/*
+ 	 * Determine the relation id and attribute id of the single join
+ 	 * attribute of the probe relation.
+ 	 */
+ 	clause = (FuncExprState *) lfirst(list_head(hjstate->hashclauses));
+ 	argstate = (ExprState *) lfirst(list_head(clause->args));
+ 
+ 	/*
+ 	 * Do not try to exploit stats if the join attribute is an expression
+ 	 * instead of just a simple attribute.
+ 	 */		
+ 	if (argstate->expr->type != T_Var)
+ 		return;
+ 
+ 	variable = (Var *) argstate->expr;
+ 	relid = getrelid(variable->varnoold, estate->es_range_table);
+ 	atttype = variable->vartype;
+ 
+ 	statsTuple = SearchSysCache(STATRELATT, ObjectIdGetDatum(relid),
+ 		Int16GetDatum(variable->varoattno), 0, 0);
+ 	if (!HeapTupleIsValid(statsTuple))
+ 		return;
+ 
+ 	/* Look for MCV statistics for the attribute. */
+ 	if (get_attstatsslot(statsTuple, atttype, variable->vartypmod,
+ 		STATISTIC_KIND_MCV, InvalidOid, &values, &nvalues,
+ 		&numbers, &nnumbers))
+ 	{
+ 		FmgrInfo   *hashfunctions;
+ 		int nbuckets = 2;
+ 		double frac = 0;
+ 		
+ 		/*
+ 		 * IM buckets (imBucket) is an open addressing hashtable with a 
+ 		 * power of 2 size that is greater than the number of MCV values.
+ 		 */
+ 		if (mcvsToUse > nvalues)
+ 			mcvsToUse = nvalues;
+ 			
+ 		for (i = 0; i < mcvsToUse; i++)
+ 			frac += numbers[i];
+ 
+ 		ereport(NOTICE, (errmsg("Values: %d Skew: %4.2f  Est. tuples: %4.2f Batches: %d  Est. Save: %4.2f",
+                         nvalues, frac, outerNode->plan->plan_rows, hashtable->nbatch, 
+ 			(frac*(1-1.0/hashtable->nbatch)*outerNode->plan->plan_rows))));
+ 		
+ 		if (frac < IM_MIN_BENEFIT_PERCENT)
+ 		{
+ 			free_attstatsslot(atttype, values, nvalues, numbers, nnumbers);
+ 			ReleaseSysCache(statsTuple);
+ 			return;
+ 		}
+ 		
+ 		while (nbuckets <= mcvsToUse)
+ 			nbuckets <<= 1;
+ 		/* use two more bits just to help avoid collisions */
+ 		nbuckets <<= 2;
+ 		hashtable->nIMBuckets = nbuckets;
+ 		hashtable->enableSkewOptimization = true;
+ 
+ 		/*
+ 		 * Allocate the bucket memory in the hashtable's batch context
+ 		 * because it is only relevant and necessary during the first batch
+ 		 * and will be nicely removed once the first batch is done.
+ 		 */
+ 		hashtable->imBucket = 
+ 			MemoryContextAllocZero(hashtable->batchCxt,
+ 				nbuckets * sizeof(HashJoinIMBucket*));
+ 		hashtable->imBucketFreezeOrder = 
+ 			MemoryContextAllocZero(hashtable->batchCxt,
+ 				mcvsToUse * sizeof(uint16));
+ 		/* Count the overhead of the IM pointers immediately. */
+ 		hashtable->spaceUsed += nbuckets * sizeof(HashJoinIMBucket*)
+ 			+ mcvsToUse * sizeof(uint16);
+ 		hashtable->spaceUsedIM +=  nbuckets * sizeof(HashJoinIMBucket*)
+ 			+ mcvsToUse * sizeof(uint16);
+ 
+ 		/*
+ 		 * Grab the hash functions as we will be generating the hashvalues
+ 		 * in this section.
+ 		 */
+ 		hashfunctions = hashtable->outer_hashfunctions;
+ 
+ 		/* Create the buckets */
+ 		for (i = 0; i < mcvsToUse; i++)
+ 		{
+ 			uint32 hashvalue = DatumGetUInt32(
+ 				FunctionCall1(&hashfunctions[0], values[i]));
+ 			int bucket = hashvalue & (nbuckets - 1);
+ 
+ 			/*
+ 			 * While we have not hit a hole in the hashtable and have not hit
+ 			 * a bucket with the same hashvalue we have collided in the
+ 			 * hashtable so try the next bucket location (remember it is an
+ 			 * open addressing hashtable).
+ 			 */
+ 			while (hashtable->imBucket[bucket] != NULL
+ 				&& hashtable->imBucket[bucket]->hashvalue != hashvalue)
+ 				bucket = (bucket + 1) & (nbuckets - 1);
+ 
+ 			/*
+ 			 * Leave bucket alone if it has the same hashvalue as current
+ 			 * MCV. We only want one bucket per hashvalue. Even if two MCV
+ 			 * values hash to the same bucket we are fine.
+ 			 */
+ 			if (hashtable->imBucket[bucket] == NULL)
+ 			{
+ 				/*
+ 				 * Allocate the actual bucket structure in the hashtable's batch
+ 				 * context because it is only relevant and necessary during
+ 				 * the first batch and will be nicely removed once the first
+ 				 * batch is done.
+ 				 */
+ 				hashtable->imBucket[bucket]
+ 					= MemoryContextAllocZero(hashtable->batchCxt,
+ 						sizeof(HashJoinIMBucket));
+ 				hashtable->imBucket[bucket]->hashvalue = hashvalue;
+ 				hashtable->imBucketFreezeOrder[hashtable->nUsedIMBuckets]
+ 					= bucket;
+ 				hashtable->nUsedIMBuckets++;
+ 				/* Count the overhead of the IM bucket struct */
+ 				hashtable->spaceUsed += IM_BUCKET_OVERHEAD;
+ 				hashtable->spaceUsedIM += IM_BUCKET_OVERHEAD;
+ 			}
+ 		}
+ 
+ 		free_attstatsslot(atttype, values, nvalues, numbers, nnumbers);
+ 	}
+ 
+ 	ReleaseSysCache(statsTuple);
+ }
  
  /* ----------------------------------------------------------------
   *		ExecHashJoin
***************
*** 147,152 ****
--- 354,364 ----
  										node->hj_HashOperators);
  		node->hj_HashTable = hashtable;
  
+ 		/* Use skew optimization only when there is more than one batch. */
+ 		if (hashtable->nbatch > 1 && enable_hashjoin_usestatmcvs)
+ 			ExecHashJoinDetectSkew(estate, node,
+ 				(outerPlan((Hash *) hashNode->ps.plan))->plan_width);
+ 
  		/*
  		 * execute the Hash node, to build the hash table
  		 */
***************
*** 205,216 ****
  			ExecHashGetBucketAndBatch(hashtable, hashvalue,
  									  &node->hj_CurBucketNo, &batchno);
  			node->hj_CurTuple = NULL;
  
  			/*
  			 * Now we've got an outer tuple and the corresponding hash bucket,
! 			 * but this tuple may not belong to the current batch.
  			 */
! 			if (batchno != hashtable->curbatch)
  			{
  				/*
  				 * Need to postpone this outer tuple to a later batch. Save it
--- 417,440 ----
  			ExecHashGetBucketAndBatch(hashtable, hashvalue,
  									  &node->hj_CurBucketNo, &batchno);
  			node->hj_CurTuple = NULL;
+ 			
+ 			/* Does the outer tuple match an IM bucket? */
+ 			node->hj_OuterTupleIMBucketNo = 
+ 				ExecHashGetIMBucket(hashtable, hashvalue);
+ 			if (node->hj_OuterTupleIMBucketNo != IM_INVALID_BUCKET)
+ 				hashtable->nOuterIMTup++;
+ 			else if (batchno == 0)
+ 				hashtable->nOuterBatchZeroTup++;
+ 			if (batchno == 0)
+ 				hashtable->nOuterPotentialBatchZeroTup++;
  
  			/*
  			 * Now we've got an outer tuple and the corresponding hash bucket,
! 			 * but in might not belong to the current batch, or it might need
! 			 * to go into an in-memory bucket.
  			 */
! 			if (node->hj_OuterTupleIMBucketNo == IM_INVALID_BUCKET
! 				&& batchno != hashtable->curbatch)
  			{
  				/*
  				 * Need to postpone this outer tuple to a later batch. Save it
***************
*** 281,286 ****
--- 505,517 ----
  					{
  						node->js.ps.ps_TupFromTlist =
  							(isDone == ExprMultipleResult);
+ 						hashtable->nOutputTup++;
+ 						if (node->hj_OuterTupleIMBucketNo != IM_INVALID_BUCKET)
+ 							hashtable->nOutputIMTup++;
+ 						else if (batchno == 0)
+ 							hashtable->nOutputBatchZeroTup++;
+ 						if (batchno == 0)
+ 							hashtable->nOutputPotentialBatchZeroTup++;
  						return result;
  					}
  				}
***************
*** 582,587 ****
--- 813,819 ----
  			{
  				/* remember outer relation is not empty for possible rescan */
  				hjstate->hj_OuterNotEmpty = true;
+ 				hashtable->nOuterTup++;
  
  				return slot;
  			}
***************
*** 641,647 ****
  	nbatch = hashtable->nbatch;
  	curbatch = hashtable->curbatch;
  
! 	if (curbatch > 0)
  	{
  		/*
  		 * We no longer need the previous outer batch file; close it right
--- 873,899 ----
  	nbatch = hashtable->nbatch;
  	curbatch = hashtable->curbatch;
  
! 	/* if we just finished the first batch */
! 	if (curbatch == 0)
! 	{
! 		/*
! 		 * Reset some of the skew optimization state variables. IM buckets are
! 		 * no longer being used as of this point because they are only
! 		 * necessary while joining the first batch (before the cleanup phase).
! 		 *
! 		 * Especially need to make sure ExecHashGetIMBucket returns
! 		 * IM_INVALID_BUCKET quickly for all subsequent calls.
! 		 *
! 		 * IM buckets are only taking up memory if this is a multi-batch join
! 		 * and in that case ExecHashTableReset is about to be called which
! 		 * will free all memory currently used by IM buckets and tuples when
! 		 * it deletes hashtable->batchCxt.  If this is a single batch join
! 		 * then imBucket and imBucketFreezeOrder are already NULL and empty.
! 		 */
! 		hashtable->enableSkewOptimization = false;
! 		hashtable->spaceUsedIM = 0;
! 	}
! 	else if (curbatch > 0)
  	{
  		/*
  		 * We no longer need the previous outer batch file; close it right
Index: src/backend/optimizer/path/costsize.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v
retrieving revision 1.203
diff -c -r1.203 costsize.c
*** src/backend/optimizer/path/costsize.c	1 Jan 2009 17:23:43 -0000	1.203
--- src/backend/optimizer/path/costsize.c	13 Jan 2009 08:02:42 -0000
***************
*** 108,113 ****
--- 108,114 ----
  bool		enable_nestloop = true;
  bool		enable_mergejoin = true;
  bool		enable_hashjoin = true;
+ bool		enable_hashjoin_usestatmcvs = true;
  
  typedef struct
  {
Index: src/backend/utils/misc/guc.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/misc/guc.c,v
retrieving revision 1.493
diff -c -r1.493 guc.c
*** src/backend/utils/misc/guc.c	12 Jan 2009 05:10:44 -0000	1.493
--- src/backend/utils/misc/guc.c	13 Jan 2009 08:03:33 -0000
***************
*** 656,661 ****
--- 656,669 ----
  		true, NULL, NULL
  	},
  	{
+ 		{"enable_hashjoin_usestatmcvs", PGC_USERSET, QUERY_TUNING_METHOD,
+ 			gettext_noop("Enables the hash join's use of the MCVs stored in pg_statistic."),
+ 			NULL
+ 		},
+ 		&enable_hashjoin_usestatmcvs,
+ 		true, NULL, NULL
+ 	},
+ 	{
  		{"geqo", PGC_USERSET, QUERY_TUNING_GEQO,
  			gettext_noop("Enables genetic query optimization."),
  			gettext_noop("This algorithm attempts to do planning without "
Index: src/include/executor/hashjoin.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/executor/hashjoin.h,v
retrieving revision 1.49
diff -c -r1.49 hashjoin.h
*** src/include/executor/hashjoin.h	1 Jan 2009 17:23:59 -0000	1.49
--- src/include/executor/hashjoin.h	14 Jan 2009 06:42:53 -0000
***************
*** 72,77 ****
--- 72,97 ----
  #define HJTUPLE_MINTUPLE(hjtup)  \
  	((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))
  
+ /*
+  * Stores a hashvalue and linked list of tuples that share that hashvalue.
+  *
+  * When processing MCVs to detect skew in the probe relation of a hash join
+  * the hashvalue is generated and stored before any tuples have been read 
+  * (see ExecHashJoinDetectSkew).
+  *
+  * Build tuples that hash to the same hashvalue are placed in the bucket while
+  * reading the build relation.
+  */
+ typedef struct HashJoinIMBucket
+ {
+ 	uint32 hashvalue;
+ 	HashJoinTuple tuples;
+ } HashJoinIMBucket;
+ 
+ #define IM_INVALID_BUCKET -1
+ #define IM_WORK_MEM_PERCENT 2
+ #define IM_MIN_BENEFIT_PERCENT .01
+ #define IM_BUCKET_OVERHEAD MAXALIGN(sizeof(HashJoinIMBucket))
  
  typedef struct HashJoinTableData
  {
***************
*** 113,121 ****
--- 133,201 ----
  
  	Size		spaceUsed;		/* memory space currently used by tuples */
  	Size		spaceAllowed;	/* upper limit for space used */
+ 	/* memory space currently used by IM buckets and tuples */
+ 	Size		spaceUsedIM;
+ 	/* upper limit for space used by IM buckets and tuples */
+ 	Size		spaceAllowedIM;
  
  	MemoryContext hashCxt;		/* context for whole-hash-join storage */
  	MemoryContext batchCxt;		/* context for this-batch-only storage */
+ 	
+ 	/* will the join optimize memory usage when probe relation is skewed */
+ 	bool enableSkewOptimization;
+ 	HashJoinIMBucket **imBucket; /* hashtable of IM buckets */
+ 	/*
+ 	 * array of imBucket indexes to the created IM buckets sorted
+ 	 * in the opposite order that they would be frozen to disk
+ 	 */
+ 	uint16 *imBucketFreezeOrder;
+ 	int nIMBuckets; /* # of buckets in the IM buckets hashtable */
+ 	/*
+ 	 * # of used buckets in the IM buckets hashtable and length of
+ 	 * imBucketFreezeOrder array
+ 	 */
+ 	int nUsedIMBuckets;
+ 	/* # of IM buckets that have already been frozen to disk */
+ 	int nIMBucketsFrozen;
+ 
+ 	/* the total # of inner tuples received by join */
+ 	uint32 nInnerTup;
+ 	/* total # of outer tuples received by join */
+ 	uint32 nOuterTup;
+ 	/* # inner tuples in the IM buckets */
+ 	uint32 nInnerIMTup;
+ 	/* # outer tuples that matched with the IM buckets */
+ 	uint32 nOuterIMTup;
+ 	/* total # output tuples produced by join */
+ 	uint32 nOutputTup;
+ 	/* # output tuples that came from matches with IM bucket inner tuples */
+ 	uint32 nOutputIMTup;
+ 	/*
+ 	 * # of inner IM tuples that were frozen back to the main hashtable when
+ 	 * an IM bucket was frozen
+ 	 */
+ 	uint32 nInnerIMTupFrozen;
+ 	/* # outer tuples that matched with batch 0 */
+ 	uint32 nOuterBatchZeroTup;
+ 	/*
+ 	 * # outer tuples that would have fallen in batch 0 if IM buckets were
+ 	 * not in use at all
+ 	 */
+ 	uint32 nOuterPotentialBatchZeroTup;
+ 	/* # output tuples that came from matches with batch 0 inner tuples */
+ 	uint32 nOutputBatchZeroTup;
+ 	/*
+ 	 * # output tuples that would have came from matches with batch 0 if IM
+ 	 * buckets were not in use at all
+ 	 */
+ 	uint32 nOutputPotentialBatchZeroTup;
+ 	/* # inner tuples that fell in batch 0 */
+ 	uint32 nInnerBatchZeroTup;
+ 	/*
+ 	 * # inner tuples that would have fallen in batch 0 if IM buckets were
+ 	 * not in use at all
+ 	 */
+ 	uint32 nInnerPotentialBatchZeroTup;
  } HashJoinTableData;
  
  #endif   /* HASHJOIN_H */
Index: src/include/executor/nodeHash.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/executor/nodeHash.h,v
retrieving revision 1.46
diff -c -r1.46 nodeHash.h
*** src/include/executor/nodeHash.h	1 Jan 2009 17:23:59 -0000	1.46
--- src/include/executor/nodeHash.h	6 Jan 2009 23:29:18 -0000
***************
*** 45,48 ****
--- 45,50 ----
  						int *numbuckets,
  						int *numbatches);
  
+ extern int ExecHashGetIMBucket(HashJoinTable hashtable, uint32 hashvalue);
+ 
  #endif   /* NODEHASH_H */
Index: src/include/nodes/execnodes.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/nodes/execnodes.h,v
retrieving revision 1.201
diff -c -r1.201 execnodes.h
*** src/include/nodes/execnodes.h	12 Jan 2009 05:10:45 -0000	1.201
--- src/include/nodes/execnodes.h	12 Jan 2009 20:29:58 -0000
***************
*** 1389,1394 ****
--- 1389,1395 ----
   *		hj_NeedNewOuter			true if need new outer tuple on next call
   *		hj_MatchedOuter			true if found a join match for current outer
   *		hj_OuterNotEmpty		true if outer relation known not empty
+  *		hj_OuterTupleIMBucketNo	IM bucket# for the current outer tuple
   * ----------------
   */
  
***************
*** 1414,1419 ****
--- 1415,1421 ----
  	bool		hj_NeedNewOuter;
  	bool		hj_MatchedOuter;
  	bool		hj_OuterNotEmpty;
+ 	int			hj_OuterTupleIMBucketNo;
  } HashJoinState;
  
  
Index: src/include/nodes/primnodes.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/nodes/primnodes.h,v
retrieving revision 1.145
diff -c -r1.145 primnodes.h
*** src/include/nodes/primnodes.h	1 Jan 2009 17:24:00 -0000	1.145
--- src/include/nodes/primnodes.h	7 Jan 2009 05:48:16 -0000
***************
*** 121,128 ****
   * subplans; for example, in a join node varno becomes INNER or OUTER and
   * varattno becomes the index of the proper element of that subplan's target
   * list.  But varnoold/varoattno continue to hold the original values.
!  * The code doesn't really need varnoold/varoattno, but they are very useful
!  * for debugging and interpreting completed plans, so we keep them around.
   */
  #define    INNER		65000
  #define    OUTER		65001
--- 121,132 ----
   * subplans; for example, in a join node varno becomes INNER or OUTER and
   * varattno becomes the index of the proper element of that subplan's target
   * list.  But varnoold/varoattno continue to hold the original values.
!  *
!  * For the most part, the code doesn't really need varnoold/varoattno, but
!  * they are very useful for debugging and interpreting completed plans, so we
!  * keep them around.  As of PostgreSQL 8.4, these values are also used by
!  * ExecHashJoinDetectSkew to fetch MCV statistics when performing multi-batch
!  * hash joins.
   */
  #define    INNER		65000
  #define    OUTER		65001
***************
*** 142,148 ****
  	Index		varlevelsup;	/* for subquery variables referencing outer
  								 * relations; 0 in a normal var, >0 means N
  								 * levels up */
! 	Index		varnoold;		/* original value of varno, for debugging */
  	AttrNumber	varoattno;		/* original value of varattno */
  	int			location;		/* token location, or -1 if unknown */
  } Var;
--- 146,152 ----
  	Index		varlevelsup;	/* for subquery variables referencing outer
  								 * relations; 0 in a normal var, >0 means N
  								 * levels up */
! 	Index		varnoold;		/* original value of varno */
  	AttrNumber	varoattno;		/* original value of varattno */
  	int			location;		/* token location, or -1 if unknown */
  } Var;
Index: src/include/optimizer/cost.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/optimizer/cost.h,v
retrieving revision 1.96
diff -c -r1.96 cost.h
*** src/include/optimizer/cost.h	7 Jan 2009 22:40:49 -0000	1.96
--- src/include/optimizer/cost.h	13 Jan 2009 08:02:06 -0000
***************
*** 59,64 ****
--- 59,65 ----
  extern bool enable_nestloop;
  extern bool enable_mergejoin;
  extern bool enable_hashjoin;
+ extern bool enable_hashjoin_usestatmcvs;
  extern int	constraint_exclusion;
  
  extern double clamp_row_est(double nrows);