diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 05d8538..c2cdd79 100644
*** a/src/backend/nodes/copyfuncs.c
--- b/src/backend/nodes/copyfuncs.c
*************** _copyRestrictInfo(const RestrictInfo *fr
*** 2070,2075 ****
--- 2070,2077 ----
  	COPY_SCALAR_FIELD(hashjoinoperator);
  	COPY_SCALAR_FIELD(left_bucketsize);
  	COPY_SCALAR_FIELD(right_bucketsize);
+ 	COPY_SCALAR_FIELD(left_mcvfreq);
+ 	COPY_SCALAR_FIELD(right_mcvfreq);
  
  	return newnode;
  }
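
Note for reviewers: COPY_SCALAR_FIELD is the standard copyfuncs.c helper for by-value fields, which is why the two new Selectivity fields need only the one-line additions above. Its definition (paraphrased from the top of copyfuncs.c; check the tree you apply this to) is essentially:

#define COPY_SCALAR_FIELD(fldname) \
	(newnode->fldname = from->fldname)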
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index d01630f..f88c1c5 100644
*** a/src/backend/optimizer/path/costsize.c
--- b/src/backend/optimizer/path/costsize.c
*************** final_cost_hashjoin(PlannerInfo *root, H
*** 2800,2805 ****
--- 2800,2806 ----
  	double		hashjointuples;
  	double		virtualbuckets;
  	Selectivity innerbucketsize;
+ 	Selectivity innermcvfreq;
  	ListCell   *hcl;
  
  	/* Mark the path with the correct row estimate */
*************** final_cost_hashjoin(PlannerInfo *root, H
*** 2827,2835 ****
  	virtualbuckets = (double) numbuckets *(double) numbatches;
  
  	/*
! 	 * Determine bucketsize fraction for inner relation.  We use the smallest
! 	 * bucketsize estimated for any individual hashclause; this is undoubtedly
! 	 * conservative.
  	 *
  	 * BUT: if inner relation has been unique-ified, we can assume it's good
  	 * for hashing.  This is important both because it's the right answer, and
--- 2828,2836 ----
  	virtualbuckets = (double) numbuckets *(double) numbatches;
  
  	/*
! 	 * Determine bucketsize fraction and MCV frequency for the inner relation.
! 	 * We use the smallest bucketsize or MCV frequency estimated for any
! 	 * individual hashclause; this is undoubtedly conservative.
  	 *
  	 * BUT: if inner relation has been unique-ified, we can assume it's good
  	 * for hashing.  This is important both because it's the right answer, and
*************** final_cost_hashjoin(PlannerInfo *root, H
*** 2837,2850 ****
--- 2838,2856 ----
  	 * non-unique-ified paths.
  	 */
  	if (IsA(inner_path, UniquePath))
+ 	{
  		innerbucketsize = 1.0 / virtualbuckets;
+ 		innermcvfreq = 0.0;
+ 	}
  	else
  	{
  		innerbucketsize = 1.0;
+ 		innermcvfreq = 1.0;
  		foreach(hcl, hashclauses)
  		{
  			RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(hcl);
  			Selectivity thisbucketsize;
+ 			Selectivity thismcvfreq;
  
  			Assert(IsA(restrictinfo, RestrictInfo));
  
*************** final_cost_hashjoin(PlannerInfo *root, H
*** 2853,2860 ****
  			 * is the inner side.
  			 *
  			 * Since we tend to visit the same clauses over and over when
! 			 * planning a large query, we cache the bucketsize estimate in the
! 			 * RestrictInfo node to avoid repeated lookups of statistics.
  			 */
  			if (bms_is_subset(restrictinfo->right_relids,
  							  inner_path->parent->relids))
--- 2859,2866 ----
  			 * is the inner side.
  			 *
  			 * Since we tend to visit the same clauses over and over when
! 			 * planning a large query, we cache the bucket stats estimates in
! 			 * the RestrictInfo node to avoid repeated lookups of statistics.
  			 */
  			if (bms_is_subset(restrictinfo->right_relids,
  							  inner_path->parent->relids))
*************** final_cost_hashjoin(PlannerInfo *root, H
*** 2864,2875 ****
  				if (thisbucketsize < 0)
  				{
  					/* not cached yet */
! 					thisbucketsize =
! 						estimate_hash_bucketsize(root,
  										   get_rightop(restrictinfo->clause),
! 												 virtualbuckets);
  					restrictinfo->right_bucketsize = thisbucketsize;
  				}
  			}
  			else
  			{
--- 2870,2884 ----
  				if (thisbucketsize < 0)
  				{
  					/* not cached yet */
! 					estimate_hash_bucket_stats(root,
  										   get_rightop(restrictinfo->clause),
! 											   virtualbuckets,
! 											   &thismcvfreq,
! 											   &thisbucketsize);
  					restrictinfo->right_bucketsize = thisbucketsize;
+ 					restrictinfo->right_mcvfreq = thismcvfreq;
  				}
+ 				thismcvfreq = restrictinfo->right_mcvfreq;
  			}
  			else
  			{
*************** final_cost_hashjoin(PlannerInfo *root, H
*** 2880,2899 ****
  				if (thisbucketsize < 0)
  				{
  					/* not cached yet */
! 					thisbucketsize =
! 						estimate_hash_bucketsize(root,
  											get_leftop(restrictinfo->clause),
! 												 virtualbuckets);
  					restrictinfo->left_bucketsize = thisbucketsize;
  				}
  			}
  
  			if (innerbucketsize > thisbucketsize)
  				innerbucketsize = thisbucketsize;
  		}
  	}
  
  	/*
  	 * Compute cost of the hashquals and qpquals (other restriction clauses)
  	 * separately.
  	 */
--- 2889,2926 ----
  				if (thisbucketsize < 0)
  				{
  					/* not cached yet */
! 					estimate_hash_bucket_stats(root,
  											get_leftop(restrictinfo->clause),
! 											   virtualbuckets,
! 											   &thismcvfreq,
! 											   &thisbucketsize);
  					restrictinfo->left_bucketsize = thisbucketsize;
+ 					restrictinfo->left_mcvfreq = thismcvfreq;
  				}
+ 				thismcvfreq = restrictinfo->left_mcvfreq;
  			}
  
  			if (innerbucketsize > thisbucketsize)
  				innerbucketsize = thisbucketsize;
+ 			if (innermcvfreq > thismcvfreq)
+ 				innermcvfreq = thismcvfreq;
  		}
  	}
  
  	/*
+ 	 * If the bucket holding the inner MCV would exceed work_mem, we don't
+ 	 * want to hash unless there is really no other alternative, so apply
+ 	 * disable_cost.  (The executor normally copes with excessive memory usage
+ 	 * by splitting batches, but obviously it cannot separate equal values
+ 	 * that way, so it will be unable to drive the batch size below work_mem
+ 	 * when this is true.)
+ 	 */
+ 	if (relation_byte_size(clamp_row_est(inner_path_rows * innermcvfreq),
+ 						   inner_path->pathtarget->width) >
+ 		(work_mem * 1024L))
+ 		startup_cost += disable_cost;
+ 
+ 	/*
  	 * Compute cost of the hashquals and qpquals (other restriction clauses)
  	 * separately.
  	 */
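
Note for reviewers: to make the new guard concrete, here is a standalone sketch of its arithmetic. It assumes relation_byte_size() charges roughly tuples * (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader)), as in costsize.c; the 8-byte alignment, 24-byte header, and all input numbers are hypothetical stand-ins, not values taken from the patch.

#include <stdio.h>

#define MAXALIGN(x)	(((x) + 7) & ~7)	/* assume 8-byte alignment */
#define TUPLE_HDR	24					/* rough heap tuple header size */

int
main(void)
{
	double	inner_path_rows = 10000000;	/* hypothetical inner rel size */
	double	innermcvfreq = 0.04;		/* MCV accounts for 4% of rows */
	int		width = 40;					/* average tuple width in bytes */
	long	work_mem = 4096;			/* work_mem in kB, i.e. 4MB */

	/* mimic relation_byte_size(inner_path_rows * innermcvfreq, width) */
	double	mcv_tuples = inner_path_rows * innermcvfreq;
	double	mcv_bytes = mcv_tuples * (MAXALIGN(width) + MAXALIGN(TUPLE_HDR));

	/* 400000 tuples * 64 bytes = ~25.6MB, far above 4MB */
	if (mcv_bytes > work_mem * 1024L)
		printf("MCV bucket ~%.0f bytes exceeds work_mem; apply disable_cost\n",
			   mcv_bytes);
	return 0;
}

With these numbers the most common value alone would need roughly 25.6MB in a single bucket, which batch-splitting cannot reduce, so the path gets disable_cost added to its startup cost.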
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 06e843d..f6ddb2e 100644
*** a/src/backend/optimizer/prep/prepunion.c
--- b/src/backend/optimizer/prep/prepunion.c
*************** adjust_appendrel_attrs_mutator(Node *nod
*** 1961,1966 ****
--- 1961,1968 ----
  		newinfo->scansel_cache = NIL;
  		newinfo->left_bucketsize = -1;
  		newinfo->right_bucketsize = -1;
+ 		newinfo->left_mcvfreq = -1;
+ 		newinfo->right_mcvfreq = -1;
  
  		return (Node *) newinfo;
  	}
diff --git a/src/backend/optimizer/util/restrictinfo.c b/src/backend/optimizer/util/restrictinfo.c
index 045b5cf..e7efe36 100644
*** a/src/backend/optimizer/util/restrictinfo.c
--- b/src/backend/optimizer/util/restrictinfo.c
*************** make_restrictinfo_internal(Expr *clause,
*** 199,204 ****
--- 199,206 ----
  
  	restrictinfo->left_bucketsize = -1;
  	restrictinfo->right_bucketsize = -1;
+ 	restrictinfo->left_mcvfreq = -1;
+ 	restrictinfo->right_mcvfreq = -1;
  
  	return restrictinfo;
  }
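
Note for reviewers: the -1 initializations here (and the matching resets in adjust_appendrel_attrs_mutator) follow the convention already used for left_bucketsize/right_bucketsize: a valid Selectivity is never negative, so -1 marks "not computed yet", and final_cost_hashjoin fills the fields in on first use. A minimal standalone sketch of that lazy-cache pattern, using hypothetical names (DemoInfo, lookup_mcvfreq) rather than PostgreSQL code:

#include <stdio.h>

typedef double Selectivity;

typedef struct DemoInfo
{
	Selectivity left_mcvfreq;	/* cached value; -1 if not yet set */
} DemoInfo;

static Selectivity
lookup_mcvfreq(void)
{
	return 0.05;				/* stand-in for an expensive stats lookup */
}

static Selectivity
get_left_mcvfreq(DemoInfo *info)
{
	if (info->left_mcvfreq < 0)	/* cache miss: compute and remember */
		info->left_mcvfreq = lookup_mcvfreq();
	return info->left_mcvfreq;	/* later calls reuse the cached value */
}

int
main(void)
{
	DemoInfo	info = {-1};	/* initialized the way the patch does */

	printf("%g %g\n", get_left_mcvfreq(&info), get_left_mcvfreq(&info));
	return 0;
}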
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index d14f0f9..816cb65 100644
*** a/src/backend/utils/adt/selfuncs.c
--- b/src/backend/utils/adt/selfuncs.c
*************** estimate_num_groups(PlannerInfo *root, L
*** 3521,3529 ****
  }
  
  /*
!  * Estimate hash bucketsize fraction (ie, number of entries in a bucket
!  * divided by total tuples in relation) if the specified expression is used
!  * as a hash key.
   *
   * XXX This is really pretty bogus since we're effectively assuming that the
   * distribution of hash keys will be the same after applying restriction
--- 3521,3536 ----
  }
  
  /*
!  * Estimate hash bucket statistics when the specified expression is used
!  * as a hash key for the given number of buckets.
!  *
!  * This attempts to determine two values:
!  *
!  * 1. The frequency of the most common value of the expression (returns
!  * zero into *mcv_freq if we can't get that).
!  *
!  * 2. The "bucketsize fraction", ie, average number of entries in a bucket
!  * divided by total tuples in relation.
   *
   * XXX This is really pretty bogus since we're effectively assuming that the
   * distribution of hash keys will be the same after applying restriction
*************** estimate_num_groups(PlannerInfo *root, L
*** 3549,3563 ****
   * discourage use of a hash rather strongly if the inner relation is large,
   * which is what we want.  We do not want to hash unless we know that the
   * inner rel is well-dispersed (or the alternatives seem much worse).
   */
! Selectivity
! estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
  {
  	VariableStatData vardata;
  	double		estfract,
  				ndistinct,
  				stanullfrac,
- 				mcvfreq,
  				avgfreq;
  	bool		isdefault;
  	float4	   *numbers;
--- 3556,3577 ----
   * discourage use of a hash rather strongly if the inner relation is large,
   * which is what we want.  We do not want to hash unless we know that the
   * inner rel is well-dispersed (or the alternatives seem much worse).
+  *
+  * The caller should also check that the mcv_freq is not so large that the
+  * most common value would by itself require an impractically large bucket.
+  * In a hash join, the executor can split buckets if they get too big, but
+  * obviously that doesn't help for a bucket that contains many duplicates of
+  * the same value.
   */
! void
! estimate_hash_bucket_stats(PlannerInfo *root, Node *hashkey, double nbuckets,
! 						   Selectivity *mcv_freq,
! 						   Selectivity *bucketsize_frac)
  {
  	VariableStatData vardata;
  	double		estfract,
  				ndistinct,
  				stanullfrac,
  				avgfreq;
  	bool		isdefault;
  	float4	   *numbers;
*************** estimate_hash_bucketsize(PlannerInfo *ro
*** 3565,3578 ****
  
  	examine_variable(root, hashkey, 0, &vardata);
  
  	/* Get number of distinct values */
  	ndistinct = get_variable_numdistinct(&vardata, &isdefault);
  
! 	/* If ndistinct isn't real, punt and return 0.1, per comments above */
  	if (isdefault)
  	{
  		ReleaseVariableStats(vardata);
! 		return (Selectivity) 0.1;
  	}
  
  	/* Get fraction that are null */
--- 3579,3618 ----
  
  	examine_variable(root, hashkey, 0, &vardata);
  
+ 	/* Look up the frequency of the most common value, if available */
+ 	*mcv_freq = 0.0;
+ 
+ 	if (HeapTupleIsValid(vardata.statsTuple))
+ 	{
+ 		if (get_attstatsslot(vardata.statsTuple,
+ 							 vardata.atttype, vardata.atttypmod,
+ 							 STATISTIC_KIND_MCV, InvalidOid,
+ 							 NULL,
+ 							 NULL, NULL,
+ 							 &numbers, &nnumbers))
+ 		{
+ 			/*
+ 			 * The first MCV stat is for the most common value.
+ 			 */
+ 			if (nnumbers > 0)
+ 				*mcv_freq = numbers[0];
+ 			free_attstatsslot(vardata.atttype, NULL, 0,
+ 							  numbers, nnumbers);
+ 		}
+ 	}
+ 
  	/* Get number of distinct values */
  	ndistinct = get_variable_numdistinct(&vardata, &isdefault);
  
! 	/*
! 	 * If ndistinct isn't real, punt.  We normally return 0.1, but if the
! 	 * mcv_freq is known to be even higher than that, use it instead.
! 	 */
  	if (isdefault)
  	{
+ 		*bucketsize_frac = (Selectivity) Max(0.1, *mcv_freq);
  		ReleaseVariableStats(vardata);
! 		return;
  	}
  
  	/* Get fraction that are null */
*************** estimate_hash_bucketsize(PlannerInfo *ro
*** 3614,3647 ****
  		estfract = 1.0 / ndistinct;
  
  	/*
- 	 * Look up the frequency of the most common value, if available.
- 	 */
- 	mcvfreq = 0.0;
- 
- 	if (HeapTupleIsValid(vardata.statsTuple))
- 	{
- 		if (get_attstatsslot(vardata.statsTuple,
- 							 vardata.atttype, vardata.atttypmod,
- 							 STATISTIC_KIND_MCV, InvalidOid,
- 							 NULL,
- 							 NULL, NULL,
- 							 &numbers, &nnumbers))
- 		{
- 			/*
- 			 * The first MCV stat is for the most common value.
- 			 */
- 			if (nnumbers > 0)
- 				mcvfreq = numbers[0];
- 			free_attstatsslot(vardata.atttype, NULL, 0,
- 							  numbers, nnumbers);
- 		}
- 	}
- 
- 	/*
  	 * Adjust estimated bucketsize upward to account for skewed distribution.
  	 */
! 	if (avgfreq > 0.0 && mcvfreq > avgfreq)
! 		estfract *= mcvfreq / avgfreq;
  
  	/*
  	 * Clamp bucketsize to sane range (the above adjustment could easily
--- 3654,3663 ----
  		estfract = 1.0 / ndistinct;
  
  	/*
  	 * Adjust estimated bucketsize upward to account for skewed distribution.
  	 */
! 	if (avgfreq > 0.0 && *mcv_freq > avgfreq)
! 		estfract *= *mcv_freq / avgfreq;
  
  	/*
  	 * Clamp bucketsize to sane range (the above adjustment could easily
*************** estimate_hash_bucketsize(PlannerInfo *ro
*** 3653,3661 ****
  	else if (estfract > 1.0)
  		estfract = 1.0;
  
! 	ReleaseVariableStats(vardata);
  
! 	return (Selectivity) estfract;
  }
  
  
--- 3669,3677 ----
  	else if (estfract > 1.0)
  		estfract = 1.0;
  
! 	*bucketsize_frac = (Selectivity) estfract;
  
! 	ReleaseVariableStats(vardata);
  }
  
  
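Note for reviewers: the revised estimator's core arithmetic fits in a few lines. The sketch below (bucket_stats_sketch is a hypothetical standalone function, not the patched one) reproduces the spread, skew-adjustment, and clamp steps from the hunks above, while ignoring the null-fraction handling and the restriction-clause caveats the comments discuss:

#include <stdio.h>

static void
bucket_stats_sketch(double ndistinct, double nbuckets, double mcv_freq,
					double *bucketsize_frac)
{
	double	avgfreq = 1.0 / ndistinct;	/* average per-value frequency */
	double	estfract;

	/* distinct values spread over at most nbuckets hash buckets */
	if (ndistinct > nbuckets)
		estfract = 1.0 / nbuckets;
	else
		estfract = 1.0 / ndistinct;

	/* skew adjustment, as in the patch: scale up by mcv_freq / avgfreq */
	if (avgfreq > 0.0 && mcv_freq > avgfreq)
		estfract *= mcv_freq / avgfreq;

	/* clamp to a sane range, matching the final hunk */
	if (estfract < 1.0e-6)
		estfract = 1.0e-6;
	else if (estfract > 1.0)
		estfract = 1.0;

	*bucketsize_frac = estfract;
}

int
main(void)
{
	double	frac;

	/* 1000 distinct values, 1024 virtual buckets, MCV holds 10% of rows */
	bucket_stats_sketch(1000.0, 1024.0, 0.10, &frac);
	printf("bucketsize fraction ~ %g\n", frac);	/* 0.001 * 100 = 0.1 */
	return 0;
}
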
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index f7ac6f6..2b2d950 100644
*** a/src/include/nodes/relation.h
--- b/src/include/nodes/relation.h
*************** typedef struct RestrictInfo
*** 1697,1702 ****
--- 1697,1704 ----
  	/* cache space for hashclause processing; -1 if not yet set */
  	Selectivity left_bucketsize;	/* avg bucketsize of left side */
  	Selectivity right_bucketsize;		/* avg bucketsize of right side */
+ 	Selectivity left_mcvfreq;	/* most common val's freq on left side */
+ 	Selectivity right_mcvfreq;	/* most common val's freq on right side */
  } RestrictInfo;
  
  /*
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index 9f9d2dc..03ed27b 100644
*** a/src/include/utils/selfuncs.h
--- b/src/include/utils/selfuncs.h
*************** extern void mergejoinscansel(PlannerInfo
*** 204,211 ****
  extern double estimate_num_groups(PlannerInfo *root, List *groupExprs,
  					double input_rows, List **pgset);
  
! extern Selectivity estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey,
! 						 double nbuckets);
  
  extern List *deconstruct_indexquals(IndexPath *path);
  extern void genericcostestimate(PlannerInfo *root, IndexPath *path,
--- 204,213 ----
  extern double estimate_num_groups(PlannerInfo *root, List *groupExprs,
  					double input_rows, List **pgset);
  
! extern void estimate_hash_bucket_stats(PlannerInfo *root,
! 						   Node *hashkey, double nbuckets,
! 						   Selectivity *mcv_freq,
! 						   Selectivity *bucketsize_frac);
  
  extern List *deconstruct_indexquals(IndexPath *path);
  extern void genericcostestimate(PlannerInfo *root, IndexPath *path,
