diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 332f04a..06e612a 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1936,26 +1936,32 @@ show_hash_info(HashState *hashstate, ExplainState *es)
 			ExplainPropertyLong("Original Hash Batches",
 								hashtable->nbatch_original, es);
 			ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es);
+			ExplainPropertyLong("Hash Tuples Per Bucket", hashtable->ntup_per_bucket, es);
+			ExplainPropertyLong("Original Tuples Per Bucket",
+								hashtable->ntup_per_bucket_original, es);
 		}
 		else if (hashtable->nbatch_original != hashtable->nbatch ||
-				 hashtable->nbuckets_original != hashtable->nbuckets)
+				 hashtable->nbuckets_original != hashtable->nbuckets ||
+				 hashtable->ntup_per_bucket_original != hashtable->ntup_per_bucket)
 		{
 			appendStringInfoSpaces(es->str, es->indent * 2);
 			appendStringInfo(es->str,
-							 "Buckets: %d (originally %d)  Batches: %d (originally %d)  Memory Usage: %ldkB\n",
+							 "Buckets: %d (originally %d)  Batches: %d (originally %d)  Tuples: %d (originally %d)  Memory Usage: %ldkB\n",
 							 hashtable->nbuckets,
 							 hashtable->nbuckets_original,
 							 hashtable->nbatch,
 							 hashtable->nbatch_original,
+							 hashtable->ntup_per_bucket,
+							 hashtable->ntup_per_bucket_original,
 							 spacePeakKb);
 		}
 		else
 		{
 			appendStringInfoSpaces(es->str, es->indent * 2);
 			appendStringInfo(es->str,
-						   "Buckets: %d  Batches: %d  Memory Usage: %ldkB\n",
+						   "Buckets: %d  Batches: %d  Tuples: %d  Memory Usage: %ldkB\n",
 							 hashtable->nbuckets, hashtable->nbatch,
-							 spacePeakKb);
+							 hashtable->ntup_per_bucket, spacePeakKb);
 		}
 	}
 }
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 7c5bb77..fd4f56d 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -259,6 +259,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	Plan	   *outerNode;
 	int			nbuckets;
 	int			nbatch;
+	int			ntup;
 	int			num_skew_mcvs;
 	int			log2_nbuckets;
 	int			nkeys;
@@ -275,7 +276,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 
 	ExecChooseHashTableSize(outerNode->plan_rows, outerNode->plan_width,
 							OidIsValid(node->skewTable),
-							&nbuckets, &nbatch, &num_skew_mcvs);
+							&nbuckets, &nbatch, &ntup, &num_skew_mcvs);
 
 #ifdef HJDEBUG
 	printf("nbatch = %d, nbuckets = %d\n", nbatch, nbuckets);
@@ -320,6 +321,8 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	hashtable->spaceAllowedSkew =
 		hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100;
 	hashtable->chunks = NULL;
+	hashtable->ntup_per_bucket = ntup;
+	hashtable->ntup_per_bucket_original = ntup;
 
 	/*
 	 * Get info about the hash functions to be used for each hash key. Also
@@ -410,13 +413,15 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
  * This is exported so that the planner's costsize.c can use it.
  */
 
-/* Target bucket loading (tuples per bucket) */
+/* Target bucket loading (tuples per bucket) - keep the values 2^N */
 #define NTUP_PER_BUCKET			1
+#define MAX_NTUP_PER_BUCKET		8
 
 void
 ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 						int *numbuckets,
 						int *numbatches,
+						int *ntup,
 						int *num_skew_mcvs)
 {
 	int			tupsize;
@@ -428,6 +433,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	int			nbatch = 1;
 	int			nbuckets;
 	double		dbuckets;
+	int			ntup_per_bucket;
 
 	/* Force a plausible relation size if no info */
 	if (ntuples <= 0.0)
@@ -498,6 +504,51 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	nbuckets = Max((int) dbuckets, 1024);
 	nbuckets = 1 << my_log2(nbuckets);
 	bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
+	ntup_per_bucket = NTUP_PER_BUCKET;
+
+	/*
+	 * If there's not enough space to store the projected number of tuples,
+	 * we can try to increase the number of tuples per bucket first, up to
+	 * MAX_NTUP_PER_BUCKET, in the hope that the smaller hash table will
+	 * fit into memory. This is likely still faster than batching,
+	 * especially with large outer tables that cause a lot of I/O.
+	 */
+	if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
+	{
+		int		shrink_factor = 1;
+		int		tmp_per_bucket = ntup_per_bucket;
+		bool	fits_into_memory = false;
+
+		/*
+		 * Double the number of tuples per bucket (i.e. cut the number of
+		 * buckets in half) until the table fits into memory or we reach
+		 * the maximum allowed value (MAX_NTUP_PER_BUCKET). This makes
+		 */
+		while (tmp_per_bucket * 2 <= MAX_NTUP_PER_BUCKET)
+		{
+			shrink_factor *= 2;
+			tmp_per_bucket *= 2;
+			if (inner_rel_bytes + bucket_bytes / shrink_factor <= hash_table_bytes)
+			{
+				fits_into_memory = true;
+				break;
+			}
+		}
+
+		/*
+		 * If resizing the hash table helps, we need to keep track of the
+		 * number of buckets, limit, etc.
+		 */
+		if (fits_into_memory)
+		{
+			elog(WARNING, "hash table fits into memory with shrink factor = %d",
+				shrink_factor);
+
+			nbuckets     /= shrink_factor;
+			bucket_bytes /= shrink_factor;
+			ntup_per_bucket = tmp_per_bucket;
+		}
+	}
 
 	/*
 	 * If there's not enough space to store the projected number of tuples
@@ -544,6 +595,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 
 	*numbuckets = nbuckets;
 	*numbatches = nbatch;
+	*ntup = ntup_per_bucket;
 }
 
 
@@ -610,6 +662,78 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 		   nbatch, (unsigned long) hashtable->spaceUsed);
 #endif
 
+	/*
+	 * In a batching mode always shoot for NTUP_PER_BUCKET, without any
+	 * graceful degradation (allowing more tuples instead of batching).
+	 * If that's the case (can only happen once, on the first split),
+	 * then we need to compute the number of buckets for the batching
+	 * mode, using the same logic as in ExecChooseHashTableSize().
+	 *
+	 * At this point we don't know the estimated tuple width etc.
+	 * But we do know how much space we're using and how many tuples
+	 * there are, so we can compute the tuple size (maybe even more
+	 * accurately).
+	 *
+	 * The question is whether this increase of bucket space (we're
+	 * lowering NTUP_PER_BUCKET, thus requiring more space for the
+	 * buckets) can result in more than doubling the number of batches
+	 * in a single step. And the answer is no, because even if we
+	 * increase the number of buckets to the optimum value (with a load
+	 * factor 1), it still can't occupy more than 50% of work_mem, just
+	 * like the tuples after increasing the number of batches. So we're
+	 * still just doubling the number of batches.
+	 *
+	 * XXX Could this reasoning be broken by computing a different
+	 *     tuple width estimate?
+	 *
+	 * XXX We have already increased nbatch.
+	 */
+	if (hashtable->ntup_per_bucket != NTUP_PER_BUCKET)
+	{
+		long		lbuckets;
+		long		bucket_size;
+		long		tupsize;
+
+		long		hash_table_bytes = work_mem * 1024L;
+		long		max_pointers = (work_mem * 1024L) / sizeof(void *);
+
+		long		bucket_bytes;
+		long		nbuckets;
+
+		Assert(oldnbatch == 1);
+
+		/* use the optimal number of tuples per bucket */
+		hashtable->ntup_per_bucket = NTUP_PER_BUCKET;
+
+		/* compute size of a single tuple (including overhead) */
+		tupsize = ceil((double)hashtable->spaceUsed / hashtable->totalTuples);
+
+		/*
+		 * Estimate the number of buckets we'll want to have when work_mem
+		 * is entirely full.  Each bucket will contain a bucket pointer plus
+		 * NTUP_PER_BUCKET tuples, whose projected size already includes
+		 * overhead for the hash code, pointer to the next tuple, etc.
+		 */
+		bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple));
+		lbuckets = 1 << my_log2(hash_table_bytes / bucket_size);
+		lbuckets = Min(lbuckets, max_pointers);
+		nbuckets = (int) lbuckets;
+		bucket_bytes = nbuckets * sizeof(HashJoinTuple);
+
+		/*
+		 * Buckets are simple pointers to hashjoin tuples, while tupsize
+		 * includes the pointer, hash code, and MinimalTupleData.  So buckets
+		 * should never really exceed 25% of work_mem (even for
+		 * NTUP_PER_BUCKET=1); except maybe for work_mem values that are
+		 * not 2^N bytes, where we might get more because of doubling.
+		 * So let's look for 50% here.
+		 */
+		Assert(bucket_bytes <= hash_table_bytes / 2);
+
+		hashtable->nbuckets_optimal = nbuckets;
+		hashtable->log2_nbuckets_optimal = my_log2(nbuckets);
+	}
+
 	oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
 
 	if (hashtable->innerBatchFile == NULL)
@@ -873,7 +997,7 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		 */
 		if ((hashtable->nbatch == 1) &&
 			(hashtable->nbuckets_optimal <= INT_MAX/2) &&	/* overflow protection */
-			(ntuples >= (hashtable->nbuckets_optimal * NTUP_PER_BUCKET)))
+			(ntuples >= (hashtable->nbuckets_optimal * hashtable->ntup_per_bucket)))
 		{
 			hashtable->nbuckets_optimal *= 2;
 			hashtable->log2_nbuckets_optimal += 1;
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 659daa2..ef6e0f7 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -2420,6 +2420,7 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 	int			num_hashclauses = list_length(hashclauses);
 	int			numbuckets;
 	int			numbatches;
+	int			ntup;
 	int			num_skew_mcvs;
 
 	/* cost of source data */
@@ -2456,6 +2457,7 @@ initial_cost_hashjoin(PlannerInfo *root, JoinCostWorkspace *workspace,
 							true,		/* useskew */
 							&numbuckets,
 							&numbatches,
+							&ntup,
 							&num_skew_mcvs);
 
 	/*
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 0e1e0cd..b7eefac 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -131,6 +131,9 @@ typedef struct HashJoinTableData
 	int			nbuckets_optimal;	/* optimal # buckets (per batch) */
 	int			log2_nbuckets_optimal;	/* same as log2_nbuckets optimal */
 
+	int			ntup_per_bucket;	/* number of tuples per bucket */
+	int			ntup_per_bucket_original;	/* number of tuples per bucket */
+
 	/* buckets[i] is head of list of tuples in i'th in-memory bucket */
 	struct HashJoinTupleData **buckets;
 	/* buckets array is per-batch storage, as are all the tuples */
diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h
index 75be5bd..a96ad90 100644
--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -47,6 +47,7 @@ extern void ExecHashTableResetMatchFlags(HashJoinTable hashtable);
 extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 						int *numbuckets,
 						int *numbatches,
+						int *ntup,
 						int *num_skew_mcvs);
 extern int	ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue);

