diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 3ef7cfb..fa0e700a 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -49,6 +49,9 @@ static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable);
 
 static void *dense_alloc(HashJoinTable hashtable, Size size);
 
+/* Memory needed for buckets. */
+#define BUCKETS_SPACE(htab)	((htab)->nbuckets * sizeof(HashJoinTuple))
+
 /* ----------------------------------------------------------------
  *		ExecHash
  *
@@ -386,7 +389,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
  */
 
 /* Target bucket loading (tuples per bucket) */
-#define NTUP_PER_BUCKET			10
+#define NTUP_PER_BUCKET			1
 
 void
 ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
@@ -396,6 +399,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 {
 	int			tupsize;
 	double		inner_rel_bytes;
+	long		buckets_bytes;
 	long		hash_table_bytes;
 	long		skew_table_bytes;
 	long		max_pointers;
@@ -418,6 +422,16 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	inner_rel_bytes = ntuples * tupsize;
 
 	/*
+	 * Compute memory occupied by buckets, assuming all tuples fit into
+	 * a single batch (consider NTUP_PER_BUCKET tuples per bucket) - buckets
+	 * are just 'HashJoinTuple' elements (pointers to HashJoinTupleData).
+	 * Also, we never use less than 1024 buckets.
+	 */
+	nbuckets = (1 << my_log2(ntuples / NTUP_PER_BUCKET));
+	nbuckets = Max(1024, nbuckets);
+	buckets_bytes = sizeof(HashJoinTuple) * nbuckets;
+
+	/*
 	 * Target in-memory hashtable size is work_mem kilobytes.
 	 */
 	hash_table_bytes = work_mem * 1024L;
@@ -468,16 +482,13 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	/* also ensure we avoid integer overflow in nbatch and nbuckets */
 	max_pointers = Min(max_pointers, INT_MAX / 2);
 
-	if (inner_rel_bytes > hash_table_bytes)
+	if ((inner_rel_bytes + buckets_bytes) > hash_table_bytes)
 	{
 		/* We'll need multiple batches */
 		long		lbuckets;
 		double		dbatch;
 		int			minbatch;
-
-		lbuckets = (hash_table_bytes / tupsize) / NTUP_PER_BUCKET;
-		lbuckets = Min(lbuckets, max_pointers);
-		nbuckets = (int) lbuckets;
+		long		bucket_size;
 
 		dbatch = ceil(inner_rel_bytes / hash_table_bytes);
 		dbatch = Min(dbatch, max_pointers);
@@ -485,6 +496,53 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 		nbatch = 2;
 		while (nbatch < minbatch)
 			nbatch <<= 1;
+
+		/* memory needed by a single "full" bucket (including tuples) */
+		bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple));
+
+		/*
+		 * When batching, we size the buckets for the full work_mem. We simply
+		 * divide work_mem by memory needed per 'full bucket' (a pointer and
+		 * NTUP_PER_BUCKET tuples, each 'tupsize' bytes, including additional
+		 * overhead for hash, pointer to the next tuple etc.).
+		 */
+		lbuckets = 1 << my_log2(hash_table_bytes / bucket_size);
+
+		/* protect against nbucket overflow */
+		lbuckets = Min(lbuckets, max_pointers);
+		nbuckets = (int) lbuckets;
+
+		/* Compute memory needed for buckets. */
+		buckets_bytes = nbuckets * sizeof(HashJoinTuple);
+
+		/*
+		 * Buckets are simple pointers to hashjoin tuples, while tupsize is
+		 * always >= 24B, plus actual data. So buckets should never really
+		 * exceed 25% of work_mem (even for NTUP_PER_BUCKET=1). Except maybe
+		 * for work_mem values that are not 2^N bytes, where we might get more
+		 * because of doubling. So let's check for 50% here.
+		 */
+		Assert(buckets_bytes <= hash_table_bytes/2);
+
+		/*
+		 * Subtract the buckets from work_mem, so we know how much memory
+		 * remains for the actual tuples.
+		 */
+		hash_table_bytes -= buckets_bytes;
+
+		/*
+		 * Increase the nbatch until we get both tuples and buckets into work_mem.
+		 *
+		 * The loop should not execute more than once in most cases, because tuples are
+		 * usually much wider than buckets (usually 8B pointers), so using only
+		 * (batch_bytes/2) should get us below work_mem.
+		 *
+		 * The worst case is that (nbuckets == 2*ntuples-1), giving us about twice the
+		 * number of buckets, i.e. about 2*sizeof(void*) per tuple. But that's
+		 * the consequence of how NTUP_PER_BUCKET is chosen, and work_mem limit.
+		 */
+		while ((inner_rel_bytes / nbatch) > hash_table_bytes)
+			nbatch <<= 1;
 	}
 	else
 	{
@@ -754,7 +812,7 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		hashtable->spaceUsed += hashTupleSize;
 		if (hashtable->spaceUsed > hashtable->spacePeak)
 			hashtable->spacePeak = hashtable->spaceUsed;
-		if (hashtable->spaceUsed > hashtable->spaceAllowed)
+		if (hashtable->spaceUsed + BUCKETS_SPACE(hashtable) > hashtable->spaceAllowed)
 			ExecHashIncreaseNumBatches(hashtable);
 	}
 	else
