diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index c9aa921..53cdd09 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -274,9 +274,10 @@ static Bitmapset *find_unaggregated_cols(AggState *aggstate);
 static bool find_unaggregated_cols_walker(Node *node, Bitmapset **colnos);
 static void build_hash_table(AggState *aggstate);
 static AggHashEntry lookup_hash_entry(AggState *aggstate,
-				  TupleTableSlot *inputslot);
+				  TupleTableSlot *inputslot, bool *isnew);
 static TupleTableSlot *agg_retrieve_direct(AggState *aggstate);
 static void agg_fill_hash_table(AggState *aggstate);
+static TupleTableSlot *agg_fill_hash_and_retrieve(AggState *aggstate);
 static TupleTableSlot *agg_retrieve_hash_table(AggState *aggstate);
 static Datum GetAggInitVal(Datum textInitVal, Oid transtype);
 
@@ -920,12 +921,11 @@ hash_agg_entry_size(int numAggs)
  * When called, CurrentMemoryContext should be the per-query context.
  */
 static AggHashEntry
-lookup_hash_entry(AggState *aggstate, TupleTableSlot *inputslot)
+lookup_hash_entry(AggState *aggstate, TupleTableSlot *inputslot, bool *isnew)
 {
 	TupleTableSlot *hashslot = aggstate->hashslot;
 	ListCell   *l;
 	AggHashEntry entry;
-	bool		isnew;
 
 	/* if first time through, initialize hashslot by cloning input slot */
 	if (hashslot->tts_tupleDescriptor == NULL)
@@ -948,9 +948,9 @@ lookup_hash_entry(AggState *aggstate, TupleTableSlot *inputslot)
 	/* find or create the hashtable entry using the filtered tuple */
 	entry = (AggHashEntry) LookupTupleHashEntry(aggstate->hashtable,
 												hashslot,
-												&isnew);
+												isnew);
 
-	if (isnew)
+	if (*isnew)
 	{
 		/* initialize aggregates for new tuple group */
 		initialize_aggregates(aggstate, aggstate->peragg, entry->pergroup);
@@ -1004,7 +1004,12 @@ ExecAgg(AggState *node)
 	if (((Agg *) node->ss.ps.plan)->aggstrategy == AGG_HASHED)
 	{
 		if (!node->table_filled)
-			agg_fill_hash_table(node);
+		{
+			if (node->numaggs)
+				agg_fill_hash_table(node);
+			else
+				return agg_fill_hash_and_retrieve(node);
+		}
 		return agg_retrieve_hash_table(node);
 	}
 	else
@@ -1222,6 +1227,7 @@ agg_fill_hash_table(AggState *aggstate)
 	ExprContext *tmpcontext;
 	AggHashEntry entry;
 	TupleTableSlot *outerslot;
+	bool 		isnew;
 
 	/*
 	 * get state info from node
@@ -1243,7 +1249,7 @@ agg_fill_hash_table(AggState *aggstate)
 		tmpcontext->ecxt_outertuple = outerslot;
 
 		/* Find or build hashtable entry for this tuple's group */
-		entry = lookup_hash_entry(aggstate, outerslot);
+		entry = lookup_hash_entry(aggstate, outerslot, &isnew);
 
 		/* Advance the aggregates */
 		advance_aggregates(aggstate, entry->pergroup);
@@ -1258,6 +1264,101 @@ agg_fill_hash_table(AggState *aggstate)
 }
 
 /*
+ * ExecAgg for hashed case: phase 1, read input and build hash table,
+ * returning tuples for newly-created groups immediately as they are found.
+ */
+static TupleTableSlot *
+agg_fill_hash_and_retrieve(AggState *aggstate)
+{
+	PlanState  *outerPlan;
+	ExprContext *tmpcontext;
+	AggHashEntry entry;
+	TupleTableSlot *outerslot;
+	bool		isnew;
+	ExprContext *econtext;
+	TupleTableSlot *firstSlot;
+
+	/*
+	 * get state info from node
+	 */
+	outerPlan = outerPlanState(aggstate);
+	/* tmpcontext is the per-input-tuple expression context */
+	tmpcontext = aggstate->tmpcontext;
+
+	econtext = aggstate->ss.ps.ps_ExprContext;
+	firstSlot = aggstate->ss.ss_ScanTupleSlot;
+
+	Assert(aggstate->numaggs == 0);
+
+	/*
+	 * Process each outer-plan tuple, and then fetch the next one, until we
+	 * exhaust the outer plan.
+	 */
+	for (;;)
+	{
+		outerslot = ExecProcNode(outerPlan);
+		if (TupIsNull(outerslot))
+		{
+			aggstate->table_filled = true;
+			/* Initialize to walk the hash table */
+			ResetTupleHashIterator(aggstate->hashtable, &aggstate->hashiter);
+			return NULL;
+		}
+		/* set up for advance_aggregates call */
+		tmpcontext->ecxt_outertuple = outerslot;
+
+		/* Find or build hashtable entry for this tuple's group */
+		entry = lookup_hash_entry(aggstate, outerslot, &isnew);
+
+		/* Reset per-input-tuple context after each tuple */
+		ResetExprContext(tmpcontext);
+
+		if (isnew)
+		{
+			/*
+			 * Store the copied first input tuple in the tuple table slot reserved
+			 * for it, so that it can be used in ExecProject.
+			 */
+			ExecStoreMinimalTuple(entry->shared.firstTuple,
+								  firstSlot,
+								  false);
+
+			/*
+			 * Use the representative input tuple for any references to
+			 * non-aggregated input columns in the qual and tlist.
+			 */
+			econtext->ecxt_outertuple = firstSlot;
+
+			/*
+			 * Check the qual (HAVING clause); if the group does not match, ignore
+			 * it and loop back to try to process another group.
+			 */
+			if (ExecQual(aggstate->ss.ps.qual, econtext, false))
+			{
+				/* FIXME: copy and paste */
+				/*
+				 * Form and return a projection tuple using the aggregate results
+				 * and the representative input tuple.
+				 */
+				TupleTableSlot *result;
+				ExprDoneCond isDone;
+
+				result = ExecProject(aggstate->ss.ps.ps_ProjInfo, &isDone);
+
+				if (isDone != ExprEndResult)
+				{
+					aggstate->ss.ps.ps_TupFromTlist =
+						(isDone == ExprMultipleResult);
+					return result;
+				}
+			}
+			else
+				InstrCountFiltered1(aggstate, 1);
+		}
+	}
+}
+
+/*
  * ExecAgg for hashed case: phase 2, retrieving groups from hash table
  */
 static TupleTableSlot *
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 9cae27b..e393e5a 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -1495,9 +1495,13 @@ cost_agg(Path *path, PlannerInfo *root,
 		total_cost = startup_cost + cpu_tuple_cost;
 		output_tuples = 1;
 	}
-	else if (aggstrategy == AGG_SORTED)
+	else if (aggstrategy == AGG_SORTED || aggcosts->numAggs == 0)
 	{
-		/* Here we are able to deliver output on-the-fly */
+		/*
+		 * Here we are able to deliver output on-the-fly.
+		 * If there are no aggregates, the executor can start outputing
+		 * distinct tuples as it finds them even for hashed aggregates.
+		 */
 		startup_cost = input_startup_cost;
 		total_cost = input_total_cost;
 		/* calcs phrased this way to match HASHED case, see note above */
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index dcf32c0..8bdaf59 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -2321,6 +2321,7 @@ choose_hashed_grouping(PlannerInfo *root,
 	List	   *current_pathkeys;
 	Path		hashed_p;
 	Path		sorted_p;
+	double		returnedGroups = dNumGroups;
 
 	/*
 	 * Executor doesn't support hashed aggregation with DISTINCT or ORDER BY
@@ -2362,7 +2363,14 @@ choose_hashed_grouping(PlannerInfo *root,
 	/* plus the per-hash-entry overhead */
 	hashentrysize += hash_agg_entry_size(agg_costs->numAggs);
 
-	if (hashentrysize * dNumGroups > work_mem * 1024L)
+	/* We don't need the whole hashtable if we can return rows on the fly */
+	if (tuple_fraction >= 1.0)
+		tuple_fraction /= dNumGroups;
+
+	if (!parse->hasAggs)
+		returnedGroups = tuple_fraction * dNumGroups;
+
+	if (hashentrysize * returnedGroups > work_mem * 1024L)
 		return false;
 
 	/*
@@ -2444,9 +2452,6 @@ choose_hashed_grouping(PlannerInfo *root,
 	 * Now make the decision using the top-level tuple fraction.  First we
 	 * have to convert an absolute count (LIMIT) into fractional form.
 	 */
-	if (tuple_fraction >= 1.0)
-		tuple_fraction /= dNumGroups;
-
 	if (compare_fractional_path_costs(&hashed_p, &sorted_p,
 									  tuple_fraction) < 0)
 	{
@@ -2528,7 +2533,10 @@ choose_hashed_distinct(PlannerInfo *root,
 	 */
 	hashentrysize = MAXALIGN(path_width) + MAXALIGN(sizeof(MinimalTupleData));
 
-	if (hashentrysize * dNumDistinctRows > work_mem * 1024L)
+	if (tuple_fraction >= 1.0)
+		tuple_fraction /= dNumDistinctRows;
+
+	if (hashentrysize * dNumDistinctRows * tuple_fraction > work_mem * 1024L)
 		return false;
 
 	/*
@@ -2595,9 +2603,6 @@ choose_hashed_distinct(PlannerInfo *root,
 	 * Now make the decision using the top-level tuple fraction.  First we
 	 * have to convert an absolute count (LIMIT) into fractional form.
 	 */
-	if (tuple_fraction >= 1.0)
-		tuple_fraction /= dNumDistinctRows;
-
 	if (compare_fractional_path_costs(&hashed_p, &sorted_p,
 									  tuple_fraction) < 0)
 	{
