diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 0a2562c149..00f80e4f34 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -2809,6 +2809,96 @@ cost_agg(Path *path, PlannerInfo *root,
 	path->total_cost = total_cost;
 }
 
+/*
+ * get_windowclause_startup_tuples
+ *		Determine how many tuples we'll need to fetch from a WindowAgg's
+ *		subnode before we can output the first WindowAgg tuple.
+ */
+static double
+get_windowclause_startup_tuples(PlannerInfo *root, WindowClause *wc,
+								double input_tuples)
+{
+	double partition_tuples;
+
+	/* Figure out how many partitions there are likely to be */
+	if (wc->partitionClause != NIL)
+	{
+		double	num_partitions;
+		List   *partexprs = get_sortgrouplist_exprs(wc->partitionClause,
+													root->parse->targetList);
+
+		num_partitions = estimate_num_groups(root, partexprs, input_tuples,
+											 NULL, NULL);
+		list_free(partexprs);
+
+		partition_tuples = input_tuples / num_partitions;
+	}
+	else
+	{
+		/* all tuples belong to the same partition */
+		partition_tuples = input_tuples;
+	}
+
+	/*
+	 * An empty ORDER BY means all rows are peers and WindowAgg will process
+	 * the entire partition for the first tuple.
+	 */
+	if (wc->orderClause == NIL)
+		return clamp_row_est(partition_tuples);
+
+	/*
+	 * BETWEEN ... AND CURRENT ROW means the executor only needs to read at
+	 * most 1 row from the subnode before outputting the first tuple.
+	 */
+	if (wc->frameOptions & FRAMEOPTION_END_CURRENT_ROW)
+		return 1.0;
+
+	/*
+	 * BETWEEN ... AND UNBOUNDED FOLLOWING means we need to read the entire
+	 * partition before outputting the first tuple.
+	 */
+	if (wc->frameOptions & FRAMEOPTION_END_UNBOUNDED_FOLLOWING)
+		return clamp_row_est(partition_tuples);
+
+	/*
+	 * BETWEEN ... AND N PRECEDING does not need to read from the subnode for
+	 * the first N tuples.  To be safe, we return 1.0.  The endOffset could be
+	 * 0.
+	 */
+	if (wc->frameOptions & FRAMEOPTION_END_OFFSET_PRECEDING)
+		return 1.0;
+
+	/*
+	 * BETWEEN ... AND N FOLLOWING is more complex.  We need to try to
+	 * determine the value of endOffset.
+	 */
+	if (wc->frameOptions & FRAMEOPTION_END_OFFSET_FOLLOWING &&
+		IsA(wc->endOffset, Const))
+	{
+		Const *endOffset = (Const *) wc->endOffset;
+
+		if (endOffset->constisnull)
+		{
+			/*
+			 * This should raise an ERROR during execution, but we'd better do
+			 * something sane here.
+			 */
+			return 1.0;
+		}
+
+		/* XXX do we expect any other type of Const? */
+		else if (endOffset->consttype == INT4OID)
+			return clamp_row_est(Min(partition_tuples,
+									 (double) DatumGetInt32(endOffset->constvalue)));
+		else if (endOffset->consttype == INT8OID)
+			return clamp_row_est(Min(partition_tuples,
+									 (double) DatumGetInt64(endOffset->constvalue)));
+	}
+
+	/* Just assume we'll need DEFAULT_INEQ_SEL of the partition's tuples */
+	return clamp_row_est(partition_tuples * DEFAULT_INEQ_SEL);
+}
+
 /*
  * cost_windowagg
  *		Determines and returns the cost of performing a WindowAgg plan node,
@@ -2818,17 +2908,34 @@ cost_agg(Path *path, PlannerInfo *root,
  */
 void
 cost_windowagg(Path *path, PlannerInfo *root,
-			   List *windowFuncs, int numPartCols, int numOrderCols,
+			   List *windowFuncs, WindowClause *winclause,
 			   Cost input_startup_cost, Cost input_total_cost,
 			   double input_tuples)
 {
 	Cost		startup_cost;
 	Cost		total_cost;
+	double		startup_tuples;
+	int			numPartCols;
+	int			numOrderCols;
 	ListCell   *lc;
 
+	numPartCols = list_length(winclause->partitionClause);
+	numOrderCols = list_length(winclause->orderClause);
+
 	startup_cost = input_startup_cost;
 	total_cost = input_total_cost;
 
+	/*
+	 * Estimate how many tuples we'll need to read from the subnode before we
+	 * can output the first WindowAgg row.
+	 *
+	 * XXX is it worth caching this value in the WindowClause to prevent
+	 * having to calculate it for each path we generate?  The number of
+	 * input_tuples could vary from path to path, however.
+	 */
+	startup_tuples = get_windowclause_startup_tuples(root, winclause,
+													 input_tuples);
+
 	/*
 	 * Window functions are assumed to cost their stated execution cost, plus
 	 * the cost of evaluating their input expressions, per tuple.  Since they
@@ -2880,6 +2987,18 @@ cost_windowagg(Path *path, PlannerInfo *root,
 	path->rows = input_tuples;
 	path->startup_cost = startup_cost;
 	path->total_cost = total_cost;
+
+	/*
+	 * Also take into account how many tuples we need to read from the subnode
+	 * in order to produce the first tuple from the WindowAgg.  To do this we
+	 * proportion the run cost (total cost not including startup costs) over
+	 * the estimated startup tuples.  We already included the startup cost of
+	 * the subnode, so we only need to do this when the estimated startup
+	 * tuples is above 1.0.
+	 */
+	if (startup_tuples > 1.0)
+		path->startup_cost += (total_cost - startup_cost) / input_tuples *
+							  (startup_tuples - 1.0);
 }
 
 /*
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 65a191ebfd..8c4dbff00a 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -3445,8 +3445,7 @@ create_windowagg_path(PlannerInfo *root,
 	 */
 	cost_windowagg(&pathnode->path, root,
 				   windowFuncs,
-				   list_length(winclause->partitionClause),
-				   list_length(winclause->orderClause),
+				   winclause,
 				   subpath->startup_cost,
 				   subpath->total_cost,
 				   subpath->rows);
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 6cf49705d3..bee090ffc2 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -131,7 +131,7 @@ extern void cost_agg(Path *path, PlannerInfo *root,
 					 Cost input_startup_cost, Cost input_total_cost,
 					 double input_tuples, double input_width);
 extern void cost_windowagg(Path *path, PlannerInfo *root,
-						   List *windowFuncs, int numPartCols, int numOrderCols,
+						   List *windowFuncs, WindowClause *winclause,
 						   Cost input_startup_cost, Cost input_total_cost,
 						   double input_tuples);
 extern void cost_group(Path *path, PlannerInfo *root,
diff --git a/src/test/regress/expected/window.out b/src/test/regress/expected/window.out
index 1d4b78b9b2..2628033327 100644
--- a/src/test/regress/expected/window.out
+++ b/src/test/regress/expected/window.out
@@ -4808,6 +4808,80 @@ SELECT i, b, bool_and(b) OVER w, bool_or(b) OVER w
  5 | t | t        | t
 (5 rows)
 
+--
+-- Test WindowAgg costing takes into account the number of rows that need to
+-- be fetched before the first row can be output.
+--
+-- Ensure we get a cheap start up plan as the WindowAgg can output the first
+-- row after reading 1 row from the join.
+EXPLAIN (COSTS OFF)
+SELECT COUNT(*) OVER (ORDER BY t1.unique1)
+FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous
+LIMIT 1;
+                                QUERY PLAN                                
+--------------------------------------------------------------------------
+ Limit
+   ->  WindowAgg
+         ->  Nested Loop
+               ->  Index Only Scan using tenk1_unique1 on tenk1 t1
+               ->  Index Only Scan using tenk1_thous_tenthous on tenk1 t2
+                     Index Cond: (tenthous = t1.unique1)
+(6 rows)
+
+-- Ensure we get a cheap total plan.  Lack of ORDER BY in the WindowClause
+-- means that all rows must be read from the join, so a cheap startup plan
+-- isn't a good choice.
+EXPLAIN (COSTS OFF)
+SELECT COUNT(*) OVER ()
+FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous
+LIMIT 1;
+                                   QUERY PLAN                                   
+--------------------------------------------------------------------------------
+ Limit
+   ->  WindowAgg
+         ->  Hash Join
+               Hash Cond: (t1.unique1 = t2.tenthous)
+               ->  Index Only Scan using tenk1_unique1 on tenk1 t1
+               ->  Hash
+                     ->  Index Only Scan using tenk1_thous_tenthous on tenk1 t2
+(7 rows)
+
+-- Ensure we get a cheap total plan.  This time use UNBOUNDED FOLLOWING, which
+-- needs to read all join rows to output the first WindowAgg row.
+EXPLAIN (COSTS OFF)
+SELECT COUNT(*) OVER (ORDER BY t1.unique1 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
+FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous
+LIMIT 1;
+                                   QUERY PLAN                                   
+--------------------------------------------------------------------------------
+ Limit
+   ->  WindowAgg
+         ->  Merge Join
+               Merge Cond: (t1.unique1 = t2.tenthous)
+               ->  Index Only Scan using tenk1_unique1 on tenk1 t1
+               ->  Sort
+                     Sort Key: t2.tenthous
+                     ->  Index Only Scan using tenk1_thous_tenthous on tenk1 t2
+(8 rows)
+
+-- Ensure we get a cheap total plan.  This time use 10000 FOLLOWING so we need
+-- to read all join rows.
+EXPLAIN (COSTS OFF)
+SELECT COUNT(*) OVER (ORDER BY t1.unique1 ROWS BETWEEN UNBOUNDED PRECEDING AND 10000 FOLLOWING)
+FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous
+LIMIT 1;
+                                   QUERY PLAN                                   
+--------------------------------------------------------------------------------
+ Limit
+   ->  WindowAgg
+         ->  Merge Join
+               Merge Cond: (t1.unique1 = t2.tenthous)
+               ->  Index Only Scan using tenk1_unique1 on tenk1 t1
+               ->  Sort
+                     Sort Key: t2.tenthous
+                     ->  Index Only Scan using tenk1_thous_tenthous on tenk1 t2
+(8 rows)
+
 -- Tests for problems with failure to walk or mutate expressions
 -- within window frame clauses.
 -- test walker (fails with collation error if expressions are not walked)
diff --git a/src/test/regress/sql/window.sql b/src/test/regress/sql/window.sql
index 3ab6ac715d..4789de0937 100644
--- a/src/test/regress/sql/window.sql
+++ b/src/test/regress/sql/window.sql
@@ -1716,6 +1716,40 @@ SELECT i, b, bool_and(b) OVER w, bool_or(b) OVER w
   FROM (VALUES (1,true), (2,true), (3,false), (4,false), (5,true)) v(i,b)
   WINDOW w AS (ORDER BY i ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING);
 
+--
+-- Test WindowAgg costing takes into account the number of rows that need to
+-- be fetched before the first row can be output.
+--
+
+-- Ensure we get a cheap start up plan as the WindowAgg can output the first
+-- row after reading 1 row from the join.
+EXPLAIN (COSTS OFF)
+SELECT COUNT(*) OVER (ORDER BY t1.unique1)
+FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous
+LIMIT 1;
+
+-- Ensure we get a cheap total plan.  Lack of ORDER BY in the WindowClause
+-- means that all rows must be read from the join, so a cheap startup plan
+-- isn't a good choice.
+EXPLAIN (COSTS OFF)
+SELECT COUNT(*) OVER ()
+FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous
+LIMIT 1;
+
+-- Ensure we get a cheap total plan.  This time use UNBOUNDED FOLLOWING, which
+-- needs to read all join rows to output the first WindowAgg row.
+EXPLAIN (COSTS OFF)
+SELECT COUNT(*) OVER (ORDER BY t1.unique1 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
+FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous
+LIMIT 1;
+
+-- Ensure we get a cheap total plan.  This time use 10000 FOLLOWING so we need
+-- to read all join rows.
+EXPLAIN (COSTS OFF)
+SELECT COUNT(*) OVER (ORDER BY t1.unique1 ROWS BETWEEN UNBOUNDED PRECEDING AND 10000 FOLLOWING)
+FROM tenk1 t1 INNER JOIN tenk1 t2 ON t1.unique1 = t2.tenthous
+LIMIT 1;
+
 -- Tests for problems with failure to walk or mutate expressions
 -- within window frame clauses.