diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index 129fc3d..83cb70c 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -27,9 +27,15 @@
 #include "optimizer/paths.h"
 #include "optimizer/planmain.h"
 #include "optimizer/var.h"
+#include "optimizer/clauses.h"
+#include "parser/parsetree.h"
+#include "optimizer/tlist.h"
+#include "nodes/nodeFuncs.h"
 
 /* local functions */
 static bool join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo);
+static bool	sortclause_is_unique_on_restrictinfo(Query *query,
+												 List *clause_list, List *sortclause);
 static void remove_rel_from_query(PlannerInfo *root, int relid,
 					  Relids joinrelids);
 static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved);
@@ -154,11 +160,13 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo)
 
 	/*
 	 * Currently, we only know how to remove left joins to a baserel with
-	 * unique indexes.  We can check most of these criteria pretty trivially
-	 * to avoid doing useless extra work.  But checking whether any of the
-	 * indexes are unique would require iterating over the indexlist, so for
-	 * now we just make sure there are indexes of some sort or other.  If none
-	 * of them are unique, join removal will still fail, just slightly later.
+	 * unique indexes and left joins to a subquery where the subquery is
+	 * unique on the join condition. We can check most of these criteria
+	 * pretty trivially to avoid doing useless extra work.  But checking
+	 * whether any of the indexes are unique would require iterating over
+	 * the indexlist, so for now, if we're joining to a relation, we'll just
+	 * ensure that we have at least 1 index, it won't matter if that index
+	 * is unique at this stage, we'll check those details later.
 	 */
 	if (sjinfo->jointype != JOIN_LEFT ||
 		sjinfo->delay_upper_joins ||
@@ -168,11 +176,17 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo)
 	innerrelid = bms_singleton_member(sjinfo->min_righthand);
 	innerrel = find_base_rel(root, innerrelid);
 
-	if (innerrel->reloptkind != RELOPT_BASEREL ||
-		innerrel->rtekind != RTE_RELATION ||
-		innerrel->indexlist == NIL)
+	if (innerrel->reloptkind != RELOPT_BASEREL)
 		return false;
 
+	if (innerrel->rtekind == RTE_RELATION)
+	{
+		if (innerrel->indexlist == NIL)
+			return false; /* no possibility of a unique index */
+	}
+	else if (innerrel->rtekind != RTE_SUBQUERY)
+		return false; /* unsupported rtekind */
+
 	/* Compute the relid set for the join we are considering */
 	joinrelids = bms_union(sjinfo->min_lefthand, sjinfo->min_righthand);
 
@@ -276,16 +290,143 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo)
 	 */
 
 	/* Now examine the indexes to see if we have a matching unique index */
-	if (relation_has_unique_index_for(root, innerrel, clause_list, NIL, NIL))
+	if (innerrel->rtekind == RTE_RELATION &&
+		relation_has_unique_index_for(root, innerrel, clause_list, NIL, NIL))
 		return true;
 
 	/*
+	 * We can be certain that the sub query contains no duplicate values for
+	 * the join clause if item in the sub query's GROUP BY clause is also used
+	 * in the join clause using equality. This works the same way for the
+	 * DISTINCT clause. We need not pay any attention to WHERE or HAVING
+	 * clauses as these just restrict the results more and could not be the
+	 * cause of duplication in the result set. However there are a number of
+	 * pre-checks we must perform before we even look at the GROUP BY or DISTINCT
+	 * clauses. These are described below.
+	 */
+	if (innerrel->rtekind == RTE_SUBQUERY)
+	{
+		Query *subquery = root->simple_rte_array[innerrelid]->subquery;
+
+		/*
+		 * We cannot remove the subquery if the target list contains any set returning
+		 * functions as these may cause the query not to be unique on the grouping
+		 * columns, as per the following example:
+		 * select a.a,generate_series(1,10) from (values(1)) a(a) group by a
+		 */
+		if (expression_returns_set((Node *) subquery->targetList))
+			return false;
+
+		/*
+		 * Don't remove the join if the target list contains any volatile functions.
+		 * Doing so may remove desired side affects that calls to the function may
+		 * cause.
+		 */
+		if (contain_volatile_functions((Node *) subquery->targetList))
+			return false;
+
+		/*
+		 * It should be safe to remove the join if all the group by expressions have matching
+		 * items in the join condition.
+		 */
+		if (subquery->groupClause != NIL &&
+			sortclause_is_unique_on_restrictinfo(subquery, clause_list, subquery->groupClause))
+			return true;
+
+		/*
+		 * It should be safe to remove the join if all the distinct column list have matching
+		 * items in the join condition.
+		 */
+		if (subquery->distinctClause != NIL &&
+			sortclause_is_unique_on_restrictinfo(subquery, clause_list, subquery->distinctClause))
+			return true;
+
+		/*
+		 * Note that we must also not remove the join in the subquery contains
+		 * a FOR UDPATE. We can actually skip this check as GROUP BY or DISTINCT
+		 * cannot be used at the same time as FOR UPDATE
+		 */
+	}
+	/* XXX is this comment still needed??
 	 * Some day it would be nice to check for other methods of establishing
 	 * distinctness.
 	 */
 	return false;
 }
 
+/*
+ * sortclause_is_unique_on_restrictinfo
+ *
+ * Checks to see if all items in sortclause also exist in clause_list.
+ * The function will return true if clause_list is the same as or a superset
+ * of the sortclause. If the sortclause has columns that don't exist in the
+ * clause_list then the query can't be guaranteed unique on the clause_list
+ * columns. Also if the targetList expression contains any volatile functions
+ * then we return false as something like:
+ * SELECT DISTINCT a+random() FROM (VALUES(1),(1)) a(a);
+ * will almost always return more than 1 row.
+ *
+ * Note: The calling function must ensure that sortclause is not NIL.
+ */
+static bool
+sortclause_is_unique_on_restrictinfo(Query *query, List *clause_list, List *sortclause)
+{
+	ListCell *l;
+
+	Assert(sortclause != NIL);
+
+	/*
+	 * Loop over each sort clause to ensure that we have
+	 * an item in the join conditions that matches it.
+	 * It does not matter if we have more items in the join
+	 * condition than we have in the sort clause.
+	 */
+	foreach(l, sortclause)
+	{
+		ListCell		*ri;
+		SortGroupClause *scl = (SortGroupClause *) lfirst(l);
+		TargetEntry		*sortTarget;
+		bool			 matched = false;
+
+		/* lookup the target list entry for the current sort sort group ref */
+		sortTarget = get_sortgroupref_tle(scl->tleSortGroupRef, query->targetList);
+
+		/*
+		 * Since a constant only has 1 value the existence of one here will
+		 * not cause any duplication of the results. We'll simply ignore it!
+		 */
+		if (IsA(sortTarget->expr, Const))
+			continue;
+
+		foreach(ri, clause_list)
+		{
+			RestrictInfo *rinfo = (RestrictInfo *) lfirst(ri);
+			Node	   *rexpr;
+
+			if (rinfo->outer_is_left)
+				rexpr = get_rightop(rinfo->clause);
+			else
+				rexpr = get_leftop(rinfo->clause);
+
+			if (IsA(rexpr, Var))
+			{
+				Var *var = (Var *)rexpr;
+
+				if (var->varattno == sortTarget->resno)
+				{
+					matched = true;
+					break; /* match found */
+				}
+			}
+			else /* XXX what else could it be? */
+				return false;
+		}
+
+		if (!matched)
+			return false;
+	}
+	return true;
+}
 
 /*
  * Remove the target relid from the planner's data structures, having
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 934488a..ff13c76 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -3060,9 +3060,11 @@ begin;
 CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int);
 CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int);
 CREATE TEMP TABLE c (id int PRIMARY KEY);
+CREATE TEMP TABLE d (a INT, b INT);
 INSERT INTO a VALUES (0, 0), (1, NULL);
 INSERT INTO b VALUES (0, 0), (1, NULL);
 INSERT INTO c VALUES (0), (1);
+INSERT INTO d VALUES (1,3),(2,2),(3,1);
 -- all three cases should be optimizable into a simple seqscan
 explain (costs off) SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id;
   QUERY PLAN   
@@ -3098,6 +3100,151 @@ select id from a where id in (
          ->  Seq Scan on b
 (5 rows)
 
+-- check optimization of outer join when joining a unique sub query using group by
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id) b ON a.id = b.id;
+  QUERY PLAN   
+---------------
+ Seq Scan on a
+(1 row)
+
+-- check optimization of outer join when joining a unique sub query using distinct
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id FROM b) b ON a.id = b.c_id;
+  QUERY PLAN   
+---------------
+ Seq Scan on a
+(1 row)
+
+-- check optimization of outer join when joining a unique sub query using distinct
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b AS ab FROM d) d ON a.id = d.ab;
+  QUERY PLAN   
+---------------
+ Seq Scan on a
+(1 row)
+
+-- optimization is not possible when distinct contains a volatile function
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b+random() AS abr FROM d) d ON a.id = d.abr;
+                                        QUERY PLAN                                        
+------------------------------------------------------------------------------------------
+ Hash Left Join
+   Hash Cond: ((a.id)::double precision = ((((d.a + d.b))::double precision + random())))
+   ->  Seq Scan on a
+   ->  Hash
+         ->  HashAggregate
+               Group Key: (((d.a + d.b))::double precision + random())
+               ->  Seq Scan on d
+(7 rows)
+
+-- check optimization of outer join when joining a unique sub query using distinct
+-- and a constant expression.
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id,1 AS dummy FROM b) b ON a.id = b.c_id;
+  QUERY PLAN   
+---------------
+ Seq Scan on a
+(1 row)
+
+-- check optimization of outer join when joining a unique sub query which contains 2 tables
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id;
+  QUERY PLAN   
+---------------
+ Seq Scan on a
+(1 row)
+
+-- check optimization of outer join when joining a unique sub query which contains 2 tables
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id + 10 = b.id;
+  QUERY PLAN   
+---------------
+ Seq Scan on a
+(1 row)
+
+-- check optimization of outer join when joining a unique sub query which contains 2 tables
+-- and contains a redundant join clause
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a
+LEFT JOIN (SELECT b.id,1 as dummy FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id AND b.dummy = 1;
+  QUERY PLAN   
+---------------
+ Seq Scan on a
+(1 row)
+
+-- optimization is not possible when the group by contains a column which is not in the join condition.
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id,b.c_id) b ON a.id = b.id;
+           QUERY PLAN            
+---------------------------------
+ Hash Right Join
+   Hash Cond: (b.id = a.id)
+   ->  HashAggregate
+         Group Key: b.id, b.c_id
+         ->  Seq Scan on b
+   ->  Hash
+         ->  Seq Scan on a
+(7 rows)
+
+-- optimization is not possible when distinct clause contains an item that is not in the join clause
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,c_id FROM b) b ON a.id = b.id;
+           QUERY PLAN            
+---------------------------------
+ Hash Right Join
+   Hash Cond: (b.id = a.id)
+   ->  HashAggregate
+         Group Key: b.id, b.c_id
+         ->  Seq Scan on b
+   ->  Hash
+         ->  Seq Scan on a
+(7 rows)
+
+-- optimization is not possible when distinct contains a volatile function
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,random() AS r FROM b) b ON a.id = b.id AND r = random();
+                   QUERY PLAN                    
+-------------------------------------------------
+ Hash Left Join
+   Hash Cond: (a.id = b.id)
+   ->  Seq Scan on a
+   ->  Hash
+         ->  Subquery Scan on b
+               Filter: (b.r = random())
+               ->  HashAggregate
+                     Group Key: b_1.id, random()
+                     ->  Seq Scan on b b_1
+(9 rows)
+
+-- optimization is not possible when there are any volatile functions in the target list.
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT id,AVG(c_id),SUM(random()) FROM b GROUP BY id) b ON a.id = b.id;
+         QUERY PLAN         
+----------------------------
+ Hash Right Join
+   Hash Cond: (b.id = a.id)
+   ->  HashAggregate
+         Group Key: b.id
+         ->  Seq Scan on b
+   ->  Hash
+         ->  Seq Scan on a
+(7 rows)
+
+-- optimization is not possible when there are set returning functions in the target list.
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT id,generate_series(1,2) FROM b GROUP BY id) b ON a.id = b.id;
+         QUERY PLAN         
+----------------------------
+ Hash Right Join
+   Hash Cond: (b.id = a.id)
+   ->  HashAggregate
+         Group Key: b.id
+         ->  Seq Scan on b
+   ->  Hash
+         ->  Seq Scan on a
+(7 rows)
+
 rollback;
 create temp table parent (k int primary key, pd int);
 create temp table child (k int unique, cd int);
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index 275cb11..d00e7fe 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -861,9 +861,11 @@ begin;
 CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int);
 CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int);
 CREATE TEMP TABLE c (id int PRIMARY KEY);
+CREATE TEMP TABLE d (a INT, b INT);
 INSERT INTO a VALUES (0, 0), (1, NULL);
 INSERT INTO b VALUES (0, 0), (1, NULL);
 INSERT INTO c VALUES (0), (1);
+INSERT INTO d VALUES (1,3),(2,2),(3,1);
 
 -- all three cases should be optimizable into a simple seqscan
 explain (costs off) SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id;
@@ -878,6 +880,61 @@ select id from a where id in (
 	select b.id from b left join c on b.id = c.id
 );
 
+-- check optimization of outer join when joining a unique sub query using group by
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id) b ON a.id = b.id;
+
+-- check optimization of outer join when joining a unique sub query using distinct
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id FROM b) b ON a.id = b.c_id;
+
+-- check optimization of outer join when joining a unique sub query using distinct
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b AS ab FROM d) d ON a.id = d.ab;
+
+-- optimization is not possible when distinct contains a volatile function
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b+random() AS abr FROM d) d ON a.id = d.abr;
+
+-- check optimization of outer join when joining a unique sub query using distinct
+-- and a constant expression.
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id,1 AS dummy FROM b) b ON a.id = b.c_id;
+
+-- check optimization of outer join when joining a unique sub query which contains 2 tables
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id;
+
+-- check optimization of outer join when joining a unique sub query which contains 2 tables
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id + 10 = b.id;
+
+-- check optimization of outer join when joining a unique sub query which contains 2 tables
+-- and contains a redundant join clause
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a
+LEFT JOIN (SELECT b.id,1 as dummy FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id AND b.dummy = 1;
+
+-- optimization is not possible when the group by contains a column which is not in the join condition.
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id,b.c_id) b ON a.id = b.id;
+
+-- optimization is not possible when distinct clause contains an item that is not in the join clause
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,c_id FROM b) b ON a.id = b.id;
+
+-- optimization is not possible when distinct contains a volatile function
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,random() AS r FROM b) b ON a.id = b.id AND r = random();
+
+-- optimization is not possible when there are any volatile functions in the target list.
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT id,AVG(c_id),SUM(random()) FROM b GROUP BY id) b ON a.id = b.id;
+
+-- optimization is not possible when there are set returning functions in the target list.
+EXPLAIN (COSTS OFF)
+SELECT a.id FROM a LEFT JOIN (SELECT id,generate_series(1,2) FROM b GROUP BY id) b ON a.id = b.id;
+
 rollback;
 
 create temp table parent (k int primary key, pd int);