missing indexes in indexlist with partitioned tables

A.Roland@index.de

about 4 years ago

In reply to: Arne Roland (#1)

1 attachment(s)

Re: missing indexes in indexlist with partitioned tables

Hi,

I stumbled across a few places that depend on the inheritance appends being applied at a later date, so I quickly abandoned that idea. I thought a bit about the indexlist, in particular the inhparent, and I am not sure what depends on get_relation_info working in that way.

Therefore I propose a new attribute partIndexlist of RelOptInfo to include information about uniqueness, in the case the executor can't use the structure that causes the uniqueness to begin with. Said attribute can be used by relation_has_unique_index_for and rel_supports_distinctness.

The attached patch takes that route. I'd appreciate feedback!

Regards
Arne

Attachments:

partIndexlistClean.patchtext/x-patch; name=partIndexlistClean.patchDownload

diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 05221cc1d6..e3dca4b465 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -2375,7 +2375,7 @@ lazy_vacuum_heap_rel(LVRelState *vacrel)
 
 	ereport(elevel,
 			(errmsg("table \"%s\": removed %lld dead item identifiers in %u pages",
-					vacrel->relname, (long long ) tupindex, vacuumed_pages),
+					vacrel->relname, (long long) tupindex, vacuumed_pages),
 			 errdetail_internal("%s", pg_rusage_show(&ru0))));
 
 	/* Revert to the previous phase information for error traceback */
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f547efd294..94fca8cf8b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -906,7 +906,7 @@ static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
 								TimeLineID prevTLI);
 static void VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec,
 									  XLogReaderState *state);
-static int LocalSetXLogInsertAllowed(void);
+static int	LocalSetXLogInsertAllowed(void);
 static void CreateEndOfRecoveryRecord(void);
 static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn);
 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
@@ -5753,38 +5753,38 @@ CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog)
 	 * We switched to a new timeline. Clean up segments on the old timeline.
 	 *
 	 * If there are any higher-numbered segments on the old timeline, remove
-	 * them. They might contain valid WAL, but they might also be pre-allocated
-	 * files containing garbage. In any case, they are not part of the new
-	 * timeline's history so we don't need them.
+	 * them. They might contain valid WAL, but they might also be
+	 * pre-allocated files containing garbage. In any case, they are not part
+	 * of the new timeline's history so we don't need them.
 	 */
 	RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
 
 	/*
 	 * If the switch happened in the middle of a segment, what to do with the
 	 * last, partial segment on the old timeline? If we don't archive it, and
-	 * the server that created the WAL never archives it either (e.g. because it
-	 * was hit by a meteor), it will never make it to the archive. That's OK
-	 * from our point of view, because the new segment that we created with the
-	 * new TLI contains all the WAL from the old timeline up to the switch
+	 * the server that created the WAL never archives it either (e.g. because
+	 * it was hit by a meteor), it will never make it to the archive. That's
+	 * OK from our point of view, because the new segment that we created with
+	 * the new TLI contains all the WAL from the old timeline up to the switch
 	 * point. But if you later try to do PITR to the "missing" WAL on the old
-	 * timeline, recovery won't find it in the archive. It's physically present
-	 * in the new file with new TLI, but recovery won't look there when it's
-	 * recovering to the older timeline. On the other hand, if we archive the
-	 * partial segment, and the original server on that timeline is still
-	 * running and archives the completed version of the same segment later, it
-	 * will fail. (We used to do that in 9.4 and below, and it caused such
-	 * problems).
+	 * timeline, recovery won't find it in the archive. It's physically
+	 * present in the new file with new TLI, but recovery won't look there
+	 * when it's recovering to the older timeline. On the other hand, if we
+	 * archive the partial segment, and the original server on that timeline
+	 * is still running and archives the completed version of the same segment
+	 * later, it will fail. (We used to do that in 9.4 and below, and it
+	 * caused such problems).
 	 *
-	 * As a compromise, we rename the last segment with the .partial suffix, and
-	 * archive it. Archive recovery will never try to read .partial segments, so
-	 * they will normally go unused. But in the odd PITR case, the administrator
-	 * can copy them manually to the pg_wal directory (removing the suffix).
-	 * They can be useful in debugging, too.
+	 * As a compromise, we rename the last segment with the .partial suffix,
+	 * and archive it. Archive recovery will never try to read .partial
+	 * segments, so they will normally go unused. But in the odd PITR case,
+	 * the administrator can copy them manually to the pg_wal directory
+	 * (removing the suffix). They can be useful in debugging, too.
 	 *
 	 * If a .done or .ready file already exists for the old timeline, however,
-	 * we had already determined that the segment is complete, so we can let it
-	 * be archived normally. (In particular, if it was restored from the archive
-	 * to begin with, it's expected to have a .done file).
+	 * we had already determined that the segment is complete, so we can let
+	 * it be archived normally. (In particular, if it was restored from the
+	 * archive to begin with, it's expected to have a .done file).
 	 */
 	if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
 		XLogArchivingActive())
@@ -8100,10 +8100,10 @@ StartupXLOG(void)
 	 * Emit checkpoint or end-of-recovery record in XLOG, if required.
 	 *
 	 * XLogCtl->lastReplayedEndRecPtr will be a valid LSN if and only if we
-	 * entered recovery. Even if we ultimately replayed no WAL records, it will
-	 * have been initialized based on where replay was due to start.  We don't
-	 * need a lock to access this, since this can't change any more by the time
-	 * we reach this code.
+	 * entered recovery. Even if we ultimately replayed no WAL records, it
+	 * will have been initialized based on where replay was due to start.  We
+	 * don't need a lock to access this, since this can't change any more by
+	 * the time we reach this code.
 	 */
 	if (!XLogRecPtrIsInvalid(XLogCtl->lastReplayedEndRecPtr))
 		promoted = PerformRecoveryXLogAction();
@@ -8292,15 +8292,15 @@ PerformRecoveryXLogAction(void)
 	/*
 	 * Perform a checkpoint to update all our recovery activity to disk.
 	 *
-	 * Note that we write a shutdown checkpoint rather than an on-line one. This
-	 * is not particularly critical, but since we may be assigning a new TLI,
-	 * using a shutdown checkpoint allows us to have the rule that TLI only
-	 * changes in shutdown checkpoints, which allows some extra error checking
-	 * in xlog_redo.
+	 * Note that we write a shutdown checkpoint rather than an on-line one.
+	 * This is not particularly critical, but since we may be assigning a new
+	 * TLI, using a shutdown checkpoint allows us to have the rule that TLI
+	 * only changes in shutdown checkpoints, which allows some extra error
+	 * checking in xlog_redo.
 	 *
-	 * In promotion, only create a lightweight end-of-recovery record instead of
-	 * a full checkpoint. A checkpoint is requested later, after we're fully out
-	 * of recovery mode and already accepting queries.
+	 * In promotion, only create a lightweight end-of-recovery record instead
+	 * of a full checkpoint. A checkpoint is requested later, after we're
+	 * fully out of recovery mode and already accepting queries.
 	 */
 	if (ArchiveRecoveryRequested && IsUnderPostmaster &&
 		LocalPromoteIsTriggered)
@@ -8310,11 +8310,11 @@ PerformRecoveryXLogAction(void)
 		/*
 		 * Insert a special WAL record to mark the end of recovery, since we
 		 * aren't doing a checkpoint. That means that the checkpointer process
-		 * may likely be in the middle of a time-smoothed restartpoint and could
-		 * continue to be for minutes after this.  That sounds strange, but the
-		 * effect is roughly the same and it would be stranger to try to come
-		 * out of the restartpoint and then checkpoint. We request a checkpoint
-		 * later anyway, just for safety.
+		 * may likely be in the middle of a time-smoothed restartpoint and
+		 * could continue to be for minutes after this.  That sounds strange,
+		 * but the effect is roughly the same and it would be stranger to try
+		 * to come out of the restartpoint and then checkpoint. We request a
+		 * checkpoint later anyway, just for safety.
 		 */
 		CreateEndOfRecoveryRecord();
 	}
@@ -8486,7 +8486,7 @@ XLogInsertAllowed(void)
 static int
 LocalSetXLogInsertAllowed(void)
 {
-	int		oldXLogAllowed = LocalXLogInsertAllowed;
+	int			oldXLogAllowed = LocalXLogInsertAllowed;
 
 	LocalXLogInsertAllowed = 1;
 
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 81cc39fb70..97ddd2b24a 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -3550,9 +3550,8 @@ restart:
 		/*
 		 * If this constraint has a parent constraint which we have not seen
 		 * yet, keep track of it for the second loop, below.  Tracking parent
-		 * constraints allows us to climb up to the top-level constraint
-		 * and look for all possible relations referencing the partitioned
-		 * table.
+		 * constraints allows us to climb up to the top-level constraint and
+		 * look for all possible relations referencing the partitioned table.
 		 */
 		if (OidIsValid(con->conparentid) &&
 			!list_member_oid(parent_cons, con->conparentid))
diff --git a/src/backend/catalog/objectaddress.c b/src/backend/catalog/objectaddress.c
index 2bae3fbb17..86b9813300 100644
--- a/src/backend/catalog/objectaddress.c
+++ b/src/backend/catalog/objectaddress.c
@@ -856,7 +856,7 @@ const ObjectAddress InvalidObjectAddress =
 };
 
 static ObjectAddress get_object_address_unqualified(ObjectType objtype,
-													String *strval, bool missing_ok);
+													String * strval, bool missing_ok);
 static ObjectAddress get_relation_by_qualified_name(ObjectType objtype,
 													List *object, Relation *relp,
 													LOCKMODE lockmode, bool missing_ok);
@@ -1255,7 +1255,7 @@ get_object_address_rv(ObjectType objtype, RangeVar *rel, List *object,
  */
 static ObjectAddress
 get_object_address_unqualified(ObjectType objtype,
-							   String *strval, bool missing_ok)
+							   String * strval, bool missing_ok)
 {
 	const char *name;
 	ObjectAddress address;
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 4928702aec..9b7bb1b3be 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -428,7 +428,7 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 	 */
 	if (onerel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
 	{
-		List *idxs = RelationGetIndexList(onerel);
+		List	   *idxs = RelationGetIndexList(onerel);
 
 		Irel = NULL;
 		nindexes = 0;
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index 9d22f648a8..ffba47926d 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -1529,8 +1529,8 @@ finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap,
 
 			/*
 			 * Reset the relrewrite for the toast. The command-counter
-			 * increment is required here as we are about to update
-			 * the tuple that is updated as part of RenameRelationInternal.
+			 * increment is required here as we are about to update the tuple
+			 * that is updated as part of RenameRelationInternal.
 			 */
 			CommandCounterIncrement();
 			ResetRelRewrite(newrel->rd_rel->reltoastrelid);
diff --git a/src/backend/commands/publicationcmds.c b/src/backend/commands/publicationcmds.c
index d1fff13d2e..21bf50a81c 100644
--- a/src/backend/commands/publicationcmds.c
+++ b/src/backend/commands/publicationcmds.c
@@ -813,7 +813,7 @@ RemovePublicationById(Oid pubid)
 	if (!HeapTupleIsValid(tup))
 		elog(ERROR, "cache lookup failed for publication %u", pubid);
 
-	pubform = (Form_pg_publication)GETSTRUCT(tup);
+	pubform = (Form_pg_publication) GETSTRUCT(tup);
 
 	/* Invalidate relcache so that publication info is rebuilt. */
 	if (pubform->puballtables)
diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c
index 8f1550ec80..e2fe1e174e 100644
--- a/src/backend/commands/statscmds.c
+++ b/src/backend/commands/statscmds.c
@@ -261,9 +261,9 @@ CreateStatistics(CreateStatsStmt *stmt)
 			nattnums++;
 			ReleaseSysCache(atttuple);
 		}
-		else if (IsA(selem->expr, Var))	/* column reference in parens */
+		else if (IsA(selem->expr, Var)) /* column reference in parens */
 		{
-			Var *var = (Var *) selem->expr;
+			Var		   *var = (Var *) selem->expr;
 			TypeCacheEntry *type;
 
 			/* Disallow use of system attributes in extended stats */
@@ -300,10 +300,11 @@ CreateStatistics(CreateStatsStmt *stmt)
 			while ((k = bms_next_member(attnums, k)) >= 0)
 			{
 				AttrNumber	attnum = k + FirstLowInvalidHeapAttributeNumber;
+
 				if (attnum <= 0)
 					ereport(ERROR,
-						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-						 errmsg("statistics creation on system columns is not supported")));
+							(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+							 errmsg("statistics creation on system columns is not supported")));
 			}
 
 			/*
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 857cc5ce6e..28a051f4ff 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -17525,12 +17525,12 @@ ATExecAttachPartition(List **wqueue, Relation rel, PartitionCmd *cmd,
 	/*
 	 * If the partition we just attached is partitioned itself, invalidate
 	 * relcache for all descendent partitions too to ensure that their
-	 * rd_partcheck expression trees are rebuilt; partitions already locked
-	 * at the beginning of this function.
+	 * rd_partcheck expression trees are rebuilt; partitions already locked at
+	 * the beginning of this function.
 	 */
 	if (attachrel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
 	{
-		ListCell *l;
+		ListCell   *l;
 
 		foreach(l, attachrel_children)
 		{
@@ -18232,13 +18232,13 @@ DetachPartitionFinalize(Relation rel, Relation partRel, bool concurrent,
 	/*
 	 * If the partition we just detached is partitioned itself, invalidate
 	 * relcache for all descendent partitions too to ensure that their
-	 * rd_partcheck expression trees are rebuilt; must lock partitions
-	 * before doing so, using the same lockmode as what partRel has been
-	 * locked with by the caller.
+	 * rd_partcheck expression trees are rebuilt; must lock partitions before
+	 * doing so, using the same lockmode as what partRel has been locked with
+	 * by the caller.
 	 */
 	if (partRel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
 	{
-		List   *children;
+		List	   *children;
 
 		children = find_all_inheritors(RelationGetRelid(partRel),
 									   AccessExclusiveLock, NULL);
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index d328856ae5..d13fbf4590 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -715,7 +715,7 @@ ExecInsert(ModifyTableState *mtstate,
 			{
 				TupleDesc	tdesc = CreateTupleDescCopy(slot->tts_tupleDescriptor);
 				TupleDesc	plan_tdesc =
-					CreateTupleDescCopy(planSlot->tts_tupleDescriptor);
+				CreateTupleDescCopy(planSlot->tts_tupleDescriptor);
 
 				resultRelInfo->ri_Slots[resultRelInfo->ri_NumSlots] =
 					MakeSingleTupleTableSlot(tdesc, slot->tts_ops);
diff --git a/src/backend/jit/llvm/llvmjit.c b/src/backend/jit/llvm/llvmjit.c
index 169dad96d7..4e7212e7bf 100644
--- a/src/backend/jit/llvm/llvmjit.c
+++ b/src/backend/jit/llvm/llvmjit.c
@@ -890,8 +890,8 @@ llvm_shutdown(int code, Datum arg)
 	 * has occurred in the middle of LLVM code. It is not safe to call back
 	 * into LLVM (which is why a FATAL error was thrown).
 	 *
-	 * We do need to shutdown LLVM in other shutdown cases, otherwise
-	 * e.g. profiling data won't be written out.
+	 * We do need to shutdown LLVM in other shutdown cases, otherwise e.g.
+	 * profiling data won't be written out.
 	 */
 	if (llvm_in_fatal_on_oom())
 	{
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 82464c9889..232cf37d22 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -4925,9 +4925,9 @@ _copyExtensibleNode(const ExtensibleNode *from)
  * ****************************************************************
  */
 static Integer *
-_copyInteger(const Integer *from)
+_copyInteger(const Integer * from)
 {
-	Integer	   *newnode = makeNode(Integer);
+	Integer    *newnode = makeNode(Integer);
 
 	COPY_SCALAR_FIELD(val);
 
@@ -4935,7 +4935,7 @@ _copyInteger(const Integer *from)
 }
 
 static Float *
-_copyFloat(const Float *from)
+_copyFloat(const Float * from)
 {
 	Float	   *newnode = makeNode(Float);
 
@@ -4945,7 +4945,7 @@ _copyFloat(const Float *from)
 }
 
 static String *
-_copyString(const String *from)
+_copyString(const String * from)
 {
 	String	   *newnode = makeNode(String);
 
@@ -4955,9 +4955,9 @@ _copyString(const String *from)
 }
 
 static BitString *
-_copyBitString(const BitString *from)
+_copyBitString(const BitString * from)
 {
-	BitString   *newnode = makeNode(BitString);
+	BitString  *newnode = makeNode(BitString);
 
 	COPY_STRING_FIELD(val);
 
diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c
index f537d3eb96..fca72c8e45 100644
--- a/src/backend/nodes/equalfuncs.c
+++ b/src/backend/nodes/equalfuncs.c
@@ -2437,8 +2437,7 @@ static bool
 _equalA_Const(const A_Const *a, const A_Const *b)
 {
 	/*
-	 * Hack for in-line val field.  Also val is not valid is isnull is
-	 * true.
+	 * Hack for in-line val field.  Also val is not valid is isnull is true.
 	 */
 	if (!a->isnull && !b->isnull &&
 		!equal(&a->val, &b->val))
@@ -3122,7 +3121,7 @@ _equalList(const List *a, const List *b)
  */
 
 static bool
-_equalInteger(const Integer *a, const Integer *b)
+_equalInteger(const Integer * a, const Integer * b)
 {
 	COMPARE_SCALAR_FIELD(val);
 
@@ -3130,7 +3129,7 @@ _equalInteger(const Integer *a, const Integer *b)
 }
 
 static bool
-_equalFloat(const Float *a, const Float *b)
+_equalFloat(const Float * a, const Float * b)
 {
 	COMPARE_STRING_FIELD(val);
 
@@ -3138,7 +3137,7 @@ _equalFloat(const Float *a, const Float *b)
 }
 
 static bool
-_equalString(const String *a, const String *b)
+_equalString(const String * a, const String * b)
 {
 	COMPARE_STRING_FIELD(val);
 
@@ -3146,7 +3145,7 @@ _equalString(const String *a, const String *b)
 }
 
 static bool
-_equalBitString(const BitString *a, const BitString *b)
+_equalBitString(const BitString * a, const BitString * b)
 {
 	COMPARE_STRING_FIELD(val);
 
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 2e5ed77e18..bd5aba02ef 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -3414,27 +3414,27 @@ _outA_Expr(StringInfo str, const A_Expr *node)
 }
 
 static void
-_outInteger(StringInfo str, const Integer *node)
+_outInteger(StringInfo str, const Integer * node)
 {
 	appendStringInfo(str, "%d", node->val);
 }
 
 static void
-_outFloat(StringInfo str, const Float *node)
+_outFloat(StringInfo str, const Float * node)
 {
 	/*
-	 * We assume the value is a valid numeric literal and so does not
-	 * need quoting.
+	 * We assume the value is a valid numeric literal and so does not need
+	 * quoting.
 	 */
 	appendStringInfoString(str, node->val);
 }
 
 static void
-_outString(StringInfo str, const String *node)
+_outString(StringInfo str, const String * node)
 {
 	/*
-	 * We use outToken to provide escaping of the string's content,
-	 * but we don't want it to do anything with an empty string.
+	 * We use outToken to provide escaping of the string's content, but we
+	 * don't want it to do anything with an empty string.
 	 */
 	appendStringInfoChar(str, '"');
 	if (node->val[0] != '\0')
@@ -3443,7 +3443,7 @@ _outString(StringInfo str, const String *node)
 }
 
 static void
-_outBitString(StringInfo str, const BitString *node)
+_outBitString(StringInfo str, const BitString * node)
 {
 	/* internal representation already has leading 'b' */
 	appendStringInfoString(str, node->val);
diff --git a/src/backend/nodes/value.c b/src/backend/nodes/value.c
index 515f93c223..980245ed5d 100644
--- a/src/backend/nodes/value.c
+++ b/src/backend/nodes/value.c
@@ -22,7 +22,7 @@
 Integer *
 makeInteger(int i)
 {
-	Integer	   *v = makeNode(Integer);
+	Integer    *v = makeNode(Integer);
 
 	v->val = i;
 	return v;
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 0e4e00eaf0..8475c18560 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -3517,7 +3517,7 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
 	Assert(list_length(exprlist) == list_length(oprlist));
 
 	/* Short-circuit if no indexes... */
-	if (rel->indexlist == NIL)
+	if (rel->indexlist == NIL && rel->partIndexlist == NIL)
 		return false;
 
 	/*
@@ -3562,7 +3562,7 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
 		return false;
 
 	/* Examine each index of the relation ... */
-	foreach(ic, rel->indexlist)
+	foreach(ic, list_concat(rel->indexlist, rel->partIndexlist))
 	{
 		IndexOptInfo *ind = (IndexOptInfo *) lfirst(ic);
 		int			c;
diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index 37eb64bcef..3ea53d520f 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -23,6 +23,7 @@
 #include "postgres.h"
 
 #include "nodes/nodeFuncs.h"
+#include "nodes/nodes.h"
 #include "optimizer/clauses.h"
 #include "optimizer/joininfo.h"
 #include "optimizer/optimizer.h"
@@ -598,7 +599,7 @@ rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel)
 		 */
 		ListCell   *lc;
 
-		foreach(lc, rel->indexlist)
+		foreach(lc, list_concat(rel->indexlist, rel->partIndexlist))
 		{
 			IndexOptInfo *ind = (IndexOptInfo *) lfirst(lc);
 
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index c5194fdbbf..71c996582d 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -117,6 +117,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	Relation	relation;
 	bool		hasindex;
 	List	   *indexinfos = NIL;
+	List	   *partIndexinfos = NIL;
 
 	/*
 	 * We need not lock the relation since it was already locked, either by
@@ -163,7 +164,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	else
 		hasindex = relation->rd_rel->relhasindex;
 
-	if (hasindex)
+	if (!(IgnoreSystemIndexes && !IsSystemRelation(relation)) && relation->rd_rel->relhasindex)
 	{
 		List	   *indexoidlist;
 		LOCKMODE	lmode;
@@ -212,10 +213,12 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			}
 
 			/*
-			 * Ignore partitioned indexes, since they are not usable for
-			 * queries.
+			 * Don't add partitioned indexes to the indexlist, since they are
+			 * not usable by the executor. If they are unique add them to the
+			 * partindexlist instead, to use for further pruning. If they
+			 * aren't that either, simply skip them.
 			 */
-			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+			if (inhparent && (!index->indisunique || indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX))
 			{
 				index_close(indexRelation, NoLock);
 				continue;
@@ -263,7 +266,40 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				info->indexcollations[i] = indexRelation->rd_indcollation[i];
 			}
 
-			info->relam = indexRelation->rd_rel->relam;
+			/*
+			 * Fetch the index expressions and predicate, if any.  We must
+			 * modify the copies we obtain from the relcache to have the
+			 * correct varno for the parent relation, so that they match up
+			 * correctly against qual clauses.
+			 */
+			info->indexprs = RelationGetIndexExpressions(indexRelation);
+			info->indpred = RelationGetIndexPredicate(indexRelation);
+			if (info->indexprs && varno != 1)
+				ChangeVarNodes((Node *) info->indexprs, 1, varno, 0);
+			if (info->indpred && varno != 1)
+				ChangeVarNodes((Node *) info->indpred, 1, varno, 0);
+
+			info->unique = index->indisunique;
+			info->immediate = index->indimmediate;
+
+			/*
+			 * Don't add partitioned indexes to the indexlist, add them to the
+			 * partindexlist instead, since they are not usable by the
+			 * executor.
+			 */
+			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+			{
+				index_close(indexRelation, NoLock);
+				partIndexinfos = lappend(partIndexinfos, info);
+				continue;
+			}
+
+			info->hypothetical = false;
+			info->indrestrictinfo = NIL;	/* set later, in indxpath.c */
+			info->predOK = false;	/* set later, in indxpath.c */
+
+			/* Build targetlist using the completed indexprs data */
+			info->indextlist = build_index_tlist(root, info, relation);
 
 			/* We copy just the fields we need, not all of rd_indam */
 			amroutine = indexRelation->rd_indam;
@@ -283,6 +319,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			/* Fetch index opclass options */
 			info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
 
+			info->relam = indexRelation->rd_rel->relam;
+
 			/*
 			 * Fetch the ordering information for the index, if any.
 			 */
@@ -369,28 +407,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				info->nulls_first = NULL;
 			}
 
-			/*
-			 * Fetch the index expressions and predicate, if any.  We must
-			 * modify the copies we obtain from the relcache to have the
-			 * correct varno for the parent relation, so that they match up
-			 * correctly against qual clauses.
-			 */
-			info->indexprs = RelationGetIndexExpressions(indexRelation);
-			info->indpred = RelationGetIndexPredicate(indexRelation);
-			if (info->indexprs && varno != 1)
-				ChangeVarNodes((Node *) info->indexprs, 1, varno, 0);
-			if (info->indpred && varno != 1)
-				ChangeVarNodes((Node *) info->indpred, 1, varno, 0);
-
-			/* Build targetlist using the completed indexprs data */
-			info->indextlist = build_index_tlist(root, info, relation);
-
-			info->indrestrictinfo = NIL;	/* set later, in indxpath.c */
-			info->predOK = false;	/* set later, in indxpath.c */
-			info->unique = index->indisunique;
-			info->immediate = index->indimmediate;
-			info->hypothetical = false;
-
 			/*
 			 * Estimate the index size.  If it's not a partial index, we lock
 			 * the number-of-tuples estimate to equal the parent table; if it
@@ -440,6 +456,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	}
 
 	rel->indexlist = indexinfos;
+	rel->partIndexlist = partIndexinfos;
 
 	rel->statlist = get_relation_statistics(rel, relation);
 
diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c
index 146ee8dd1e..f165531a73 100644
--- a/src/backend/parser/analyze.c
+++ b/src/backend/parser/analyze.c
@@ -2039,8 +2039,8 @@ transformSetOperationTree(ParseState *pstate, SelectStmt *stmt,
 		ListCell   *ltl;
 		ListCell   *rtl;
 		const char *context;
-		bool recursive = (pstate->p_parent_cte &&
-						  pstate->p_parent_cte->cterecursive);
+		bool		recursive = (pstate->p_parent_cte &&
+								 pstate->p_parent_cte->cterecursive);
 
 		context = (stmt->op == SETOP_UNION ? "UNION" :
 				   (stmt->op == SETOP_INTERSECT ? "INTERSECT" :
@@ -2194,7 +2194,10 @@ transformSetOperationTree(ParseState *pstate, SelectStmt *stmt,
 				setup_parser_errposition_callback(&pcbstate, pstate,
 												  bestlocation);
 
-				/* If it's a recursive union, we need to require hashing support. */
+				/*
+				 * If it's a recursive union, we need to require hashing
+				 * support.
+				 */
 				op->groupClauses = lappend(op->groupClauses,
 										   makeSortGroupClauseForSetOp(rescoltype, recursive));
 
diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c
index 078029ba1f..3e450053c5 100644
--- a/src/backend/parser/parse_clause.c
+++ b/src/backend/parser/parse_clause.c
@@ -1998,7 +1998,7 @@ findTargetlistEntrySQL92(ParseState *pstate, Node *node, List **tlist,
 	}
 	if (IsA(node, A_Const))
 	{
-		A_Const	   *aconst = castNode(A_Const, node);
+		A_Const    *aconst = castNode(A_Const, node);
 		int			targetlist_pos = 0;
 		int			target_pos;
 
diff --git a/src/backend/partitioning/partdesc.c b/src/backend/partitioning/partdesc.c
index fb49747f6e..3220d4808d 100644
--- a/src/backend/partitioning/partdesc.c
+++ b/src/backend/partitioning/partdesc.c
@@ -91,8 +91,8 @@ RelationGetPartitionDesc(Relation rel, bool omit_detached)
 	 * cached descriptor too.  We determine that based on the pg_inherits.xmin
 	 * that was saved alongside that descriptor: if the xmin that was not in
 	 * progress for that active snapshot is also not in progress for the
-	 * current active snapshot, then we can use it.  Otherwise build one
-	 * from scratch.
+	 * current active snapshot, then we can use it.  Otherwise build one from
+	 * scratch.
 	 */
 	if (omit_detached &&
 		rel->rd_partdesc_nodetached &&
diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c
index c05f500639..9d63ab4b1f 100644
--- a/src/backend/postmaster/bgworker.c
+++ b/src/backend/postmaster/bgworker.c
@@ -826,9 +826,9 @@ StartBackgroundWorker(void)
 
 	/*
 	 * Create a per-backend PGPROC struct in shared memory, except in the
-	 * EXEC_BACKEND case where this was done in SubPostmasterMain. We must
-	 * do this before we can use LWLocks (and in the EXEC_BACKEND case we
-	 * already had to do some stuff with LWLocks).
+	 * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
+	 * this before we can use LWLocks (and in the EXEC_BACKEND case we already
+	 * had to do some stuff with LWLocks).
 	 */
 #ifndef EXEC_BACKEND
 	InitProcess();
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index b7d0fbaefd..2d1d7e3ac5 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -263,7 +263,7 @@ typedef struct TwoPhasePgStatRecord
 	PgStat_Counter deleted_pre_truncdrop;
 	Oid			t_id;			/* table's OID */
 	bool		t_shared;		/* is it a shared catalog? */
-	bool		t_truncdropped;	/* was the relation truncated/dropped? */
+	bool		t_truncdropped; /* was the relation truncated/dropped? */
 } TwoPhasePgStatRecord;
 
 /*
@@ -361,7 +361,7 @@ static void pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len);
 static void pgstat_recv_analyze(PgStat_MsgAnalyze *msg, int len);
 static void pgstat_recv_archiver(PgStat_MsgArchiver *msg, int len);
 static void pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len);
-static void pgstat_recv_checkpointer(PgStat_MsgCheckpointer *msg, int len);
+static void pgstat_recv_checkpointer(PgStat_MsgCheckpointer * msg, int len);
 static void pgstat_recv_wal(PgStat_MsgWal *msg, int len);
 static void pgstat_recv_slru(PgStat_MsgSLRU *msg, int len);
 static void pgstat_recv_funcstat(PgStat_MsgFuncstat *msg, int len);
@@ -2530,11 +2530,11 @@ AtEOSubXact_PgStat_Relations(PgStat_SubXactStatus *xact_state, bool isCommit, in
 			{
 				/*
 				 * When there isn't an immediate parent state, we can just
-				 * reuse the record instead of going through a
-				 * palloc/pfree pushup (this works since it's all in
-				 * TopTransactionContext anyway).  We have to re-link it
-				 * into the parent level, though, and that might mean
-				 * pushing a new entry into the pgStatXactStack.
+				 * reuse the record instead of going through a palloc/pfree
+				 * pushup (this works since it's all in TopTransactionContext
+				 * anyway).  We have to re-link it into the parent level,
+				 * though, and that might mean pushing a new entry into the
+				 * pgStatXactStack.
 				 */
 				PgStat_SubXactStatus *upper_xact_state;
 
@@ -3244,9 +3244,9 @@ pgstat_send_wal(bool force)
 		WalUsage	walusage;
 
 		/*
-		 * Calculate how much WAL usage counters were increased by
-		 * subtracting the previous counters from the current ones. Fill the
-		 * results in WAL stats message.
+		 * Calculate how much WAL usage counters were increased by subtracting
+		 * the previous counters from the current ones. Fill the results in
+		 * WAL stats message.
 		 */
 		MemSet(&walusage, 0, sizeof(WalUsage));
 		WalUsageAccumDiff(&walusage, &pgWalUsage, &prevWalUsage);
@@ -4077,7 +4077,7 @@ pgstat_read_statsfiles(Oid onlydb, bool permanent, bool deep)
 	bool		found;
 	const char *statfile = permanent ? PGSTAT_STAT_PERMANENT_FILENAME : pgstat_stat_filename;
 	int			i;
-	TimestampTz	ts;
+	TimestampTz ts;
 
 	/*
 	 * The tables will live in pgStatLocalContext.
@@ -5059,6 +5059,7 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len)
 			tabentry->tuples_updated += tabmsg->t_counts.t_tuples_updated;
 			tabentry->tuples_deleted += tabmsg->t_counts.t_tuples_deleted;
 			tabentry->tuples_hot_updated += tabmsg->t_counts.t_tuples_hot_updated;
+
 			/*
 			 * If table was truncated/dropped, first reset the live/dead
 			 * counters.
@@ -5221,7 +5222,10 @@ pgstat_recv_resetsharedcounter(PgStat_MsgResetsharedcounter *msg, int len)
 {
 	if (msg->m_resettarget == RESET_BGWRITER)
 	{
-		/* Reset the global, bgwriter and checkpointer statistics for the cluster. */
+		/*
+		 * Reset the global, bgwriter and checkpointer statistics for the
+		 * cluster.
+		 */
 		memset(&globalStats, 0, sizeof(globalStats));
 		globalStats.bgwriter.stat_reset_timestamp = GetCurrentTimestamp();
 	}
@@ -5501,7 +5505,7 @@ pgstat_recv_bgwriter(PgStat_MsgBgWriter *msg, int len)
  * ----------
  */
 static void
-pgstat_recv_checkpointer(PgStat_MsgCheckpointer *msg, int len)
+pgstat_recv_checkpointer(PgStat_MsgCheckpointer * msg, int len)
 {
 	globalStats.checkpointer.timed_checkpoints += msg->m_timed_checkpoints;
 	globalStats.checkpointer.requested_checkpoints += msg->m_requested_checkpoints;
diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c
index 28e68dd871..1ddc942991 100644
--- a/src/backend/postmaster/startup.c
+++ b/src/backend/postmaster/startup.c
@@ -73,7 +73,7 @@ static volatile sig_atomic_t startup_progress_timer_expired = false;
 /*
  * Time between progress updates for long-running startup operations.
  */
-int log_startup_progress_interval = 10000;	/* 10 sec */
+int			log_startup_progress_interval = 10000;	/* 10 sec */
 
 /* Signal handlers */
 static void StartupProcTriggerHandler(SIGNAL_ARGS);
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c
index 8bf80db8e4..abb0a095bf 100644
--- a/src/backend/statistics/dependencies.c
+++ b/src/backend/statistics/dependencies.c
@@ -354,7 +354,7 @@ statext_dependencies_build(StatsBuildData *data)
 
 	/* result */
 	MVDependencies *dependencies = NULL;
-	MemoryContext	cxt;
+	MemoryContext cxt;
 
 	Assert(data->nattnums >= 2);
 
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index b350fc5f7b..18c7be8195 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -1619,7 +1619,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
 	Assert(mcvlist->nitems <= STATS_MCVLIST_MAX_ITEMS);
 
 	matches = palloc(sizeof(bool) * mcvlist->nitems);
-	memset(matches, !is_or,  sizeof(bool) * mcvlist->nitems);
+	memset(matches, !is_or, sizeof(bool) * mcvlist->nitems);
 
 	/*
 	 * Loop through the list of clauses, and for each of them evaluate all the
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 08ebabfe96..6003431e2c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -666,9 +666,8 @@ ReadRecentBuffer(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum,
 		{
 			/*
 			 * It's now safe to pin the buffer.  We can't pin first and ask
-			 * questions later, because it might confuse code paths
-			 * like InvalidateBuffer() if we pinned a random non-matching
-			 * buffer.
+			 * questions later, because it might confuse code paths like
+			 * InvalidateBuffer() if we pinned a random non-matching buffer.
 			 */
 			if (have_private_ref)
 				PinBuffer(bufHdr, NULL);	/* bump pin count */
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index cb1a8dd34f..40dc4d570f 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -911,7 +911,7 @@ InitFileAccess(void)
 void
 InitTemporaryFileAccess(void)
 {
-	Assert(SizeVfdCache != 0);	/* InitFileAccess() needs to have run*/
+	Assert(SizeVfdCache != 0);	/* InitFileAccess() needs to have run */
 	Assert(!temporary_files_allowed);	/* call me only once */
 
 	/*
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index bd3c7a47fe..eda80c8d97 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -255,7 +255,7 @@ typedef enum GlobalVisHorizonKind
 	VISHORIZON_CATALOG,
 	VISHORIZON_DATA,
 	VISHORIZON_TEMP
-} GlobalVisHorizonKind;
+}			GlobalVisHorizonKind;
 
 
 static ProcArrayStruct *procArray;
diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c
index b4ce9629d4..9bcdb5bfa5 100644
--- a/src/backend/storage/ipc/shm_mq.c
+++ b/src/backend/storage/ipc/shm_mq.c
@@ -534,9 +534,9 @@ shm_mq_sendv(shm_mq_handle *mqh, shm_mq_iovec *iov, int iovcnt, bool nowait,
 	}
 
 	/*
-	 * If the caller has requested force flush or we have written more than 1/4
-	 * of the ring size, mark it as written in shared memory and notify the
-	 * receiver.
+	 * If the caller has requested force flush or we have written more than
+	 * 1/4 of the ring size, mark it as written in shared memory and notify
+	 * the receiver.
 	 */
 	if (force_flush || mqh->mqh_send_pending > (mq->mq_ring_size >> 2))
 	{
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index c25af7fe09..653b371206 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -3237,7 +3237,7 @@ CheckForSessionAndXactLocks(void)
 		LOCKTAG		lock;		/* identifies the lockable object */
 		bool		sessLock;	/* is any lockmode held at session level? */
 		bool		xactLock;	/* is any lockmode held at xact level? */
-	} PerLockTagEntry;
+	}			PerLockTagEntry;
 
 	HASHCTL		hash_ctl;
 	HTAB	   *lockhtab;
diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c
index 660e854e93..58d49179cc 100644
--- a/src/backend/utils/adt/arrayfuncs.c
+++ b/src/backend/utils/adt/arrayfuncs.c
@@ -3996,7 +3996,8 @@ hash_array(PG_FUNCTION_ARGS)
 
 			/*
 			 * Make fake type cache entry structure.  Note that we can't just
-			 * modify typentry, since that points directly into the type cache.
+			 * modify typentry, since that points directly into the type
+			 * cache.
 			 */
 			record_typentry = palloc0(sizeof(*record_typentry));
 			record_typentry->type_id = element_type;
diff --git a/src/backend/utils/adt/multirangetypes.c b/src/backend/utils/adt/multirangetypes.c
index 7773215564..f3d2e52dbd 100644
--- a/src/backend/utils/adt/multirangetypes.c
+++ b/src/backend/utils/adt/multirangetypes.c
@@ -2651,7 +2651,7 @@ multirange_unnest(PG_FUNCTION_ARGS)
 		MultirangeType *mr;
 		TypeCacheEntry *typcache;
 		int			index;
-	} multirange_unnest_fctx;
+	}			multirange_unnest_fctx;
 
 	FuncCallContext *funcctx;
 	multirange_unnest_fctx *fctx;
diff --git a/src/backend/utils/adt/rangetypes_spgist.c b/src/backend/utils/adt/rangetypes_spgist.c
index 912b43f083..9395b32576 100644
--- a/src/backend/utils/adt/rangetypes_spgist.c
+++ b/src/backend/utils/adt/rangetypes_spgist.c
@@ -608,8 +608,8 @@ spg_range_quad_inner_consistent(PG_FUNCTION_ARGS)
 					/*
 					 * Non-empty range A contains non-empty range B if lower
 					 * bound of A is lower or equal to lower bound of range B
-					 * and upper bound of range A is greater than or equal to upper
-					 * bound of range A.
+					 * and upper bound of range A is greater than or equal to
+					 * upper bound of range A.
 					 *
 					 * All non-empty ranges contain an empty range.
 					 */
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
index 777fab4915..24972e3a71 100644
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -166,7 +166,7 @@ typedef struct InvalMessageArray
 {
 	SharedInvalidationMessage *msgs;	/* palloc'd array (can be expanded) */
 	int			maxmsgs;		/* current allocated size of array */
-} InvalMessageArray;
+}			InvalMessageArray;
 
 static InvalMessageArray InvalMessageArrays[2];
 
@@ -175,7 +175,7 @@ typedef struct InvalidationMsgsGroup
 {
 	int			firstmsg[2];	/* first index in relevant array */
 	int			nextmsg[2];		/* last+1 index */
-} InvalidationMsgsGroup;
+}			InvalidationMsgsGroup;
 
 /* Macros to help preserve InvalidationMsgsGroup abstraction */
 #define SetSubGroupToFollow(targetgroup, priorgroup, subgroup) \
@@ -287,7 +287,7 @@ static int	relcache_callback_count = 0;
  * subgroup must be CatCacheMsgs or RelCacheMsgs.
  */
 static void
-AddInvalidationMessage(InvalidationMsgsGroup *group, int subgroup,
+AddInvalidationMessage(InvalidationMsgsGroup * group, int subgroup,
 					   const SharedInvalidationMessage *msg)
 {
 	InvalMessageArray *ima = &InvalMessageArrays[subgroup];
@@ -327,8 +327,8 @@ AddInvalidationMessage(InvalidationMsgsGroup *group, int subgroup,
  * the source subgroup to empty.
  */
 static void
-AppendInvalidationMessageSubGroup(InvalidationMsgsGroup *dest,
-								  InvalidationMsgsGroup *src,
+AppendInvalidationMessageSubGroup(InvalidationMsgsGroup * dest,
+								  InvalidationMsgsGroup * src,
 								  int subgroup)
 {
 	/* Messages must be adjacent in main array */
@@ -392,7 +392,7 @@ AppendInvalidationMessageSubGroup(InvalidationMsgsGroup *dest,
  * Add a catcache inval entry
  */
 static void
-AddCatcacheInvalidationMessage(InvalidationMsgsGroup *group,
+AddCatcacheInvalidationMessage(InvalidationMsgsGroup * group,
 							   int id, uint32 hashValue, Oid dbId)
 {
 	SharedInvalidationMessage msg;
@@ -420,7 +420,7 @@ AddCatcacheInvalidationMessage(InvalidationMsgsGroup *group,
  * Add a whole-catalog inval entry
  */
 static void
-AddCatalogInvalidationMessage(InvalidationMsgsGroup *group,
+AddCatalogInvalidationMessage(InvalidationMsgsGroup * group,
 							  Oid dbId, Oid catId)
 {
 	SharedInvalidationMessage msg;
@@ -438,7 +438,7 @@ AddCatalogInvalidationMessage(InvalidationMsgsGroup *group,
  * Add a relcache inval entry
  */
 static void
-AddRelcacheInvalidationMessage(InvalidationMsgsGroup *group,
+AddRelcacheInvalidationMessage(InvalidationMsgsGroup * group,
 							   Oid dbId, Oid relId)
 {
 	SharedInvalidationMessage msg;
@@ -470,7 +470,7 @@ AddRelcacheInvalidationMessage(InvalidationMsgsGroup *group,
  * We put these into the relcache subgroup for simplicity.
  */
 static void
-AddSnapshotInvalidationMessage(InvalidationMsgsGroup *group,
+AddSnapshotInvalidationMessage(InvalidationMsgsGroup * group,
 							   Oid dbId, Oid relId)
 {
 	SharedInvalidationMessage msg;
@@ -497,8 +497,8 @@ AddSnapshotInvalidationMessage(InvalidationMsgsGroup *group,
  * the source group to empty.
  */
 static void
-AppendInvalidationMessages(InvalidationMsgsGroup *dest,
-						   InvalidationMsgsGroup *src)
+AppendInvalidationMessages(InvalidationMsgsGroup * dest,
+						   InvalidationMsgsGroup * src)
 {
 	AppendInvalidationMessageSubGroup(dest, src, CatCacheMsgs);
 	AppendInvalidationMessageSubGroup(dest, src, RelCacheMsgs);
@@ -511,7 +511,7 @@ AppendInvalidationMessages(InvalidationMsgsGroup *dest,
  * catcache entries are processed first, for reasons mentioned above.
  */
 static void
-ProcessInvalidationMessages(InvalidationMsgsGroup *group,
+ProcessInvalidationMessages(InvalidationMsgsGroup * group,
 							void (*func) (SharedInvalidationMessage *msg))
 {
 	ProcessMessageSubGroup(group, CatCacheMsgs, func(msg));
@@ -523,7 +523,7 @@ ProcessInvalidationMessages(InvalidationMsgsGroup *group,
  * rather than just one at a time.
  */
 static void
-ProcessInvalidationMessagesMulti(InvalidationMsgsGroup *group,
+ProcessInvalidationMessagesMulti(InvalidationMsgsGroup * group,
 								 void (*func) (const SharedInvalidationMessage *msgs, int n))
 {
 	ProcessMessageSubGroupMulti(group, CatCacheMsgs, func(msgs, n));
diff --git a/src/bin/pg_amcheck/pg_amcheck.c b/src/bin/pg_amcheck/pg_amcheck.c
index d4a53c8e63..a3c13fa27e 100644
--- a/src/bin/pg_amcheck/pg_amcheck.c
+++ b/src/bin/pg_amcheck/pg_amcheck.c
@@ -1100,17 +1100,17 @@ verify_btree_slot_handler(PGresult *res, PGconn *conn, void *context)
 
 	if (PQresultStatus(res) == PGRES_TUPLES_OK)
 	{
-		int                     ntups = PQntuples(res);
+		int			ntups = PQntuples(res);
 
 		if (ntups > 1)
 		{
 			/*
 			 * We expect the btree checking functions to return one void row
 			 * each, or zero rows if the check was skipped due to the object
-			 * being in the wrong state to be checked, so we should output some
-			 * sort of warning if we get anything more, not because it
-			 * indicates corruption, but because it suggests a mismatch between
-			 * amcheck and pg_amcheck versions.
+			 * being in the wrong state to be checked, so we should output
+			 * some sort of warning if we get anything more, not because it
+			 * indicates corruption, but because it suggests a mismatch
+			 * between amcheck and pg_amcheck versions.
 			 *
 			 * In conjunction with --progress, anything written to stderr at
 			 * this time would present strangely to the user without an extra
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 27ee6394cf..19b7b1d412 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -1888,26 +1888,26 @@ BaseBackup(void)
 	}
 	if (maxrate > 0)
 		AppendIntegerCommandOption(&buf, use_new_option_syntax, "MAX_RATE",
-									  maxrate);
+								   maxrate);
 	if (format == 't')
 		AppendPlainCommandOption(&buf, use_new_option_syntax, "TABLESPACE_MAP");
 	if (!verify_checksums)
 	{
 		if (use_new_option_syntax)
 			AppendIntegerCommandOption(&buf, use_new_option_syntax,
-										  "VERIFY_CHECKSUMS", 0);
+									   "VERIFY_CHECKSUMS", 0);
 		else
 			AppendPlainCommandOption(&buf, use_new_option_syntax,
-										"NOVERIFY_CHECKSUMS");
+									 "NOVERIFY_CHECKSUMS");
 	}
 
 	if (manifest)
 	{
 		AppendStringCommandOption(&buf, use_new_option_syntax, "MANIFEST",
-									 manifest_force_encode ? "force-encode" : "yes");
+								  manifest_force_encode ? "force-encode" : "yes");
 		if (manifest_checksums != NULL)
 			AppendStringCommandOption(&buf, use_new_option_syntax,
-										 "MANIFEST_CHECKSUMS", manifest_checksums);
+									  "MANIFEST_CHECKSUMS", manifest_checksums);
 	}
 
 	if (verbose)
diff --git a/src/bin/pg_basebackup/streamutil.c b/src/bin/pg_basebackup/streamutil.c
index 2a3e0c688f..0690393913 100644
--- a/src/bin/pg_basebackup/streamutil.c
+++ b/src/bin/pg_basebackup/streamutil.c
@@ -625,7 +625,7 @@ CreateReplicationSlot(PGconn *conn, const char *slot_name, const char *plugin,
 			/* pg_recvlogical doesn't use an exported snapshot, so suppress */
 			if (use_new_option_syntax)
 				AppendStringCommandOption(query, use_new_option_syntax,
-										   "SNAPSHOT", "nothing");
+										  "SNAPSHOT", "nothing");
 			else
 				AppendPlainCommandOption(query, use_new_option_syntax,
 										 "NOEXPORT_SNAPSHOT");
diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c
index 7fbbe7022e..7fd1d62d02 100644
--- a/src/bin/pg_ctl/pg_ctl.c
+++ b/src/bin/pg_ctl/pg_ctl.c
@@ -1748,7 +1748,7 @@ typedef BOOL (WINAPI * __QueryInformationJobObject) (HANDLE, JOBOBJECTINFOCLASS,
  * achieves the goal of postmaster running in a similar environment as pg_ctl.
  */
 static void
-InheritStdHandles(STARTUPINFO* si)
+InheritStdHandles(STARTUPINFO *si)
 {
 	si->dwFlags |= STARTF_USESTDHANDLES;
 	si->hStdInput = GetStdHandle(STD_INPUT_HANDLE);
@@ -1800,8 +1800,8 @@ CreateRestrictedProcess(char *cmd, PROCESS_INFORMATION *processInfo, bool as_ser
 	si.cb = sizeof(si);
 
 	/*
-	 * Set stdin/stdout/stderr handles to be inherited in the child
-	 * process. That allows postmaster and the processes it starts to perform
+	 * Set stdin/stdout/stderr handles to be inherited in the child process.
+	 * That allows postmaster and the processes it starts to perform
 	 * additional checks to see if running in a service (otherwise they get
 	 * the default console handles - which point to "somewhere").
 	 */
diff --git a/src/bin/pg_dump/common.c b/src/bin/pg_dump/common.c
index ecab0a9e4e..3e091a754e 100644
--- a/src/bin/pg_dump/common.c
+++ b/src/bin/pg_dump/common.c
@@ -59,7 +59,7 @@ typedef struct _catalogIdMapEntry
 	uint32		hashval;		/* hash code for the CatalogId */
 	DumpableObject *dobj;		/* the associated DumpableObject, if any */
 	ExtensionInfo *ext;			/* owning extension, if any */
-} CatalogIdMapEntry;
+}			CatalogIdMapEntry;
 
 #define SH_PREFIX		catalogid
 #define SH_ELEMENT_TYPE	CatalogIdMapEntry
@@ -77,7 +77,7 @@ typedef struct _catalogIdMapEntry
 
 #define CATALOGIDHASH_INITIAL_SIZE	10000
 
-static catalogid_hash *catalogIdHash = NULL;
+static catalogid_hash * catalogIdHash = NULL;
 
 static void flagInhTables(Archive *fout, TableInfo *tbinfo, int numTables,
 						  InhInfo *inhinfo, int numInherits);
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index d17f69333f..2e34516d3e 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -4158,7 +4158,7 @@ initGenerateDataClientSide(PGconn *con)
 	PGresult   *res;
 	int			i;
 	int64		k;
-	char		*copy_statement;
+	char	   *copy_statement;
 
 	/* used to track elapsed time and estimate of the remaining time */
 	pg_time_usec_t start;
diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h
index fef9945ed8..6d609819a7 100644
--- a/src/include/catalog/pg_class.h
+++ b/src/include/catalog/pg_class.h
@@ -198,7 +198,7 @@ DECLARE_INDEX(pg_class_tblspc_relfilenode_index, 3455, ClassTblspcRelfilenodeInd
 	 (relkind) == RELKIND_TOASTVALUE || \
 	 (relkind) == RELKIND_MATVIEW)
 
-extern int errdetail_relkind_not_supported(char relkind);
+extern int	errdetail_relkind_not_supported(char relkind);
 
 #endif							/* EXPOSE_TO_CLIENT_CODE */
 
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 7c657c1241..4cd0004a11 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -670,7 +670,8 @@ extern bool equal(const void *a, const void *b);
  */
 typedef double Selectivity;		/* fraction of tuples a qualifier will pass */
 typedef double Cost;			/* execution cost (in page-access units) */
-typedef double Cardinality;		/* (estimated) number of rows or other integer count */
+typedef double Cardinality;		/* (estimated) number of rows or other integer
+								 * count */
 
 
 /*
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 49123e28a4..8e8efc08ea 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -295,6 +295,7 @@ typedef struct A_Expr
 typedef struct A_Const
 {
 	NodeTag		type;
+
 	/*
 	 * Value nodes are inline for performance.  You can treat 'val' as a node,
 	 * as in IsA(&val, Integer).  'val' is not valid if isnull is true.
@@ -756,7 +757,8 @@ typedef struct DefElem
 	NodeTag		type;
 	char	   *defnamespace;	/* NULL if unqualified name */
 	char	   *defname;
-	Node	   *arg;			/* typically Integer, Float, String, or TypeName */
+	Node	   *arg;			/* typically Integer, Float, String, or
+								 * TypeName */
 	DefElemAction defaction;	/* unspecified action, or SET/ADD/DROP */
 	int			location;		/* token location, or -1 if unknown */
 } DefElem;
@@ -1144,7 +1146,7 @@ typedef struct RangeTblEntry
 	 * Fields valid for ENR RTEs (else NULL/zero):
 	 */
 	char	   *enrname;		/* name of ephemeral named relation */
-	Cardinality	enrtuples;		/* estimated or actual from caller */
+	Cardinality enrtuples;		/* estimated or actual from caller */
 
 	/*
 	 * Fields valid in all RTEs:
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 2a53a6e344..66389185cf 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -334,11 +334,11 @@ struct PlannerInfo
 
 	MemoryContext planner_cxt;	/* context holding PlannerInfo */
 
-	Cardinality	total_table_pages;	/* # of pages in all non-dummy tables of
+	Cardinality total_table_pages;	/* # of pages in all non-dummy tables of
 									 * query */
 
-	Selectivity	tuple_fraction; /* tuple_fraction passed to query_planner */
-	Cardinality	limit_tuples;	/* limit_tuples passed to query_planner */
+	Selectivity tuple_fraction; /* tuple_fraction passed to query_planner */
+	Cardinality limit_tuples;	/* limit_tuples passed to query_planner */
 
 	Index		qual_security_level;	/* minimum security_level for quals */
 	/* Note: qual_security_level is zero if there are no securityQuals */
@@ -681,7 +681,7 @@ typedef struct RelOptInfo
 	Relids		relids;			/* set of base relids (rangetable indexes) */
 
 	/* size estimates generated by planner */
-	Cardinality	rows;			/* estimated number of result tuples */
+	Cardinality rows;			/* estimated number of result tuples */
 
 	/* per-relation planner control flags */
 	bool		consider_startup;	/* keep cheap-startup-cost paths? */
@@ -716,9 +716,10 @@ typedef struct RelOptInfo
 	List	   *lateral_vars;	/* LATERAL Vars and PHVs referenced by rel */
 	Relids		lateral_referencers;	/* rels that reference me laterally */
 	List	   *indexlist;		/* list of IndexOptInfo */
+	List	   *partIndexlist;	/* list of IndexOptInfo */
 	List	   *statlist;		/* list of StatisticExtInfo */
 	BlockNumber pages;			/* size estimates derived from pg_class */
-	Cardinality	tuples;
+	Cardinality tuples;
 	double		allvisfrac;
 	Bitmapset  *eclass_indexes; /* Indexes in PlannerInfo's eq_classes list of
 								 * ECs that mention this rel */
@@ -841,7 +842,7 @@ struct IndexOptInfo
 
 	/* index-size statistics (from pg_class and elsewhere) */
 	BlockNumber pages;			/* number of disk pages in index */
-	Cardinality	tuples;			/* number of index tuples in index */
+	Cardinality tuples;			/* number of index tuples in index */
 	int			tree_height;	/* index tree height, or -1 if unknown */
 
 	/* index descriptor information */
@@ -1139,7 +1140,7 @@ typedef struct ParamPathInfo
 	NodeTag		type;
 
 	Relids		ppi_req_outer;	/* rels supplying parameters used by path */
-	Cardinality	ppi_rows;		/* estimated number of result tuples */
+	Cardinality ppi_rows;		/* estimated number of result tuples */
 	List	   *ppi_clauses;	/* join clauses available from outer rels */
 } ParamPathInfo;
 
@@ -1189,7 +1190,7 @@ typedef struct Path
 	int			parallel_workers;	/* desired # of workers; 0 = not parallel */
 
 	/* estimated size/costs for path (see costsize.c for more info) */
-	Cardinality	rows;			/* estimated number of result tuples */
+	Cardinality rows;			/* estimated number of result tuples */
 	Cost		startup_cost;	/* cost expended before fetching any tuples */
 	Cost		total_cost;		/* total cost (assuming all tuples fetched) */
 
@@ -1452,7 +1453,7 @@ typedef struct AppendPath
 	List	   *subpaths;		/* list of component Paths */
 	/* Index of first partial path in subpaths; list_length(subpaths) if none */
 	int			first_partial_path;
-	Cardinality	limit_tuples;	/* hard limit on output tuples, or -1 */
+	Cardinality limit_tuples;	/* hard limit on output tuples, or -1 */
 } AppendPath;
 
 #define IS_DUMMY_APPEND(p) \
@@ -1474,7 +1475,7 @@ typedef struct MergeAppendPath
 {
 	Path		path;
 	List	   *subpaths;		/* list of component Paths */
-	Cardinality	limit_tuples;	/* hard limit on output tuples, or -1 */
+	Cardinality limit_tuples;	/* hard limit on output tuples, or -1 */
 } MergeAppendPath;
 
 /*
@@ -1515,7 +1516,7 @@ typedef struct MemoizePath
 	List	   *param_exprs;	/* cache keys */
 	bool		singlerow;		/* true if the cache entry is to be marked as
 								 * complete after caching the first record. */
-	Cardinality	calls;			/* expected number of rescans */
+	Cardinality calls;			/* expected number of rescans */
 	uint32		est_entries;	/* The maximum number of entries that the
 								 * planner expects will fit in the cache, or 0
 								 * if unknown */
@@ -1667,7 +1668,7 @@ typedef struct HashPath
 	JoinPath	jpath;
 	List	   *path_hashclauses;	/* join clauses used for hashing */
 	int			num_batches;	/* number of batches expected */
-	Cardinality	inner_rows_total;	/* total inner rows expected */
+	Cardinality inner_rows_total;	/* total inner rows expected */
 } HashPath;
 
 /*
@@ -1770,7 +1771,7 @@ typedef struct AggPath
 	Path	   *subpath;		/* path representing input source */
 	AggStrategy aggstrategy;	/* basic strategy, see nodes.h */
 	AggSplit	aggsplit;		/* agg-splitting mode, see nodes.h */
-	Cardinality	numGroups;		/* estimated number of groups in input */
+	Cardinality numGroups;		/* estimated number of groups in input */
 	uint64		transitionSpace;	/* for pass-by-ref transition data */
 	List	   *groupClause;	/* a list of SortGroupClause's */
 	List	   *qual;			/* quals (HAVING quals), if any */
@@ -1784,7 +1785,7 @@ typedef struct GroupingSetData
 {
 	NodeTag		type;
 	List	   *set;			/* grouping set as list of sortgrouprefs */
-	Cardinality	numGroups;		/* est. number of result groups */
+	Cardinality numGroups;		/* est. number of result groups */
 } GroupingSetData;
 
 typedef struct RollupData
@@ -1793,7 +1794,7 @@ typedef struct RollupData
 	List	   *groupClause;	/* applicable subset of parse->groupClause */
 	List	   *gsets;			/* lists of integer indexes into groupClause */
 	List	   *gsets_data;		/* list of GroupingSetData */
-	Cardinality	numGroups;		/* est. number of result groups */
+	Cardinality numGroups;		/* est. number of result groups */
 	bool		hashable;		/* can be hashed */
 	bool		is_hashed;		/* to be implemented as a hashagg */
 } RollupData;
@@ -1844,7 +1845,7 @@ typedef struct SetOpPath
 	List	   *distinctList;	/* SortGroupClauses identifying target cols */
 	AttrNumber	flagColIdx;		/* where is the flag column, if any */
 	int			firstFlag;		/* flag value for first input relation */
-	Cardinality	numGroups;		/* estimated number of groups in input */
+	Cardinality numGroups;		/* estimated number of groups in input */
 } SetOpPath;
 
 /*
@@ -1857,7 +1858,7 @@ typedef struct RecursiveUnionPath
 	Path	   *rightpath;
 	List	   *distinctList;	/* SortGroupClauses identifying target cols */
 	int			wtParam;		/* ID of Param representing work table */
-	Cardinality	numGroups;		/* estimated number of groups in input */
+	Cardinality numGroups;		/* estimated number of groups in input */
 } RecursiveUnionPath;
 
 /*
@@ -2612,7 +2613,7 @@ typedef struct
 typedef struct
 {
 	bool		limit_needed;
-	Cardinality	limit_tuples;
+	Cardinality limit_tuples;
 	int64		count_est;
 	int64		offset_est;
 } FinalPathExtraData;
@@ -2643,15 +2644,15 @@ typedef struct JoinCostWorkspace
 	Cost		inner_rescan_run_cost;
 
 	/* private for cost_mergejoin code */
-	Cardinality	outer_rows;
-	Cardinality	inner_rows;
-	Cardinality	outer_skip_rows;
-	Cardinality	inner_skip_rows;
+	Cardinality outer_rows;
+	Cardinality inner_rows;
+	Cardinality outer_skip_rows;
+	Cardinality inner_skip_rows;
 
 	/* private for cost_hashjoin code */
 	int			numbuckets;
 	int			numbatches;
-	Cardinality	inner_rows_total;
+	Cardinality inner_rows_total;
 } JoinCostWorkspace;
 
 /*
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 01a246d50e..f0e15e4907 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -120,7 +120,7 @@ typedef struct Plan
 	/*
 	 * planner's estimate of result size of this plan step
 	 */
-	Cardinality	plan_rows;		/* number of rows plan is expected to emit */
+	Cardinality plan_rows;		/* number of rows plan is expected to emit */
 	int			plan_width;		/* average row width in bytes */
 
 	/*
@@ -976,7 +976,7 @@ typedef struct Hash
 	AttrNumber	skewColumn;		/* outer join key's column #, or zero */
 	bool		skewInherit;	/* is outer join rel an inheritance tree? */
 	/* all other info is in the parent HashJoin node */
-	Cardinality	rows_total;		/* estimate total rows if parallel_aware */
+	Cardinality rows_total;		/* estimate total rows if parallel_aware */
 } Hash;
 
 /* ----------------
diff --git a/src/include/nodes/value.h b/src/include/nodes/value.h
index 8b71b510eb..67b5f66b58 100644
--- a/src/include/nodes/value.h
+++ b/src/include/nodes/value.h
@@ -29,7 +29,7 @@ typedef struct Integer
 {
 	NodeTag		type;
 	int			val;
-} Integer;
+}			Integer;
 
 /*
  * Float is internally represented as string.  Using T_Float as the node type
@@ -46,27 +46,27 @@ typedef struct Float
 {
 	NodeTag		type;
 	char	   *val;
-} Float;
+}			Float;
 
 typedef struct String
 {
 	NodeTag		type;
 	char	   *val;
-} String;
+}			String;
 
 typedef struct BitString
 {
 	NodeTag		type;
 	char	   *val;
-} BitString;
+}			BitString;
 
 #define intVal(v)		(castNode(Integer, v)->val)
 #define floatVal(v)		atof(castNode(Float, v)->val)
 #define strVal(v)		(castNode(String, v)->val)
 
-extern Integer *makeInteger(int i);
-extern Float *makeFloat(char *numericStr);
-extern String *makeString(char *str);
-extern BitString *makeBitString(char *str);
+extern Integer * makeInteger(int i);
+extern Float * makeFloat(char *numericStr);
+extern String * makeString(char *str);
+extern BitString * makeBitString(char *str);
 
 #endif							/* VALUE_H */
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index bcd3588ea2..17785f7e57 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -186,8 +186,8 @@ typedef struct PgStat_TableXactStatus
 	PgStat_Counter tuples_inserted; /* tuples inserted in (sub)xact */
 	PgStat_Counter tuples_updated;	/* tuples updated in (sub)xact */
 	PgStat_Counter tuples_deleted;	/* tuples deleted in (sub)xact */
-	bool		truncdropped;		/* relation truncated/dropped in this
-									 * (sub)xact */
+	bool		truncdropped;	/* relation truncated/dropped in this
+								 * (sub)xact */
 	/* tuples i/u/d prior to truncate/drop */
 	PgStat_Counter inserted_pre_truncdrop;
 	PgStat_Counter updated_pre_truncdrop;
@@ -477,7 +477,7 @@ typedef struct PgStat_MsgCheckpointer
 	PgStat_Counter m_buf_fsync_backend;
 	PgStat_Counter m_checkpoint_write_time; /* times in milliseconds */
 	PgStat_Counter m_checkpoint_sync_time;
-} PgStat_MsgCheckpointer;
+}			PgStat_MsgCheckpointer;
 
 /* ----------
  * PgStat_MsgWal			Sent by backends and background processes to update WAL statistics.
@@ -853,7 +853,7 @@ typedef struct PgStat_BgWriterStats
 	PgStat_Counter maxwritten_clean;
 	PgStat_Counter buf_alloc;
 	TimestampTz stat_reset_timestamp;
-} PgStat_BgWriterStats;
+}			PgStat_BgWriterStats;
 
 /*
  * Checkpointer statistics kept in the stats collector
@@ -868,7 +868,7 @@ typedef struct PgStat_CheckpointerStats
 	PgStat_Counter buf_written_checkpoints;
 	PgStat_Counter buf_written_backend;
 	PgStat_Counter buf_fsync_backend;
-} PgStat_CheckpointerStats;
+}			PgStat_CheckpointerStats;
 
 /*
  * Global statistics kept in the stats collector
@@ -1130,8 +1130,8 @@ extern PgStat_StatDBEntry *pgstat_fetch_stat_dbentry(Oid dbid);
 extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid);
 extern PgStat_StatFuncEntry *pgstat_fetch_stat_funcentry(Oid funcid);
 extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void);
-extern PgStat_BgWriterStats *pgstat_fetch_stat_bgwriter(void);
-extern PgStat_CheckpointerStats *pgstat_fetch_stat_checkpointer(void);
+extern PgStat_BgWriterStats * pgstat_fetch_stat_bgwriter(void);
+extern PgStat_CheckpointerStats * pgstat_fetch_stat_checkpointer(void);
 extern PgStat_GlobalStats *pgstat_fetch_global(void);
 extern PgStat_WalStats *pgstat_fetch_stat_wal(void);
 extern PgStat_SLRUStats *pgstat_fetch_slru(void);
diff --git a/src/include/postmaster/startup.h b/src/include/postmaster/startup.h
index 2fb208bdb5..db7fb0db7f 100644
--- a/src/include/postmaster/startup.h
+++ b/src/include/postmaster/startup.h
@@ -23,7 +23,7 @@
 			ereport(LOG, errmsg(msg, secs, (usecs / 10000),  __VA_ARGS__ )); \
 	} while(0)
 
-extern int  log_startup_progress_interval;
+extern int	log_startup_progress_interval;
 
 extern void HandleStartupProcInterrupts(void);
 extern void StartupProcessMain(void) pg_attribute_noreturn();
diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out
index 27f7525b3e..01d6b26759 100644
--- a/src/test/regress/expected/partition_join.out
+++ b/src/test/regress/expected/partition_join.out
@@ -459,6 +459,19 @@ SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL
  550 |     | 
 (12 rows)
 
+-- join pruning
+CREATE UNIQUE INDEX prt1_p1_a_idx ON prt1 (a);
+EXPLAIN (COSTS OFF)
+SELECT t1.a FROM prt1 t1 LEFT JOIN prt1 USING (a);
+           QUERY PLAN           
+--------------------------------
+ Append
+   ->  Seq Scan on prt1_p1 t1_1
+   ->  Seq Scan on prt1_p2 t1_2
+   ->  Seq Scan on prt1_p3 t1_3
+(4 rows)
+
+DROP INDEX prt1_p1_a_idx;
 -- bug with inadequate sort key representation
 SET enable_partitionwise_aggregate TO true;
 SET enable_hashjoin TO false;
diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql
index d97b5b69ff..e39c05231a 100644
--- a/src/test/regress/sql/partition_join.sql
+++ b/src/test/regress/sql/partition_join.sql
@@ -90,6 +90,13 @@ SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL
 SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL
 			  (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.a) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss
 			  ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a;
+-- join pruning
+CREATE UNIQUE INDEX prt1_p1_a_idx ON prt1 (a);
+
+EXPLAIN (COSTS OFF)
+SELECT t1.a FROM prt1 t1 LEFT JOIN prt1 USING (a);
+
+DROP INDEX prt1_p1_a_idx;
 
 -- bug with inadequate sort key representation
 SET enable_partitionwise_aggregate TO true;

Julien Rouhaud

rjuju123@gmail.com

almost 4 years ago

In reply to: Arne Roland (#2)

Re: missing indexes in indexlist with partitioned tables

Hi,

On Thu, Oct 28, 2021 at 01:44:31PM +0000, Arne Roland wrote:

The attached patch takes that route. I'd appreciate feedback!

The cfbot reports that the patch doesn't apply anymore:

http://cfbot.cputube.org/patch_36_3452.log
=== Applying patches on top of PostgreSQL commit ID 025b920a3d45fed441a0a58fdcdf05b321b1eead ===
=== applying patch ./partIndexlistClean.patch
patching file src/backend/access/heap/vacuumlazy.c
Hunk #1 FAILED at 2375.
1 out of 1 hunk FAILED -- saving rejects to file src/backend/access/heap/vacuumlazy.c.rej
patching file src/backend/access/transam/xlog.c
Hunk #1 succeeded at 911 with fuzz 1 (offset 5 lines).
Hunk #2 FAILED at 5753.
[...]
1 out of 6 hunks FAILED -- saving rejects to file src/backend/access/transam/xlog.c.rej
[...]
patching file src/backend/commands/publicationcmds.c
Hunk #1 FAILED at 813.
1 out of 1 hunk FAILED -- saving rejects to file src/backend/commands/publicationcmds.c.rej
patching file src/include/nodes/pathnodes.h
Hunk #9 FAILED at 1516.
[...]
1 out of 17 hunks FAILED -- saving rejects to file src/include/nodes/pathnodes.h.rej

Could you send a rebased version? In the meantime I will switch the cf entry
to Waiting on Author.

A.Roland@index.de

almost 4 years ago

In reply to: Julien Rouhaud (#3)

1 attachment(s)

Re: missing indexes in indexlist with partitioned tables

Hi,

thank you for the heads up! Those files ended up accidentally in the dump from me running pg_indent. The file count was truly excessive, I should have noticed this sooner.
I attached the patch without the excessive modifications. That should be way easier to read.

Regards
Arne

Attachments:

0002-partIndexlistClean.patchtext/x-patch; name=0002-partIndexlistClean.patchDownload

diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 0ef70ad7f1..fe50919637 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -3504,7 +3504,7 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
 	Assert(list_length(exprlist) == list_length(oprlist));
 
 	/* Short-circuit if no indexes... */
-	if (rel->indexlist == NIL)
+	if (rel->indexlist == NIL && rel->partIndexlist == NIL)
 		return false;
 
 	/*
@@ -3549,7 +3549,7 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
 		return false;
 
 	/* Examine each index of the relation ... */
-	foreach(ic, rel->indexlist)
+	foreach(ic, list_concat(rel->indexlist, rel->partIndexlist))
 	{
 		IndexOptInfo *ind = (IndexOptInfo *) lfirst(ic);
 		int			c;
diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index 337f470d58..b04b3f88ad 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -23,6 +23,7 @@
 #include "postgres.h"
 
 #include "nodes/nodeFuncs.h"
+#include "nodes/nodes.h"
 #include "optimizer/clauses.h"
 #include "optimizer/joininfo.h"
 #include "optimizer/optimizer.h"
@@ -598,7 +599,7 @@ rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel)
 		 */
 		ListCell   *lc;
 
-		foreach(lc, rel->indexlist)
+		foreach(lc, list_concat(rel->indexlist, rel->partIndexlist))
 		{
 			IndexOptInfo *ind = (IndexOptInfo *) lfirst(lc);
 
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 535fa041ad..8035616c35 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -117,6 +117,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	Relation	relation;
 	bool		hasindex;
 	List	   *indexinfos = NIL;
+	List	   *partIndexinfos = NIL;
 
 	/*
 	 * We need not lock the relation since it was already locked, either by
@@ -163,7 +164,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	else
 		hasindex = relation->rd_rel->relhasindex;
 
-	if (hasindex)
+	if (!(IgnoreSystemIndexes && !IsSystemRelation(relation)) && relation->rd_rel->relhasindex)
 	{
 		List	   *indexoidlist;
 		LOCKMODE	lmode;
@@ -212,10 +213,12 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			}
 
 			/*
-			 * Ignore partitioned indexes, since they are not usable for
-			 * queries.
+			 * Don't add partitioned indexes to the indexlist, since they are
+			 * not usable by the executor. If they are unique add them to the
+			 * partindexlist instead, to use for further pruning. If they
+			 * aren't that either, simply skip them.
 			 */
-			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+			if (inhparent && (!index->indisunique || indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX))
 			{
 				index_close(indexRelation, NoLock);
 				continue;
@@ -263,7 +266,40 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				info->indexcollations[i] = indexRelation->rd_indcollation[i];
 			}
 
-			info->relam = indexRelation->rd_rel->relam;
+			/*
+			 * Fetch the index expressions and predicate, if any.  We must
+			 * modify the copies we obtain from the relcache to have the
+			 * correct varno for the parent relation, so that they match up
+			 * correctly against qual clauses.
+			 */
+			info->indexprs = RelationGetIndexExpressions(indexRelation);
+			info->indpred = RelationGetIndexPredicate(indexRelation);
+			if (info->indexprs && varno != 1)
+				ChangeVarNodes((Node *) info->indexprs, 1, varno, 0);
+			if (info->indpred && varno != 1)
+				ChangeVarNodes((Node *) info->indpred, 1, varno, 0);
+
+			info->unique = index->indisunique;
+			info->immediate = index->indimmediate;
+
+			/*
+			 * Don't add partitioned indexes to the indexlist, add them to the
+			 * partindexlist instead, since they are not usable by the
+			 * executor.
+			 */
+			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+			{
+				index_close(indexRelation, NoLock);
+				partIndexinfos = lappend(partIndexinfos, info);
+				continue;
+			}
+
+			info->hypothetical = false;
+			info->indrestrictinfo = NIL;	/* set later, in indxpath.c */
+			info->predOK = false;	/* set later, in indxpath.c */
+
+			/* Build targetlist using the completed indexprs data */
+			info->indextlist = build_index_tlist(root, info, relation);
 
 			/* We copy just the fields we need, not all of rd_indam */
 			amroutine = indexRelation->rd_indam;
@@ -283,6 +319,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			/* Fetch index opclass options */
 			info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
 
+			info->relam = indexRelation->rd_rel->relam;
+
 			/*
 			 * Fetch the ordering information for the index, if any.
 			 */
@@ -369,28 +407,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				info->nulls_first = NULL;
 			}
 
-			/*
-			 * Fetch the index expressions and predicate, if any.  We must
-			 * modify the copies we obtain from the relcache to have the
-			 * correct varno for the parent relation, so that they match up
-			 * correctly against qual clauses.
-			 */
-			info->indexprs = RelationGetIndexExpressions(indexRelation);
-			info->indpred = RelationGetIndexPredicate(indexRelation);
-			if (info->indexprs && varno != 1)
-				ChangeVarNodes((Node *) info->indexprs, 1, varno, 0);
-			if (info->indpred && varno != 1)
-				ChangeVarNodes((Node *) info->indpred, 1, varno, 0);
-
-			/* Build targetlist using the completed indexprs data */
-			info->indextlist = build_index_tlist(root, info, relation);
-
-			info->indrestrictinfo = NIL;	/* set later, in indxpath.c */
-			info->predOK = false;	/* set later, in indxpath.c */
-			info->unique = index->indisunique;
-			info->immediate = index->indimmediate;
-			info->hypothetical = false;
-
 			/*
 			 * Estimate the index size.  If it's not a partial index, we lock
 			 * the number-of-tuples estimate to equal the parent table; if it
@@ -440,6 +456,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	}
 
 	rel->indexlist = indexinfos;
+	rel->partIndexlist = partIndexinfos;
 
 	rel->statlist = get_relation_statistics(rel, relation);
 
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 1f33fe13c1..f4d8bd9e4a 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -334,11 +334,11 @@ struct PlannerInfo
 
 	MemoryContext planner_cxt;	/* context holding PlannerInfo */
 
-	Cardinality	total_table_pages;	/* # of pages in all non-dummy tables of
+	Cardinality total_table_pages;	/* # of pages in all non-dummy tables of
 									 * query */
 
-	Selectivity	tuple_fraction; /* tuple_fraction passed to query_planner */
-	Cardinality	limit_tuples;	/* limit_tuples passed to query_planner */
+	Selectivity tuple_fraction; /* tuple_fraction passed to query_planner */
+	Cardinality limit_tuples;	/* limit_tuples passed to query_planner */
 
 	Index		qual_security_level;	/* minimum security_level for quals */
 	/* Note: qual_security_level is zero if there are no securityQuals */
@@ -681,7 +681,7 @@ typedef struct RelOptInfo
 	Relids		relids;			/* set of base relids (rangetable indexes) */
 
 	/* size estimates generated by planner */
-	Cardinality	rows;			/* estimated number of result tuples */
+	Cardinality rows;			/* estimated number of result tuples */
 
 	/* per-relation planner control flags */
 	bool		consider_startup;	/* keep cheap-startup-cost paths? */
@@ -716,9 +716,10 @@ typedef struct RelOptInfo
 	List	   *lateral_vars;	/* LATERAL Vars and PHVs referenced by rel */
 	Relids		lateral_referencers;	/* rels that reference me laterally */
 	List	   *indexlist;		/* list of IndexOptInfo */
+	List	   *partIndexlist;	/* list of IndexOptInfo */
 	List	   *statlist;		/* list of StatisticExtInfo */
 	BlockNumber pages;			/* size estimates derived from pg_class */
-	Cardinality	tuples;
+	Cardinality tuples;
 	double		allvisfrac;
 	Bitmapset  *eclass_indexes; /* Indexes in PlannerInfo's eq_classes list of
 								 * ECs that mention this rel */
@@ -841,7 +842,7 @@ struct IndexOptInfo
 
 	/* index-size statistics (from pg_class and elsewhere) */
 	BlockNumber pages;			/* number of disk pages in index */
-	Cardinality	tuples;			/* number of index tuples in index */
+	Cardinality tuples;			/* number of index tuples in index */
 	int			tree_height;	/* index tree height, or -1 if unknown */
 
 	/* index descriptor information */
@@ -1139,7 +1140,7 @@ typedef struct ParamPathInfo
 	NodeTag		type;
 
 	Relids		ppi_req_outer;	/* rels supplying parameters used by path */
-	Cardinality	ppi_rows;		/* estimated number of result tuples */
+	Cardinality ppi_rows;		/* estimated number of result tuples */
 	List	   *ppi_clauses;	/* join clauses available from outer rels */
 } ParamPathInfo;
 
@@ -1189,7 +1190,7 @@ typedef struct Path
 	int			parallel_workers;	/* desired # of workers; 0 = not parallel */
 
 	/* estimated size/costs for path (see costsize.c for more info) */
-	Cardinality	rows;			/* estimated number of result tuples */
+	Cardinality rows;			/* estimated number of result tuples */
 	Cost		startup_cost;	/* cost expended before fetching any tuples */
 	Cost		total_cost;		/* total cost (assuming all tuples fetched) */
 
@@ -1452,7 +1453,7 @@ typedef struct AppendPath
 	List	   *subpaths;		/* list of component Paths */
 	/* Index of first partial path in subpaths; list_length(subpaths) if none */
 	int			first_partial_path;
-	Cardinality	limit_tuples;	/* hard limit on output tuples, or -1 */
+	Cardinality limit_tuples;	/* hard limit on output tuples, or -1 */
 } AppendPath;
 
 #define IS_DUMMY_APPEND(p) \
@@ -1474,7 +1475,7 @@ typedef struct MergeAppendPath
 {
 	Path		path;
 	List	   *subpaths;		/* list of component Paths */
-	Cardinality	limit_tuples;	/* hard limit on output tuples, or -1 */
+	Cardinality limit_tuples;	/* hard limit on output tuples, or -1 */
 } MergeAppendPath;
 
 /*
@@ -1772,7 +1773,7 @@ typedef struct AggPath
 	Path	   *subpath;		/* path representing input source */
 	AggStrategy aggstrategy;	/* basic strategy, see nodes.h */
 	AggSplit	aggsplit;		/* agg-splitting mode, see nodes.h */
-	Cardinality	numGroups;		/* estimated number of groups in input */
+	Cardinality numGroups;		/* estimated number of groups in input */
 	uint64		transitionSpace;	/* for pass-by-ref transition data */
 	List	   *groupClause;	/* a list of SortGroupClause's */
 	List	   *qual;			/* quals (HAVING quals), if any */
@@ -1786,7 +1787,7 @@ typedef struct GroupingSetData
 {
 	NodeTag		type;
 	List	   *set;			/* grouping set as list of sortgrouprefs */
-	Cardinality	numGroups;		/* est. number of result groups */
+	Cardinality numGroups;		/* est. number of result groups */
 } GroupingSetData;
 
 typedef struct RollupData
@@ -1795,7 +1796,7 @@ typedef struct RollupData
 	List	   *groupClause;	/* applicable subset of parse->groupClause */
 	List	   *gsets;			/* lists of integer indexes into groupClause */
 	List	   *gsets_data;		/* list of GroupingSetData */
-	Cardinality	numGroups;		/* est. number of result groups */
+	Cardinality numGroups;		/* est. number of result groups */
 	bool		hashable;		/* can be hashed */
 	bool		is_hashed;		/* to be implemented as a hashagg */
 } RollupData;
@@ -1846,7 +1847,7 @@ typedef struct SetOpPath
 	List	   *distinctList;	/* SortGroupClauses identifying target cols */
 	AttrNumber	flagColIdx;		/* where is the flag column, if any */
 	int			firstFlag;		/* flag value for first input relation */
-	Cardinality	numGroups;		/* estimated number of groups in input */
+	Cardinality numGroups;		/* estimated number of groups in input */
 } SetOpPath;
 
 /*
@@ -1859,7 +1860,7 @@ typedef struct RecursiveUnionPath
 	Path	   *rightpath;
 	List	   *distinctList;	/* SortGroupClauses identifying target cols */
 	int			wtParam;		/* ID of Param representing work table */
-	Cardinality	numGroups;		/* estimated number of groups in input */
+	Cardinality numGroups;		/* estimated number of groups in input */
 } RecursiveUnionPath;
 
 /*
@@ -2615,7 +2616,7 @@ typedef struct
 typedef struct
 {
 	bool		limit_needed;
-	Cardinality	limit_tuples;
+	Cardinality limit_tuples;
 	int64		count_est;
 	int64		offset_est;
 } FinalPathExtraData;
@@ -2646,15 +2647,15 @@ typedef struct JoinCostWorkspace
 	Cost		inner_rescan_run_cost;
 
 	/* private for cost_mergejoin code */
-	Cardinality	outer_rows;
-	Cardinality	inner_rows;
-	Cardinality	outer_skip_rows;
-	Cardinality	inner_skip_rows;
+	Cardinality outer_rows;
+	Cardinality inner_rows;
+	Cardinality outer_skip_rows;
+	Cardinality inner_skip_rows;
 
 	/* private for cost_hashjoin code */
 	int			numbuckets;
 	int			numbatches;
-	Cardinality	inner_rows_total;
+	Cardinality inner_rows_total;
 } JoinCostWorkspace;
 
 /*
diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out
index bb5b7c47a4..1cc7f808cd 100644
--- a/src/test/regress/expected/partition_join.out
+++ b/src/test/regress/expected/partition_join.out
@@ -459,6 +459,19 @@ SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL
  550 |     | 
 (12 rows)
 
+-- join pruning
+CREATE UNIQUE INDEX prt1_p1_a_idx ON prt1 (a);
+EXPLAIN (COSTS OFF)
+SELECT t1.a FROM prt1 t1 LEFT JOIN prt1 USING (a);
+           QUERY PLAN           
+--------------------------------
+ Append
+   ->  Seq Scan on prt1_p1 t1_1
+   ->  Seq Scan on prt1_p2 t1_2
+   ->  Seq Scan on prt1_p3 t1_3
+(4 rows)
+
+DROP INDEX prt1_p1_a_idx;
 -- bug with inadequate sort key representation
 SET enable_partitionwise_aggregate TO true;
 SET enable_hashjoin TO false;
diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql
index 67f506361f..5f70d5d851 100644
--- a/src/test/regress/sql/partition_join.sql
+++ b/src/test/regress/sql/partition_join.sql
@@ -90,6 +90,13 @@ SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL
 SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL
 			  (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.a) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss
 			  ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a;
+-- join pruning
+CREATE UNIQUE INDEX prt1_p1_a_idx ON prt1 (a);
+
+EXPLAIN (COSTS OFF)
+SELECT t1.a FROM prt1 t1 LEFT JOIN prt1 USING (a);
+
+DROP INDEX prt1_p1_a_idx;
 
 -- bug with inadequate sort key representation
 SET enable_partitionwise_aggregate TO true;

A.Roland@index.de

almost 4 years ago

In reply to: Arne Roland (#4)

1 attachment(s)

Re: missing indexes in indexlist with partitioned tables

Using some valuable feedback from Zhihong Yu, I fixed a flipped negation error and updated the comments.

Regards
Arne

________________________________
From: Arne Roland
Sent: Monday, January 17, 2022 12:25
To: Julien Rouhaud
Cc: pgsql-hackers
Subject: Re: missing indexes in indexlist with partitioned tables

Hi,

Regards
Arne

Attachments:

0003-partIndexlistClean.patchtext/x-patch; name=0003-partIndexlistClean.patchDownload

diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 0ef70ad7f1..fe50919637 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -3504,7 +3504,7 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
 	Assert(list_length(exprlist) == list_length(oprlist));
 
 	/* Short-circuit if no indexes... */
-	if (rel->indexlist == NIL)
+	if (rel->indexlist == NIL && rel->partIndexlist == NIL)
 		return false;
 
 	/*
@@ -3549,7 +3549,7 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
 		return false;
 
 	/* Examine each index of the relation ... */
-	foreach(ic, rel->indexlist)
+	foreach(ic, list_concat(rel->indexlist, rel->partIndexlist))
 	{
 		IndexOptInfo *ind = (IndexOptInfo *) lfirst(ic);
 		int			c;
diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index 337f470d58..b04b3f88ad 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -23,6 +23,7 @@
 #include "postgres.h"
 
 #include "nodes/nodeFuncs.h"
+#include "nodes/nodes.h"
 #include "optimizer/clauses.h"
 #include "optimizer/joininfo.h"
 #include "optimizer/optimizer.h"
@@ -598,7 +599,7 @@ rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel)
 		 */
 		ListCell   *lc;
 
-		foreach(lc, rel->indexlist)
+		foreach(lc, list_concat(rel->indexlist, rel->partIndexlist))
 		{
 			IndexOptInfo *ind = (IndexOptInfo *) lfirst(lc);
 
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 535fa041ad..5707d48dbd 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -115,8 +115,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 {
 	Index		varno = rel->relid;
 	Relation	relation;
-	bool		hasindex;
 	List	   *indexinfos = NIL;
+	List	   *partIndexinfos = NIL;
 
 	/*
 	 * We need not lock the relation since it was already locked, either by
@@ -153,17 +153,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	/* Retrieve the parallel_workers reloption, or -1 if not set. */
 	rel->rel_parallel_workers = RelationGetParallelWorkers(relation, -1);
 
-	/*
-	 * Make list of indexes.  Ignore indexes on system catalogs if told to.
-	 * Don't bother with indexes for an inheritance parent, either.
-	 */
-	if (inhparent ||
-		(IgnoreSystemIndexes && IsSystemRelation(relation)))
-		hasindex = false;
-	else
-		hasindex = relation->rd_rel->relhasindex;
-
-	if (hasindex)
+	/* Make list of indexes. Ignore indexes on system catalogs if told to. */
+	if (!(IgnoreSystemIndexes && IsSystemRelation(relation)) && relation->rd_rel->relhasindex)
 	{
 		List	   *indexoidlist;
 		LOCKMODE	lmode;
@@ -212,10 +203,12 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			}
 
 			/*
-			 * Ignore partitioned indexes, since they are not usable for
-			 * queries.
+			 * Don't add partitioned indexes to the indexlist, since they are
+			 * not usable by the executor. If they are unique add them to the
+			 * partindexlist instead, to use for further pruning. If they
+			 * aren't that either, simply skip them.
 			 */
-			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+			if (inhparent && (!index->indisunique || indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX))
 			{
 				index_close(indexRelation, NoLock);
 				continue;
@@ -263,7 +256,40 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				info->indexcollations[i] = indexRelation->rd_indcollation[i];
 			}
 
-			info->relam = indexRelation->rd_rel->relam;
+			/*
+			 * Fetch the index expressions and predicate, if any.  We must
+			 * modify the copies we obtain from the relcache to have the
+			 * correct varno for the parent relation, so that they match up
+			 * correctly against qual clauses.
+			 */
+			info->indexprs = RelationGetIndexExpressions(indexRelation);
+			info->indpred = RelationGetIndexPredicate(indexRelation);
+			if (info->indexprs && varno != 1)
+				ChangeVarNodes((Node *) info->indexprs, 1, varno, 0);
+			if (info->indpred && varno != 1)
+				ChangeVarNodes((Node *) info->indpred, 1, varno, 0);
+
+			info->unique = index->indisunique;
+			info->immediate = index->indimmediate;
+
+			/*
+			 * Don't add partitioned indexes to the indexlist, add them to the
+			 * partindexlist instead, since they are not usable by the
+			 * executor.
+			 */
+			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+			{
+				index_close(indexRelation, NoLock);
+				partIndexinfos = lappend(partIndexinfos, info);
+				continue;
+			}
+
+			info->hypothetical = false;
+			info->indrestrictinfo = NIL;	/* set later, in indxpath.c */
+			info->predOK = false;	/* set later, in indxpath.c */
+
+			/* Build targetlist using the completed indexprs data */
+			info->indextlist = build_index_tlist(root, info, relation);
 
 			/* We copy just the fields we need, not all of rd_indam */
 			amroutine = indexRelation->rd_indam;
@@ -283,6 +309,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			/* Fetch index opclass options */
 			info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
 
+			info->relam = indexRelation->rd_rel->relam;
+
 			/*
 			 * Fetch the ordering information for the index, if any.
 			 */
@@ -369,28 +397,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				info->nulls_first = NULL;
 			}
 
-			/*
-			 * Fetch the index expressions and predicate, if any.  We must
-			 * modify the copies we obtain from the relcache to have the
-			 * correct varno for the parent relation, so that they match up
-			 * correctly against qual clauses.
-			 */
-			info->indexprs = RelationGetIndexExpressions(indexRelation);
-			info->indpred = RelationGetIndexPredicate(indexRelation);
-			if (info->indexprs && varno != 1)
-				ChangeVarNodes((Node *) info->indexprs, 1, varno, 0);
-			if (info->indpred && varno != 1)
-				ChangeVarNodes((Node *) info->indpred, 1, varno, 0);
-
-			/* Build targetlist using the completed indexprs data */
-			info->indextlist = build_index_tlist(root, info, relation);
-
-			info->indrestrictinfo = NIL;	/* set later, in indxpath.c */
-			info->predOK = false;	/* set later, in indxpath.c */
-			info->unique = index->indisunique;
-			info->immediate = index->indimmediate;
-			info->hypothetical = false;
-
 			/*
 			 * Estimate the index size.  If it's not a partial index, we lock
 			 * the number-of-tuples estimate to equal the parent table; if it
@@ -440,6 +446,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	}
 
 	rel->indexlist = indexinfos;
+	rel->partIndexlist = partIndexinfos;
 
 	rel->statlist = get_relation_statistics(rel, relation);
 
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 1f33fe13c1..f4d8bd9e4a 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -334,11 +334,11 @@ struct PlannerInfo
 
 	MemoryContext planner_cxt;	/* context holding PlannerInfo */
 
-	Cardinality	total_table_pages;	/* # of pages in all non-dummy tables of
+	Cardinality total_table_pages;	/* # of pages in all non-dummy tables of
 									 * query */
 
-	Selectivity	tuple_fraction; /* tuple_fraction passed to query_planner */
-	Cardinality	limit_tuples;	/* limit_tuples passed to query_planner */
+	Selectivity tuple_fraction; /* tuple_fraction passed to query_planner */
+	Cardinality limit_tuples;	/* limit_tuples passed to query_planner */
 
 	Index		qual_security_level;	/* minimum security_level for quals */
 	/* Note: qual_security_level is zero if there are no securityQuals */
@@ -681,7 +681,7 @@ typedef struct RelOptInfo
 	Relids		relids;			/* set of base relids (rangetable indexes) */
 
 	/* size estimates generated by planner */
-	Cardinality	rows;			/* estimated number of result tuples */
+	Cardinality rows;			/* estimated number of result tuples */
 
 	/* per-relation planner control flags */
 	bool		consider_startup;	/* keep cheap-startup-cost paths? */
@@ -716,9 +716,10 @@ typedef struct RelOptInfo
 	List	   *lateral_vars;	/* LATERAL Vars and PHVs referenced by rel */
 	Relids		lateral_referencers;	/* rels that reference me laterally */
 	List	   *indexlist;		/* list of IndexOptInfo */
+	List	   *partIndexlist;	/* list of IndexOptInfo */
 	List	   *statlist;		/* list of StatisticExtInfo */
 	BlockNumber pages;			/* size estimates derived from pg_class */
-	Cardinality	tuples;
+	Cardinality tuples;
 	double		allvisfrac;
 	Bitmapset  *eclass_indexes; /* Indexes in PlannerInfo's eq_classes list of
 								 * ECs that mention this rel */
@@ -841,7 +842,7 @@ struct IndexOptInfo
 
 	/* index-size statistics (from pg_class and elsewhere) */
 	BlockNumber pages;			/* number of disk pages in index */
-	Cardinality	tuples;			/* number of index tuples in index */
+	Cardinality tuples;			/* number of index tuples in index */
 	int			tree_height;	/* index tree height, or -1 if unknown */
 
 	/* index descriptor information */
@@ -1139,7 +1140,7 @@ typedef struct ParamPathInfo
 	NodeTag		type;
 
 	Relids		ppi_req_outer;	/* rels supplying parameters used by path */
-	Cardinality	ppi_rows;		/* estimated number of result tuples */
+	Cardinality ppi_rows;		/* estimated number of result tuples */
 	List	   *ppi_clauses;	/* join clauses available from outer rels */
 } ParamPathInfo;
 
@@ -1189,7 +1190,7 @@ typedef struct Path
 	int			parallel_workers;	/* desired # of workers; 0 = not parallel */
 
 	/* estimated size/costs for path (see costsize.c for more info) */
-	Cardinality	rows;			/* estimated number of result tuples */
+	Cardinality rows;			/* estimated number of result tuples */
 	Cost		startup_cost;	/* cost expended before fetching any tuples */
 	Cost		total_cost;		/* total cost (assuming all tuples fetched) */
 
@@ -1452,7 +1453,7 @@ typedef struct AppendPath
 	List	   *subpaths;		/* list of component Paths */
 	/* Index of first partial path in subpaths; list_length(subpaths) if none */
 	int			first_partial_path;
-	Cardinality	limit_tuples;	/* hard limit on output tuples, or -1 */
+	Cardinality limit_tuples;	/* hard limit on output tuples, or -1 */
 } AppendPath;
 
 #define IS_DUMMY_APPEND(p) \
@@ -1474,7 +1475,7 @@ typedef struct MergeAppendPath
 {
 	Path		path;
 	List	   *subpaths;		/* list of component Paths */
-	Cardinality	limit_tuples;	/* hard limit on output tuples, or -1 */
+	Cardinality limit_tuples;	/* hard limit on output tuples, or -1 */
 } MergeAppendPath;
 
 /*
@@ -1772,7 +1773,7 @@ typedef struct AggPath
 	Path	   *subpath;		/* path representing input source */
 	AggStrategy aggstrategy;	/* basic strategy, see nodes.h */
 	AggSplit	aggsplit;		/* agg-splitting mode, see nodes.h */
-	Cardinality	numGroups;		/* estimated number of groups in input */
+	Cardinality numGroups;		/* estimated number of groups in input */
 	uint64		transitionSpace;	/* for pass-by-ref transition data */
 	List	   *groupClause;	/* a list of SortGroupClause's */
 	List	   *qual;			/* quals (HAVING quals), if any */
@@ -1786,7 +1787,7 @@ typedef struct GroupingSetData
 {
 	NodeTag		type;
 	List	   *set;			/* grouping set as list of sortgrouprefs */
-	Cardinality	numGroups;		/* est. number of result groups */
+	Cardinality numGroups;		/* est. number of result groups */
 } GroupingSetData;
 
 typedef struct RollupData
@@ -1795,7 +1796,7 @@ typedef struct RollupData
 	List	   *groupClause;	/* applicable subset of parse->groupClause */
 	List	   *gsets;			/* lists of integer indexes into groupClause */
 	List	   *gsets_data;		/* list of GroupingSetData */
-	Cardinality	numGroups;		/* est. number of result groups */
+	Cardinality numGroups;		/* est. number of result groups */
 	bool		hashable;		/* can be hashed */
 	bool		is_hashed;		/* to be implemented as a hashagg */
 } RollupData;
@@ -1846,7 +1847,7 @@ typedef struct SetOpPath
 	List	   *distinctList;	/* SortGroupClauses identifying target cols */
 	AttrNumber	flagColIdx;		/* where is the flag column, if any */
 	int			firstFlag;		/* flag value for first input relation */
-	Cardinality	numGroups;		/* estimated number of groups in input */
+	Cardinality numGroups;		/* estimated number of groups in input */
 } SetOpPath;
 
 /*
@@ -1859,7 +1860,7 @@ typedef struct RecursiveUnionPath
 	Path	   *rightpath;
 	List	   *distinctList;	/* SortGroupClauses identifying target cols */
 	int			wtParam;		/* ID of Param representing work table */
-	Cardinality	numGroups;		/* estimated number of groups in input */
+	Cardinality numGroups;		/* estimated number of groups in input */
 } RecursiveUnionPath;
 
 /*
@@ -2615,7 +2616,7 @@ typedef struct
 typedef struct
 {
 	bool		limit_needed;
-	Cardinality	limit_tuples;
+	Cardinality limit_tuples;
 	int64		count_est;
 	int64		offset_est;
 } FinalPathExtraData;
@@ -2646,15 +2647,15 @@ typedef struct JoinCostWorkspace
 	Cost		inner_rescan_run_cost;
 
 	/* private for cost_mergejoin code */
-	Cardinality	outer_rows;
-	Cardinality	inner_rows;
-	Cardinality	outer_skip_rows;
-	Cardinality	inner_skip_rows;
+	Cardinality outer_rows;
+	Cardinality inner_rows;
+	Cardinality outer_skip_rows;
+	Cardinality inner_skip_rows;
 
 	/* private for cost_hashjoin code */
 	int			numbuckets;
 	int			numbatches;
-	Cardinality	inner_rows_total;
+	Cardinality inner_rows_total;
 } JoinCostWorkspace;
 
 /*
diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out
index bb5b7c47a4..1cc7f808cd 100644
--- a/src/test/regress/expected/partition_join.out
+++ b/src/test/regress/expected/partition_join.out
@@ -459,6 +459,19 @@ SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL
  550 |     | 
 (12 rows)
 
+-- join pruning
+CREATE UNIQUE INDEX prt1_p1_a_idx ON prt1 (a);
+EXPLAIN (COSTS OFF)
+SELECT t1.a FROM prt1 t1 LEFT JOIN prt1 USING (a);
+           QUERY PLAN           
+--------------------------------
+ Append
+   ->  Seq Scan on prt1_p1 t1_1
+   ->  Seq Scan on prt1_p2 t1_2
+   ->  Seq Scan on prt1_p3 t1_3
+(4 rows)
+
+DROP INDEX prt1_p1_a_idx;
 -- bug with inadequate sort key representation
 SET enable_partitionwise_aggregate TO true;
 SET enable_hashjoin TO false;
diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql
index 67f506361f..5f70d5d851 100644
--- a/src/test/regress/sql/partition_join.sql
+++ b/src/test/regress/sql/partition_join.sql
@@ -90,6 +90,13 @@ SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL
 SELECT t1.a, ss.t2a, ss.t2c FROM prt1 t1 LEFT JOIN LATERAL
 			  (SELECT t2.a AS t2a, t3.a AS t3a, t2.b t2b, t2.c t2c, least(t1.a,t2.a,t3.a) FROM prt1 t2 JOIN prt2 t3 ON (t2.a = t3.b)) ss
 			  ON t1.c = ss.t2c WHERE (t1.b + coalesce(ss.t2b, 0)) = 0 ORDER BY t1.a;
+-- join pruning
+CREATE UNIQUE INDEX prt1_p1_a_idx ON prt1 (a);
+
+EXPLAIN (COSTS OFF)
+SELECT t1.a FROM prt1 t1 LEFT JOIN prt1 USING (a);
+
+DROP INDEX prt1_p1_a_idx;
 
 -- bug with inadequate sort key representation
 SET enable_partitionwise_aggregate TO true;

alvherre@alvh.no-ip.org

almost 4 years ago

In reply to: Arne Roland (#5)

Re: missing indexes in indexlist with partitioned tables

Hmm, can you show cases of queries for which having this new
partIndexlist changes plans?

--
Álvaro Herrera PostgreSQL Developer — https://www.EnterpriseDB.com/
"Saca el libro que tu religión considere como el indicado para encontrar la
oración que traiga paz a tu alma. Luego rebootea el computador
y ve si funciona" (Carlos Duclós)

A.Roland@index.de

almost 4 years ago

In reply to: Alvaro Herrera (#6)

1 attachment(s)

Re: missing indexes in indexlist with partitioned tables

Hi!

Afaiac the join pruning where the outer table is a partitioned table is the relevant case.

I am not sure whether there are other cases.
The join pruning, which works great for plain relations since 9.0, falls short for partitioned tables, since the optimizer fails to prove uniqueness there.

In practical cases inner and outer tables are almost surely different ones, but I reattached a simpler example. It's the one, I came up with back in September.

I've seen this can be a reason to avoid partitioning for the time being, if the application relies on join pruning. I think generic views make it almost necessary to have it. If you had a different answer in mind, please don't hesitate to ask again.

Regards
Arne

________________________________
From: Alvaro Herrera <alvherre@alvh.no-ip.org>
Sent: Monday, January 17, 2022 7:16:08 PM
To: Arne Roland
Cc: Julien Rouhaud; pgsql-hackers
Subject: Re: missing indexes in indexlist with partitioned tables

Hmm, can you show cases of queries for which having this new
partIndexlist changes plans?

Julien Rouhaud

rjuju123@gmail.com

almost 4 years ago

In reply to: Arne Roland (#7)

Re: missing indexes in indexlist with partitioned tables

Hi,

On Mon, Jan 17, 2022 at 08:32:40PM +0000, Arne Roland wrote:

Afaiac the join pruning where the outer table is a partitioned table is the relevant case.

The last version of the patch now fails on all platform, with plan changes.

For instance:
https://cirrus-ci.com/task/4825629131538432
https://api.cirrus-ci.com/v1/artifact/task/4825629131538432/regress_diffs/src/test/regress/regression.diffs
diff -U3 /tmp/cirrus-ci-build/src/test/regress/expected/partition_join.out /tmp/cirrus-ci-build/src/test/regress/results/partition_join.out
--- /tmp/cirrus-ci-build/src/test/regress/expected/partition_join.out	2022-01-17 23:08:47.158198249 +0000
+++ /tmp/cirrus-ci-build/src/test/regress/results/partition_join.out	2022-01-17 23:12:34.163488567 +0000
@@ -4887,37 +4887,23 @@
 SET enable_partitionwise_join = on;
 EXPLAIN (COSTS OFF)
 SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
-                              QUERY PLAN
------------------------------------------------------------------------
+                           QUERY PLAN
+-----------------------------------------------------------------
  Limit
-   ->  Merge Append
-         Sort Key: x.id
-         ->  Merge Left Join
-               Merge Cond: (x_1.id = y_1.id)
-               ->  Index Only Scan using fract_t0_pkey on fract_t0 x_1
-               ->  Index Only Scan using fract_t0_pkey on fract_t0 y_1
-         ->  Merge Left Join
-               Merge Cond: (x_2.id = y_2.id)
-               ->  Index Only Scan using fract_t1_pkey on fract_t1 x_2
-               ->  Index Only Scan using fract_t1_pkey on fract_t1 y_2
-(11 rows)
+   ->  Append
+         ->  Index Only Scan using fract_t0_pkey on fract_t0 x_1
+         ->  Index Only Scan using fract_t1_pkey on fract_t1 x_2
+(4 rows)

 EXPLAIN (COSTS OFF)
 SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
-                                   QUERY PLAN
---------------------------------------------------------------------------------
+                                QUERY PLAN
+--------------------------------------------------------------------------
  Limit
-   ->  Merge Append
-         Sort Key: x.id DESC
-         ->  Nested Loop Left Join
-               ->  Index Only Scan Backward using fract_t0_pkey on fract_t0 x_1
-               ->  Index Only Scan using fract_t0_pkey on fract_t0 y_1
-                     Index Cond: (id = x_1.id)
-         ->  Nested Loop Left Join
-               ->  Index Only Scan Backward using fract_t1_pkey on fract_t1 x_2
-               ->  Index Only Scan using fract_t1_pkey on fract_t1 y_2
-                     Index Cond: (id = x_2.id)
-(11 rows)
+   ->  Append
+         ->  Index Only Scan Backward using fract_t1_pkey on fract_t1 x_2
+         ->  Index Only Scan Backward using fract_t0_pkey on fract_t0 x_1
+(4 rows)

alvherre@alvh.no-ip.org

almost 4 years ago

In reply to: Julien Rouhaud (#8)

Re: missing indexes in indexlist with partitioned tables

On 2022-Jan-18, Julien Rouhaud wrote:

SET enable_partitionwise_join = on;
EXPLAIN (COSTS OFF)
SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
-                              QUERY PLAN
------------------------------------------------------------------------
+                           QUERY PLAN
+-----------------------------------------------------------------
Limit
-   ->  Merge Append
-         Sort Key: x.id
-         ->  Merge Left Join
-               Merge Cond: (x_1.id = y_1.id)
-               ->  Index Only Scan using fract_t0_pkey on fract_t0 x_1
-               ->  Index Only Scan using fract_t0_pkey on fract_t0 y_1
-         ->  Merge Left Join
-               Merge Cond: (x_2.id = y_2.id)
-               ->  Index Only Scan using fract_t1_pkey on fract_t1 x_2
-               ->  Index Only Scan using fract_t1_pkey on fract_t1 y_2
-(11 rows)
+   ->  Append
+         ->  Index Only Scan using fract_t0_pkey on fract_t0 x_1
+         ->  Index Only Scan using fract_t1_pkey on fract_t1 x_2
+(4 rows)

Hmm, these plan changes look valid to me. A left self-join using the
primary key column? That looks optimizable all right.

I suspect that the author of partition-wise joins would want to change
these queries so that whatever was being tested by these self-joins is
tested by some other means (maybe just create an identical partitioned
table via CREATE TABLE fract_t2 ... ; INSERT INTO fract_t2 SELECT FROM
fract_t). But at the same time, the author of this patch should a) make
sure that the submitted patch updates these test results so that the
test pass, and also b) add some test cases to verify that his desired
behavior is tested somewhere, not just in a test case that's
incidentally broken by his patch.

What I still don't know is whether this patch is actually desirable or
not. If the only cases it affects is self-joins, is there much actual
value?

--
Álvaro Herrera Valdivia, Chile — https://www.EnterpriseDB.com/

#10

A.Roland@index.de

almost 4 years ago

In reply to: Alvaro Herrera (#9)

Re: missing indexes in indexlist with partitioned tables

Hi!

From: Alvaro Herrera <alvherre@alvh.no-ip.org>
[...]
Hmm, these plan changes look valid to me. A left self-join using the
primary key column? That looks optimizable all right.
[...]
What I still don't know is whether this patch is actually desirable or
not. If the only cases it affects is self-joins, is there much actual
value?

This is not really about self joins. That was just the most simple example, because otherwise we need a second table.
It's unique, it's not relevant whether it's the same table. In most cases it won't. I was talking about join pruning.

I suspect that the author of partition-wise joins would want to change
these queries so that whatever was being tested by these self-joins is
tested by some other means (maybe just create an identical partitioned
table via CREATE TABLE fract_t2 ... ; INSERT INTO fract_t2 SELECT FROM
fract_t). But at the same time, the author of this patch should

Your suggestion doesn't work, because with my patch we solve that case as well. We solve the general join pruning case. If we make the index non-unique however, we should be able to test the fractional case the same way.

b) add some test cases to verify that his desired
behavior is tested somewhere, not just in a test case that's
incidentally broken by his patch.

My patch already includes such a test, look at @@ -90,6 +90,13 @@ src/test/regress/sql/partition_join.sql
Since the selfjoin part was confusing to you, it might be worthwhile to do test that with two different tables. While I see no need to test in that way, I will adjust the patch so. Just to make it more clear for people looking at those tests in the future.

a) make

sure that the submitted patch updates these test results so that the
test pass [...]

Just for the record: I did run the tests, but I did miss that the commit of Tomas fix for fractional optimization is already on the master. Please note that this is a very new test case from a patch committed less than one week ago.

I'm glad Julien Rouhaud pointed out, that Tomas patch is committed it by now. That was very helpful to me, as I can now integrate the two tests.

@Álvaro Herrera:
If you want to help me, please don't put forward an abstract list of responsibilities. If anything I obviously need practical help, on how I can catch on recent changes quicker and without manual intervention. I don't have a modified buildfarm animal running, that tries to apply my patch and run regression tests for my patch on a daily basis.
Is there a simple way for me to check for that?

I will probably integrate those two tests, since they can work of similar structures without need to recreate the tables again and again. I have clear understanding how that new test works. I have to attend a few calls now, but I should be able to update the tests later.

Regards
Arne

#11

A.Roland@index.de

almost 4 years ago

In reply to: Arne Roland (#10)

1 attachment(s)

Re: missing indexes in indexlist with partitioned tables

Hi,

I came up with a slightly more intuitive way to force the different outcome for the fractional test, that does hardly change anything.

I'm not sure, whether the differentiation between fract_x and fract_t is worth it, since there shouldn't be any difference, but as mentioned before I added it for potential future clarity.

Thanks for your feedback again!

Regards

Arne

________________________________
From: Arne Roland
Sent: Wednesday, January 19, 2022 10:13:55 PM
To: Alvaro Herrera; Julien Rouhaud
Cc: pgsql-hackers
Subject: Re: missing indexes in indexlist with partitioned tables

Hi!

From: Alvaro Herrera <alvherre@alvh.no-ip.org>
[...]
Hmm, these plan changes look valid to me. A left self-join using the
primary key column? That looks optimizable all right.
[...]
What I still don't know is whether this patch is actually desirable or
not. If the only cases it affects is self-joins, is there much actual
value?

I suspect that the author of partition-wise joins would want to change
these queries so that whatever was being tested by these self-joins is
tested by some other means (maybe just create an identical partitioned
table via CREATE TABLE fract_t2 ... ; INSERT INTO fract_t2 SELECT FROM
fract_t). But at the same time, the author of this patch should

b) add some test cases to verify that his desired
behavior is tested somewhere, not just in a test case that's
incidentally broken by his patch.

a) make

sure that the submitted patch updates these test results so that the
test pass [...]

I'm glad Julien Rouhaud pointed out, that Tomas patch is committed it by now. That was very helpful to me, as I can now integrate the two tests.

Regards
Arne

Attachments:

0004-partIndexlistClean.patchtext/x-patch; name=0004-partIndexlistClean.patchDownload

diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 0ef70ad7f1..fe50919637 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -3504,7 +3504,7 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
 	Assert(list_length(exprlist) == list_length(oprlist));
 
 	/* Short-circuit if no indexes... */
-	if (rel->indexlist == NIL)
+	if (rel->indexlist == NIL && rel->partIndexlist == NIL)
 		return false;
 
 	/*
@@ -3549,7 +3549,7 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
 		return false;
 
 	/* Examine each index of the relation ... */
-	foreach(ic, rel->indexlist)
+	foreach(ic, list_concat(rel->indexlist, rel->partIndexlist))
 	{
 		IndexOptInfo *ind = (IndexOptInfo *) lfirst(ic);
 		int			c;
diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index 337f470d58..b04b3f88ad 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -23,6 +23,7 @@
 #include "postgres.h"
 
 #include "nodes/nodeFuncs.h"
+#include "nodes/nodes.h"
 #include "optimizer/clauses.h"
 #include "optimizer/joininfo.h"
 #include "optimizer/optimizer.h"
@@ -598,7 +599,7 @@ rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel)
 		 */
 		ListCell   *lc;
 
-		foreach(lc, rel->indexlist)
+		foreach(lc, list_concat(rel->indexlist, rel->partIndexlist))
 		{
 			IndexOptInfo *ind = (IndexOptInfo *) lfirst(lc);
 
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index a5002ad895..db6c3f2912 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -116,8 +116,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 {
 	Index		varno = rel->relid;
 	Relation	relation;
-	bool		hasindex;
 	List	   *indexinfos = NIL;
+	List	   *partIndexinfos = NIL;
 
 	/*
 	 * We need not lock the relation since it was already locked, either by
@@ -154,17 +154,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	/* Retrieve the parallel_workers reloption, or -1 if not set. */
 	rel->rel_parallel_workers = RelationGetParallelWorkers(relation, -1);
 
-	/*
-	 * Make list of indexes.  Ignore indexes on system catalogs if told to.
-	 * Don't bother with indexes for an inheritance parent, either.
-	 */
-	if (inhparent ||
-		(IgnoreSystemIndexes && IsSystemRelation(relation)))
-		hasindex = false;
-	else
-		hasindex = relation->rd_rel->relhasindex;
-
-	if (hasindex)
+	/* Make list of indexes. Ignore indexes on system catalogs if told to. */
+	if (!(IgnoreSystemIndexes && IsSystemRelation(relation)) && relation->rd_rel->relhasindex)
 	{
 		List	   *indexoidlist;
 		LOCKMODE	lmode;
@@ -213,10 +204,12 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			}
 
 			/*
-			 * Ignore partitioned indexes, since they are not usable for
-			 * queries.
+			 * Don't add partitioned indexes to the indexlist, since they are
+			 * not usable by the executor. If they are unique add them to the
+			 * partindexlist instead, to use for further pruning. If they
+			 * aren't that either, simply skip them.
 			 */
-			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+			if (inhparent && (!index->indisunique || indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX))
 			{
 				index_close(indexRelation, NoLock);
 				continue;
@@ -264,7 +257,40 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				info->indexcollations[i] = indexRelation->rd_indcollation[i];
 			}
 
-			info->relam = indexRelation->rd_rel->relam;
+			/*
+			 * Fetch the index expressions and predicate, if any.  We must
+			 * modify the copies we obtain from the relcache to have the
+			 * correct varno for the parent relation, so that they match up
+			 * correctly against qual clauses.
+			 */
+			info->indexprs = RelationGetIndexExpressions(indexRelation);
+			info->indpred = RelationGetIndexPredicate(indexRelation);
+			if (info->indexprs && varno != 1)
+				ChangeVarNodes((Node *) info->indexprs, 1, varno, 0);
+			if (info->indpred && varno != 1)
+				ChangeVarNodes((Node *) info->indpred, 1, varno, 0);
+
+			info->unique = index->indisunique;
+			info->immediate = index->indimmediate;
+
+			/*
+			 * Don't add partitioned indexes to the indexlist, add them to the
+			 * partindexlist instead, since they are not usable by the
+			 * executor.
+			 */
+			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+			{
+				index_close(indexRelation, NoLock);
+				partIndexinfos = lappend(partIndexinfos, info);
+				continue;
+			}
+
+			info->hypothetical = false;
+			info->indrestrictinfo = NIL;	/* set later, in indxpath.c */
+			info->predOK = false;	/* set later, in indxpath.c */
+
+			/* Build targetlist using the completed indexprs data */
+			info->indextlist = build_index_tlist(root, info, relation);
 
 			/* We copy just the fields we need, not all of rd_indam */
 			amroutine = indexRelation->rd_indam;
@@ -284,6 +310,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			/* Fetch index opclass options */
 			info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
 
+			info->relam = indexRelation->rd_rel->relam;
+
 			/*
 			 * Fetch the ordering information for the index, if any.
 			 */
@@ -370,28 +398,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				info->nulls_first = NULL;
 			}
 
-			/*
-			 * Fetch the index expressions and predicate, if any.  We must
-			 * modify the copies we obtain from the relcache to have the
-			 * correct varno for the parent relation, so that they match up
-			 * correctly against qual clauses.
-			 */
-			info->indexprs = RelationGetIndexExpressions(indexRelation);
-			info->indpred = RelationGetIndexPredicate(indexRelation);
-			if (info->indexprs && varno != 1)
-				ChangeVarNodes((Node *) info->indexprs, 1, varno, 0);
-			if (info->indpred && varno != 1)
-				ChangeVarNodes((Node *) info->indpred, 1, varno, 0);
-
-			/* Build targetlist using the completed indexprs data */
-			info->indextlist = build_index_tlist(root, info, relation);
-
-			info->indrestrictinfo = NIL;	/* set later, in indxpath.c */
-			info->predOK = false;	/* set later, in indxpath.c */
-			info->unique = index->indisunique;
-			info->immediate = index->indimmediate;
-			info->hypothetical = false;
-
 			/*
 			 * Estimate the index size.  If it's not a partial index, we lock
 			 * the number-of-tuples estimate to equal the parent table; if it
@@ -441,6 +447,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	}
 
 	rel->indexlist = indexinfos;
+	rel->partIndexlist = partIndexinfos;
 
 	rel->statlist = get_relation_statistics(rel, relation);
 
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 1f3845b3fe..4d0ed938ac 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -334,11 +334,11 @@ struct PlannerInfo
 
 	MemoryContext planner_cxt;	/* context holding PlannerInfo */
 
-	Cardinality	total_table_pages;	/* # of pages in all non-dummy tables of
+	Cardinality total_table_pages;	/* # of pages in all non-dummy tables of
 									 * query */
 
-	Selectivity	tuple_fraction; /* tuple_fraction passed to query_planner */
-	Cardinality	limit_tuples;	/* limit_tuples passed to query_planner */
+	Selectivity tuple_fraction; /* tuple_fraction passed to query_planner */
+	Cardinality limit_tuples;	/* limit_tuples passed to query_planner */
 
 	Index		qual_security_level;	/* minimum security_level for quals */
 	/* Note: qual_security_level is zero if there are no securityQuals */
@@ -681,7 +681,7 @@ typedef struct RelOptInfo
 	Relids		relids;			/* set of base relids (rangetable indexes) */
 
 	/* size estimates generated by planner */
-	Cardinality	rows;			/* estimated number of result tuples */
+	Cardinality rows;			/* estimated number of result tuples */
 
 	/* per-relation planner control flags */
 	bool		consider_startup;	/* keep cheap-startup-cost paths? */
@@ -716,9 +716,10 @@ typedef struct RelOptInfo
 	List	   *lateral_vars;	/* LATERAL Vars and PHVs referenced by rel */
 	Relids		lateral_referencers;	/* rels that reference me laterally */
 	List	   *indexlist;		/* list of IndexOptInfo */
+	List	   *partIndexlist;	/* list of IndexOptInfo */
 	List	   *statlist;		/* list of StatisticExtInfo */
 	BlockNumber pages;			/* size estimates derived from pg_class */
-	Cardinality	tuples;
+	Cardinality tuples;
 	double		allvisfrac;
 	Bitmapset  *eclass_indexes; /* Indexes in PlannerInfo's eq_classes list of
 								 * ECs that mention this rel */
@@ -841,7 +842,7 @@ struct IndexOptInfo
 
 	/* index-size statistics (from pg_class and elsewhere) */
 	BlockNumber pages;			/* number of disk pages in index */
-	Cardinality	tuples;			/* number of index tuples in index */
+	Cardinality tuples;			/* number of index tuples in index */
 	int			tree_height;	/* index tree height, or -1 if unknown */
 
 	/* index descriptor information */
@@ -1140,7 +1141,7 @@ typedef struct ParamPathInfo
 	NodeTag		type;
 
 	Relids		ppi_req_outer;	/* rels supplying parameters used by path */
-	Cardinality	ppi_rows;		/* estimated number of result tuples */
+	Cardinality ppi_rows;		/* estimated number of result tuples */
 	List	   *ppi_clauses;	/* join clauses available from outer rels */
 } ParamPathInfo;
 
@@ -1190,7 +1191,7 @@ typedef struct Path
 	int			parallel_workers;	/* desired # of workers; 0 = not parallel */
 
 	/* estimated size/costs for path (see costsize.c for more info) */
-	Cardinality	rows;			/* estimated number of result tuples */
+	Cardinality rows;			/* estimated number of result tuples */
 	Cost		startup_cost;	/* cost expended before fetching any tuples */
 	Cost		total_cost;		/* total cost (assuming all tuples fetched) */
 
@@ -1453,7 +1454,7 @@ typedef struct AppendPath
 	List	   *subpaths;		/* list of component Paths */
 	/* Index of first partial path in subpaths; list_length(subpaths) if none */
 	int			first_partial_path;
-	Cardinality	limit_tuples;	/* hard limit on output tuples, or -1 */
+	Cardinality limit_tuples;	/* hard limit on output tuples, or -1 */
 } AppendPath;
 
 #define IS_DUMMY_APPEND(p) \
@@ -1475,7 +1476,7 @@ typedef struct MergeAppendPath
 {
 	Path		path;
 	List	   *subpaths;		/* list of component Paths */
-	Cardinality	limit_tuples;	/* hard limit on output tuples, or -1 */
+	Cardinality limit_tuples;	/* hard limit on output tuples, or -1 */
 } MergeAppendPath;
 
 /*
@@ -1773,7 +1774,7 @@ typedef struct AggPath
 	Path	   *subpath;		/* path representing input source */
 	AggStrategy aggstrategy;	/* basic strategy, see nodes.h */
 	AggSplit	aggsplit;		/* agg-splitting mode, see nodes.h */
-	Cardinality	numGroups;		/* estimated number of groups in input */
+	Cardinality numGroups;		/* estimated number of groups in input */
 	uint64		transitionSpace;	/* for pass-by-ref transition data */
 	List	   *groupClause;	/* a list of SortGroupClause's */
 	List	   *qual;			/* quals (HAVING quals), if any */
@@ -1787,7 +1788,7 @@ typedef struct GroupingSetData
 {
 	NodeTag		type;
 	List	   *set;			/* grouping set as list of sortgrouprefs */
-	Cardinality	numGroups;		/* est. number of result groups */
+	Cardinality numGroups;		/* est. number of result groups */
 } GroupingSetData;
 
 typedef struct RollupData
@@ -1796,7 +1797,7 @@ typedef struct RollupData
 	List	   *groupClause;	/* applicable subset of parse->groupClause */
 	List	   *gsets;			/* lists of integer indexes into groupClause */
 	List	   *gsets_data;		/* list of GroupingSetData */
-	Cardinality	numGroups;		/* est. number of result groups */
+	Cardinality numGroups;		/* est. number of result groups */
 	bool		hashable;		/* can be hashed */
 	bool		is_hashed;		/* to be implemented as a hashagg */
 } RollupData;
@@ -1847,7 +1848,7 @@ typedef struct SetOpPath
 	List	   *distinctList;	/* SortGroupClauses identifying target cols */
 	AttrNumber	flagColIdx;		/* where is the flag column, if any */
 	int			firstFlag;		/* flag value for first input relation */
-	Cardinality	numGroups;		/* estimated number of groups in input */
+	Cardinality numGroups;		/* estimated number of groups in input */
 } SetOpPath;
 
 /*
@@ -1860,7 +1861,7 @@ typedef struct RecursiveUnionPath
 	Path	   *rightpath;
 	List	   *distinctList;	/* SortGroupClauses identifying target cols */
 	int			wtParam;		/* ID of Param representing work table */
-	Cardinality	numGroups;		/* estimated number of groups in input */
+	Cardinality numGroups;		/* estimated number of groups in input */
 } RecursiveUnionPath;
 
 /*
@@ -2616,7 +2617,7 @@ typedef struct
 typedef struct
 {
 	bool		limit_needed;
-	Cardinality	limit_tuples;
+	Cardinality limit_tuples;
 	int64		count_est;
 	int64		offset_est;
 } FinalPathExtraData;
@@ -2647,15 +2648,15 @@ typedef struct JoinCostWorkspace
 	Cost		inner_rescan_run_cost;
 
 	/* private for cost_mergejoin code */
-	Cardinality	outer_rows;
-	Cardinality	inner_rows;
-	Cardinality	outer_skip_rows;
-	Cardinality	inner_skip_rows;
+	Cardinality outer_rows;
+	Cardinality inner_rows;
+	Cardinality outer_skip_rows;
+	Cardinality inner_skip_rows;
 
 	/* private for cost_hashjoin code */
 	int			numbuckets;
 	int			numbatches;
-	Cardinality	inner_rows_total;
+	Cardinality inner_rows_total;
 } JoinCostWorkspace;
 
 /*
diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out
index bb5b7c47a4..562bad6544 100644
--- a/src/test/regress/expected/partition_join.out
+++ b/src/test/regress/expected/partition_join.out
@@ -4866,14 +4866,42 @@ SELECT t1.*, t2.* FROM alpha t1 INNER JOIN beta t2 ON (t1.a = t2.a AND t1.b = t2
 CREATE TABLE fract_t (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
 CREATE TABLE fract_t0 PARTITION OF fract_t FOR VALUES FROM ('0') TO ('1000');
 CREATE TABLE fract_t1 PARTITION OF fract_t FOR VALUES FROM ('1000') TO ('2000');
+CREATE TABLE fract_x (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
+CREATE TABLE fract_x0 PARTITION OF fract_x FOR VALUES FROM ('0') TO ('1000');
+CREATE TABLE fract_x1 PARTITION OF fract_x FOR VALUES FROM ('1000') TO ('2000');
 -- insert data
 INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
 ANALYZE fract_t;
--- verify plan; nested index only scans
+INSERT INTO fract_x (id) (SELECT generate_series(0, 1999));
+ANALYZE fract_x;
+SET max_parallel_workers_per_gather = 0;
+SET enable_partitionwise_join = on;
+-- verify partition pruning
 SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+                           QUERY PLAN                            
+-----------------------------------------------------------------
+ Limit
+   ->  Append
+         ->  Index Only Scan using fract_x0_pkey on fract_x0 x_1
+         ->  Index Only Scan using fract_x1_pkey on fract_x1 x_2
+(4 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT x.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+                           QUERY PLAN                            
+-----------------------------------------------------------------
+ Limit
+   ->  Append
+         ->  Index Only Scan using fract_x0_pkey on fract_x0 x_1
+         ->  Index Only Scan using fract_x1_pkey on fract_x1 x_2
+(4 rows)
+
+-- verify plan; nested index only scans
+EXPLAIN (COSTS OFF)
+SELECT x.id, y.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
                               QUERY PLAN                               
 -----------------------------------------------------------------------
  Limit
@@ -4881,32 +4909,33 @@ SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
          Sort Key: x.id
          ->  Merge Left Join
                Merge Cond: (x_1.id = y_1.id)
-               ->  Index Only Scan using fract_t0_pkey on fract_t0 x_1
+               ->  Index Only Scan using fract_x0_pkey on fract_x0 x_1
                ->  Index Only Scan using fract_t0_pkey on fract_t0 y_1
          ->  Merge Left Join
                Merge Cond: (x_2.id = y_2.id)
-               ->  Index Only Scan using fract_t1_pkey on fract_t1 x_2
+               ->  Index Only Scan using fract_x1_pkey on fract_x1 x_2
                ->  Index Only Scan using fract_t1_pkey on fract_t1 y_2
 (11 rows)
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
                                    QUERY PLAN                                   
 --------------------------------------------------------------------------------
  Limit
    ->  Merge Append
          Sort Key: x.id DESC
          ->  Nested Loop Left Join
-               ->  Index Only Scan Backward using fract_t0_pkey on fract_t0 x_1
+               ->  Index Only Scan Backward using fract_x0_pkey on fract_x0 x_1
                ->  Index Only Scan using fract_t0_pkey on fract_t0 y_1
                      Index Cond: (id = x_1.id)
          ->  Nested Loop Left Join
-               ->  Index Only Scan Backward using fract_t1_pkey on fract_t1 x_2
+               ->  Index Only Scan Backward using fract_x1_pkey on fract_x1 x_2
                ->  Index Only Scan using fract_t1_pkey on fract_t1 y_2
                      Index Cond: (id = x_2.id)
 (11 rows)
 
 -- cleanup
 DROP TABLE fract_t;
+DROP TABLE fract_x;
 RESET max_parallel_workers_per_gather;
 RESET enable_partitionwise_join;
diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql
index 67f506361f..39e23911d0 100644
--- a/src/test/regress/sql/partition_join.sql
+++ b/src/test/regress/sql/partition_join.sql
@@ -1148,22 +1148,38 @@ CREATE TABLE fract_t (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
 CREATE TABLE fract_t0 PARTITION OF fract_t FOR VALUES FROM ('0') TO ('1000');
 CREATE TABLE fract_t1 PARTITION OF fract_t FOR VALUES FROM ('1000') TO ('2000');
 
+CREATE TABLE fract_x (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
+CREATE TABLE fract_x0 PARTITION OF fract_x FOR VALUES FROM ('0') TO ('1000');
+CREATE TABLE fract_x1 PARTITION OF fract_x FOR VALUES FROM ('1000') TO ('2000');
 -- insert data
 INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
 ANALYZE fract_t;
+INSERT INTO fract_x (id) (SELECT generate_series(0, 1999));
+ANALYZE fract_x;
 
--- verify plan; nested index only scans
 SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 
+-- verify partition pruning
+SET max_parallel_workers_per_gather = 0;
+SET enable_partitionwise_join = on;
+
+EXPLAIN (COSTS OFF)
+SELECT x.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+
+EXPLAIN (COSTS OFF)
+SELECT x.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+
+-- verify plan; nested index only scans
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id, y.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
 
 -- cleanup
 DROP TABLE fract_t;
+DROP TABLE fract_x;
 
 RESET max_parallel_workers_per_gather;
 RESET enable_partitionwise_join;

#12

Zhihong Yu

zyu@yugabyte.com

almost 4 years ago

In reply to: Arne Roland (#11)

Re: missing indexes in indexlist with partitioned tables

On Wed, Jan 19, 2022 at 1:50 PM Arne Roland <A.Roland@index.de> wrote:

Hi,

I came up with a slightly more intuitive way to force the different
outcome for the fractional test, that does hardly change anything.

I'm not sure, whether the differentiation between fract_x and fract_t is
worth it, since there shouldn't be any difference, but as mentioned before
I added it for potential future clarity.

Thanks for your feedback again!

Regards

Arne

------------------------------
*From:* Arne Roland
*Sent:* Wednesday, January 19, 2022 10:13:55 PM
*To:* Alvaro Herrera; Julien Rouhaud
*Cc:* pgsql-hackers
*Subject:* Re: missing indexes in indexlist with partitioned tables

Hi!

From: Alvaro Herrera <alvherre@alvh.no-ip.org>
[...]
Hmm, these plan changes look valid to me. A left self-join using the
primary key column? That looks optimizable all right.
[...]
What I still don't know is whether this patch is actually desirable or
not. If the only cases it affects is self-joins, is there much actual
value?

This is not really about self joins. That was just the most simple
example, because otherwise we need a second table.
It's unique, it's not relevant whether it's the same table. In most cases
it won't. I was talking about join pruning.

I suspect that the author of partition-wise joins would want to change
these queries so that whatever was being tested by these self-joins is
tested by some other means (maybe just create an identical partitioned
table via CREATE TABLE fract_t2 ... ; INSERT INTO fract_t2 SELECT FROM
fract_t). But at the same time, the author of this patch should

Your suggestion doesn't work, because with my patch we solve that case as
well. We solve the general join pruning case. If we make the index
non-unique however, we should be able to test the fractional case the
same way.

b) add some test cases to verify that his desired
behavior is tested somewhere, not just in a test case that's
incidentally broken by his patch.

My patch already includes such a test, look at @@ -90,6 +90,13 @@
src/test/regress/sql/partition_join.sql
Since the selfjoin part was confusing to you, it might be worthwhile to do
test that with two different tables. While I see no need to test in that
way, I will adjust the patch so. Just to make it more clear for people
looking at those tests in the future.

a) make

sure that the submitted patch updates these test results so that the
test pass [...]

Just for the record: I did run the tests, but I did miss that the commit
of Tomas fix for fractional optimization is already on the master. Please
note that this is a very new test case from a patch committed less than one
week ago.

I'm glad Julien Rouhaud pointed out, that Tomas patch is committed it by
now. That was very helpful to me, as I can now integrate the two tests.

@Álvaro Herrera:
If you want to help me, please don't put forward an abstract list of
responsibilities. If anything I obviously need practical help, on how I can
catch on recent changes quicker and without manual intervention. I don't
have a modified buildfarm animal running, that tries to apply my patch and
run regression tests for my patch on a daily basis.
Is there a simple way for me to check for that?

I will probably integrate those two tests, since they can work of similar
structures without need to recreate the tables again and again. I have
clear understanding how that new test works. I have to attend a few calls
now, but I should be able to update the tests later.

Regards
Arne

Hi,

-           if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+           if (inhparent && (!index->indisunique ||
indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX))

The check on RELKIND_PARTITIONED_INDEX seems to negate what the comment
above says:

+ * Don't add partitioned indexes to the indexlist

Cheers

#13

alvherre@alvh.no-ip.org

almost 4 years ago

In reply to: Arne Roland (#10)

Re: missing indexes in indexlist with partitioned tables

On 2022-Jan-19, Arne Roland wrote:

a) make sure that the submitted patch updates these test results so
that the test pass [...]

Just for the record: I did run the tests, but I did miss that the
commit of Tomas fix for fractional optimization is already on the
master. Please note that this is a very new test case from a patch
committed less than one week ago.

Ah, apologies, I didn't realize that that test was so new.

If anything I obviously need practical help, on how
I can catch on recent changes quicker and without manual intervention.
I don't have a modified buildfarm animal running, that tries to apply
my patch and run regression tests for my patch on a daily basis.

See src/tools/ci/README (for multi-platform testing of patches on
several platforms) and http://commitfest.cputube.org/

--
Álvaro Herrera 39°49'30"S 73°17'W — https://www.EnterpriseDB.com/

#14

A.Roland@index.de

almost 4 years ago

In reply to: Zhihong Yu (#12)

Re: missing indexes in indexlist with partitioned tables

Hi!

From: Zhihong Yu <zyu@yugabyte.com>
Subject: Re: missing indexes in indexlist with partitioned tables

Hi,
-           if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+           if (inhparent && (!index->indisunique || indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX))
The check on RELKIND_PARTITIONED_INDEX seems to negate what the comment above says:

+ * Don't add partitioned indexes to the indexlist

Cheers

The comment at my end goes on:

/*
* Don't add partitioned indexes to the indexlist, since they are
* not usable by the executor. If they are unique add them to the
* partindexlist instead, to use for further pruning. If they
* aren't that either, simply skip them.
*/

Regarding the structure: I think, that we probably should remove the first two sentences here. They reoccur 50 lines below anyways, which seems a dubious practice. The logic that enforces the first two sentences is mainly down below, so that place is probably on one to keep.

Regarding the semantics: This is sort of what the statement checks for (skip for inhparent, if not unique or not partitioned index), i.e. it checks for the case, where the index shouldn't be added to either list.

Side note: I personally think the name inhparent is mildly confusing, since it's not really about inheritance. I don't have a significantly better idea though.

From: Alvaro Herrera <alvherre@alvh.no-ip.org>
Sent: Wednesday, January 19, 2022 23:26

Ah, apologies, I didn't realize that that test was so new.

No offense taken. Unless one was involved in the creation of the corresponding patch, it's unreasonable to know that. I like the second part of your message very much:

See src/tools/ci/README (for multi-platform testing of patches on
several platforms) and http://commitfest.cputube.org/

Thanks, I didn't know of cputube. Neat! That's pretty much what I was looking for!
Is there a way to get an email notification if some machine fails (turns bright red)? For the threads I'm explicitly subscribed to, that would seem helpful to me.

Regards
Arne

#15

Amit Langote

amitlangote09@gmail.com

almost 4 years ago

In reply to: Arne Roland (#14)

Re: missing indexes in indexlist with partitioned tables

Hi,

On Mon, Jan 24, 2022 at 9:30 PM Arne Roland <A.Roland@index.de> wrote:

The comment at my end goes on:

/*
* Don't add partitioned indexes to the indexlist, since they are
* not usable by the executor. If they are unique add them to the
* partindexlist instead, to use for further pruning. If they
* aren't that either, simply skip them.
*/

"partindexlist" really made me think about a list of "partial indexes"
for some reason. I think maybe "partedindexlist" is what you are
looking for; "parted" is commonly used as short for "partitioned" when
naming variables.

The comment only mentions "further pruning" as to what partitioned
indexes are to be remembered in RelOptInfo, but it's not clear what
that means. It may help to be more specific.

Finally, I don't understand why we need a separate field to store
indexes found in partitioned base relations. AFAICS, nothing but the
sites you are interested in (relation_has_unique_index_for() and
rel_supports_distinctness()) would ever bother to look at a
partitioned base relation's indexlist. Do you think putting them into
in indexlist might break something?

Regarding the semantics: This is sort of what the statement checks for (skip for inhparent, if not unique or not partitioned index), i.e. it checks for the case, where the index shouldn't be added to either list.

Side note: I personally think the name inhparent is mildly confusing, since it's not really about inheritance. I don't have a significantly better idea though.

Partitioned tables are "inheritance parent", so share the same code as
what traditional inheritance parents have always used for planning.

--
Amit Langote
EDB: http://www.enterprisedb.com

#16

A.Roland@index.de

almost 4 years ago

In reply to: Amit Langote (#15)

1 attachment(s)

Re: missing indexes in indexlist with partitioned tables

Hi!

From: Amit Langote <amitlangote09@gmail.com>
Sent: Tuesday, January 25, 2022 09:04
Subject: Re: missing indexes in indexlist with partitioned tables

[...]
"partindexlist" really made me think about a list of "partial indexes"
for some reason. I think maybe "partedindexlist" is what you are
looking for; "parted" is commonly used as short for "partitioned" when
naming variables.

The comment only mentions "further pruning" as to what partitioned
indexes are to be remembered in RelOptInfo, but it's not clear what
that means. It may help to be more specific.

Thanks for the feedback! I've changed that. The current version is attached.

Finally, I don't understand why we need a separate field to store
indexes found in partitioned base relations. AFAICS, nothing but the
sites you are interested in (relation_has_unique_index_for() and
rel_supports_distinctness()) would ever bother to look at a
partitioned base relation's indexlist. Do you think putting them into
in indexlist might break something?

I have thought about that before. AFAICT there is nothing in core, which breaks. However I am not sure, I want to mix those two kinds of index nodes. First of all the structure is different, partedIndexes don't have physical attributes after all. This is technical implementation detail relating to the current promise, that entries of the indexlist are indexes we can use. And by use, I mean use for statistics or the executor.
I'm more concerned about future changes regarding the order and optimization of processing harder here. The order in which we do things in the planner is a bit messy, and I wouldn't mind seeing details about that change. Looking at the current wacky order in the optimizer, I'm not convinced, that nothing will want to have a look at the indexlist, before partitioned tables are unpacked.

Since it would be easy to introduce this new variable later, wouldn't mind adding it to the indexlist directly for now. But changing the underlying promise of what it contains, seems noteworthy and more intrusive to me.

Side note: I personally think the name inhparent is mildly confusing, since it's not really about inheritance. I don't have a significantly better idea though.

Partitioned tables are "inheritance parent", so share the same code as
what traditional inheritance parents have always used for planning.

I recall that manual partitioning via inheritance, that was cumbersome. Though that minor historical detail was not, what I was referring to. There are a lot of other cases, that cause us to set inhparent. IIRC we use this flag in some ddl commands, which have nothing to do with inheritance. It essentially is used as a variant to skip the indexlist creation. If such hacks weren't there, we could simply check for the relkind and indisunique.

Regards
Arne

Attachments:

0005-partIndexlistClean.patchtext/x-patch; name=0005-partIndexlistClean.patchDownload

diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 0ef70ad7f1..5225076df3 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -3504,7 +3504,7 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
 	Assert(list_length(exprlist) == list_length(oprlist));
 
 	/* Short-circuit if no indexes... */
-	if (rel->indexlist == NIL)
+	if (rel->indexlist == NIL && rel->partedIndexlist == NIL)
 		return false;
 
 	/*
@@ -3549,7 +3549,7 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
 		return false;
 
 	/* Examine each index of the relation ... */
-	foreach(ic, rel->indexlist)
+	foreach(ic, list_concat(rel->indexlist, rel->partedIndexlist))
 	{
 		IndexOptInfo *ind = (IndexOptInfo *) lfirst(ic);
 		int			c;
diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index 337f470d58..16ce443ec9 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -23,6 +23,7 @@
 #include "postgres.h"
 
 #include "nodes/nodeFuncs.h"
+#include "nodes/nodes.h"
 #include "optimizer/clauses.h"
 #include "optimizer/joininfo.h"
 #include "optimizer/optimizer.h"
@@ -598,7 +599,7 @@ rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel)
 		 */
 		ListCell   *lc;
 
-		foreach(lc, rel->indexlist)
+		foreach(lc, list_concat(rel->indexlist, rel->partedIndexlist))
 		{
 			IndexOptInfo *ind = (IndexOptInfo *) lfirst(lc);
 
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index a5002ad895..2451b2ae79 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -116,8 +116,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 {
 	Index		varno = rel->relid;
 	Relation	relation;
-	bool		hasindex;
 	List	   *indexinfos = NIL;
+	List	   *partedIndexinfos = NIL;
 
 	/*
 	 * We need not lock the relation since it was already locked, either by
@@ -154,17 +154,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	/* Retrieve the parallel_workers reloption, or -1 if not set. */
 	rel->rel_parallel_workers = RelationGetParallelWorkers(relation, -1);
 
-	/*
-	 * Make list of indexes.  Ignore indexes on system catalogs if told to.
-	 * Don't bother with indexes for an inheritance parent, either.
-	 */
-	if (inhparent ||
-		(IgnoreSystemIndexes && IsSystemRelation(relation)))
-		hasindex = false;
-	else
-		hasindex = relation->rd_rel->relhasindex;
-
-	if (hasindex)
+	/* Make list of indexes. Ignore indexes on system catalogs if told to. */
+	if (!(IgnoreSystemIndexes && IsSystemRelation(relation)) && relation->rd_rel->relhasindex)
 	{
 		List	   *indexoidlist;
 		LOCKMODE	lmode;
@@ -213,10 +204,13 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			}
 
 			/*
-			 * Ignore partitioned indexes, since they are not usable for
-			 * queries.
+			 * Don't add partitioned indexes to the indexlist, since they are
+			 * not usable by the executor. If they are unique add them to the
+			 * partedIndexlist instead, to use for further pruning. That is
+			 * relevant for the join pruning, if the outer relation is partitioned.
+			 * If they aren't that either, simply skip them.
 			 */
-			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+			if (inhparent && (!index->indisunique || indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX))
 			{
 				index_close(indexRelation, NoLock);
 				continue;
@@ -264,7 +258,40 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				info->indexcollations[i] = indexRelation->rd_indcollation[i];
 			}
 
-			info->relam = indexRelation->rd_rel->relam;
+			/*
+			 * Fetch the index expressions and predicate, if any.  We must
+			 * modify the copies we obtain from the relcache to have the
+			 * correct varno for the parent relation, so that they match up
+			 * correctly against qual clauses.
+			 */
+			info->indexprs = RelationGetIndexExpressions(indexRelation);
+			info->indpred = RelationGetIndexPredicate(indexRelation);
+			if (info->indexprs && varno != 1)
+				ChangeVarNodes((Node *) info->indexprs, 1, varno, 0);
+			if (info->indpred && varno != 1)
+				ChangeVarNodes((Node *) info->indpred, 1, varno, 0);
+
+			info->unique = index->indisunique;
+			info->immediate = index->indimmediate;
+
+			/*
+			 * Don't add partitioned indexes to the indexlist, add them to the
+			 * partedIndexlist instead, since they are not usable by the
+			 * executor.
+			 */
+			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+			{
+				index_close(indexRelation, NoLock);
+				partedIndexinfos = lappend(partedIndexinfos, info);
+				continue;
+			}
+
+			info->hypothetical = false;
+			info->indrestrictinfo = NIL;	/* set later, in indxpath.c */
+			info->predOK = false;	/* set later, in indxpath.c */
+
+			/* Build targetlist using the completed indexprs data */
+			info->indextlist = build_index_tlist(root, info, relation);
 
 			/* We copy just the fields we need, not all of rd_indam */
 			amroutine = indexRelation->rd_indam;
@@ -284,6 +311,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			/* Fetch index opclass options */
 			info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
 
+			info->relam = indexRelation->rd_rel->relam;
+
 			/*
 			 * Fetch the ordering information for the index, if any.
 			 */
@@ -370,28 +399,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				info->nulls_first = NULL;
 			}
 
-			/*
-			 * Fetch the index expressions and predicate, if any.  We must
-			 * modify the copies we obtain from the relcache to have the
-			 * correct varno for the parent relation, so that they match up
-			 * correctly against qual clauses.
-			 */
-			info->indexprs = RelationGetIndexExpressions(indexRelation);
-			info->indpred = RelationGetIndexPredicate(indexRelation);
-			if (info->indexprs && varno != 1)
-				ChangeVarNodes((Node *) info->indexprs, 1, varno, 0);
-			if (info->indpred && varno != 1)
-				ChangeVarNodes((Node *) info->indpred, 1, varno, 0);
-
-			/* Build targetlist using the completed indexprs data */
-			info->indextlist = build_index_tlist(root, info, relation);
-
-			info->indrestrictinfo = NIL;	/* set later, in indxpath.c */
-			info->predOK = false;	/* set later, in indxpath.c */
-			info->unique = index->indisunique;
-			info->immediate = index->indimmediate;
-			info->hypothetical = false;
-
 			/*
 			 * Estimate the index size.  If it's not a partial index, we lock
 			 * the number-of-tuples estimate to equal the parent table; if it
@@ -441,6 +448,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	}
 
 	rel->indexlist = indexinfos;
+	rel->partedIndexlist = partedIndexinfos;
 
 	rel->statlist = get_relation_statistics(rel, relation);
 
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 1f3845b3fe..8a46e0ca70 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -334,11 +334,11 @@ struct PlannerInfo
 
 	MemoryContext planner_cxt;	/* context holding PlannerInfo */
 
-	Cardinality	total_table_pages;	/* # of pages in all non-dummy tables of
+	Cardinality total_table_pages;	/* # of pages in all non-dummy tables of
 									 * query */
 
-	Selectivity	tuple_fraction; /* tuple_fraction passed to query_planner */
-	Cardinality	limit_tuples;	/* limit_tuples passed to query_planner */
+	Selectivity tuple_fraction; /* tuple_fraction passed to query_planner */
+	Cardinality limit_tuples;	/* limit_tuples passed to query_planner */
 
 	Index		qual_security_level;	/* minimum security_level for quals */
 	/* Note: qual_security_level is zero if there are no securityQuals */
@@ -681,7 +681,7 @@ typedef struct RelOptInfo
 	Relids		relids;			/* set of base relids (rangetable indexes) */
 
 	/* size estimates generated by planner */
-	Cardinality	rows;			/* estimated number of result tuples */
+	Cardinality rows;			/* estimated number of result tuples */
 
 	/* per-relation planner control flags */
 	bool		consider_startup;	/* keep cheap-startup-cost paths? */
@@ -716,9 +716,10 @@ typedef struct RelOptInfo
 	List	   *lateral_vars;	/* LATERAL Vars and PHVs referenced by rel */
 	Relids		lateral_referencers;	/* rels that reference me laterally */
 	List	   *indexlist;		/* list of IndexOptInfo */
+	List	   *partedIndexlist;	/* list of IndexOptInfo */
 	List	   *statlist;		/* list of StatisticExtInfo */
 	BlockNumber pages;			/* size estimates derived from pg_class */
-	Cardinality	tuples;
+	Cardinality tuples;
 	double		allvisfrac;
 	Bitmapset  *eclass_indexes; /* Indexes in PlannerInfo's eq_classes list of
 								 * ECs that mention this rel */
@@ -841,7 +842,7 @@ struct IndexOptInfo
 
 	/* index-size statistics (from pg_class and elsewhere) */
 	BlockNumber pages;			/* number of disk pages in index */
-	Cardinality	tuples;			/* number of index tuples in index */
+	Cardinality tuples;			/* number of index tuples in index */
 	int			tree_height;	/* index tree height, or -1 if unknown */
 
 	/* index descriptor information */
@@ -1140,7 +1141,7 @@ typedef struct ParamPathInfo
 	NodeTag		type;
 
 	Relids		ppi_req_outer;	/* rels supplying parameters used by path */
-	Cardinality	ppi_rows;		/* estimated number of result tuples */
+	Cardinality ppi_rows;		/* estimated number of result tuples */
 	List	   *ppi_clauses;	/* join clauses available from outer rels */
 } ParamPathInfo;
 
@@ -1190,7 +1191,7 @@ typedef struct Path
 	int			parallel_workers;	/* desired # of workers; 0 = not parallel */
 
 	/* estimated size/costs for path (see costsize.c for more info) */
-	Cardinality	rows;			/* estimated number of result tuples */
+	Cardinality rows;			/* estimated number of result tuples */
 	Cost		startup_cost;	/* cost expended before fetching any tuples */
 	Cost		total_cost;		/* total cost (assuming all tuples fetched) */
 
@@ -1453,7 +1454,7 @@ typedef struct AppendPath
 	List	   *subpaths;		/* list of component Paths */
 	/* Index of first partial path in subpaths; list_length(subpaths) if none */
 	int			first_partial_path;
-	Cardinality	limit_tuples;	/* hard limit on output tuples, or -1 */
+	Cardinality limit_tuples;	/* hard limit on output tuples, or -1 */
 } AppendPath;
 
 #define IS_DUMMY_APPEND(p) \
@@ -1475,7 +1476,7 @@ typedef struct MergeAppendPath
 {
 	Path		path;
 	List	   *subpaths;		/* list of component Paths */
-	Cardinality	limit_tuples;	/* hard limit on output tuples, or -1 */
+	Cardinality limit_tuples;	/* hard limit on output tuples, or -1 */
 } MergeAppendPath;
 
 /*
@@ -1773,7 +1774,7 @@ typedef struct AggPath
 	Path	   *subpath;		/* path representing input source */
 	AggStrategy aggstrategy;	/* basic strategy, see nodes.h */
 	AggSplit	aggsplit;		/* agg-splitting mode, see nodes.h */
-	Cardinality	numGroups;		/* estimated number of groups in input */
+	Cardinality numGroups;		/* estimated number of groups in input */
 	uint64		transitionSpace;	/* for pass-by-ref transition data */
 	List	   *groupClause;	/* a list of SortGroupClause's */
 	List	   *qual;			/* quals (HAVING quals), if any */
@@ -1787,7 +1788,7 @@ typedef struct GroupingSetData
 {
 	NodeTag		type;
 	List	   *set;			/* grouping set as list of sortgrouprefs */
-	Cardinality	numGroups;		/* est. number of result groups */
+	Cardinality numGroups;		/* est. number of result groups */
 } GroupingSetData;
 
 typedef struct RollupData
@@ -1796,7 +1797,7 @@ typedef struct RollupData
 	List	   *groupClause;	/* applicable subset of parse->groupClause */
 	List	   *gsets;			/* lists of integer indexes into groupClause */
 	List	   *gsets_data;		/* list of GroupingSetData */
-	Cardinality	numGroups;		/* est. number of result groups */
+	Cardinality numGroups;		/* est. number of result groups */
 	bool		hashable;		/* can be hashed */
 	bool		is_hashed;		/* to be implemented as a hashagg */
 } RollupData;
@@ -1847,7 +1848,7 @@ typedef struct SetOpPath
 	List	   *distinctList;	/* SortGroupClauses identifying target cols */
 	AttrNumber	flagColIdx;		/* where is the flag column, if any */
 	int			firstFlag;		/* flag value for first input relation */
-	Cardinality	numGroups;		/* estimated number of groups in input */
+	Cardinality numGroups;		/* estimated number of groups in input */
 } SetOpPath;
 
 /*
@@ -1860,7 +1861,7 @@ typedef struct RecursiveUnionPath
 	Path	   *rightpath;
 	List	   *distinctList;	/* SortGroupClauses identifying target cols */
 	int			wtParam;		/* ID of Param representing work table */
-	Cardinality	numGroups;		/* estimated number of groups in input */
+	Cardinality numGroups;		/* estimated number of groups in input */
 } RecursiveUnionPath;
 
 /*
@@ -2616,7 +2617,7 @@ typedef struct
 typedef struct
 {
 	bool		limit_needed;
-	Cardinality	limit_tuples;
+	Cardinality limit_tuples;
 	int64		count_est;
 	int64		offset_est;
 } FinalPathExtraData;
@@ -2647,15 +2648,15 @@ typedef struct JoinCostWorkspace
 	Cost		inner_rescan_run_cost;
 
 	/* private for cost_mergejoin code */
-	Cardinality	outer_rows;
-	Cardinality	inner_rows;
-	Cardinality	outer_skip_rows;
-	Cardinality	inner_skip_rows;
+	Cardinality outer_rows;
+	Cardinality inner_rows;
+	Cardinality outer_skip_rows;
+	Cardinality inner_skip_rows;
 
 	/* private for cost_hashjoin code */
 	int			numbuckets;
 	int			numbatches;
-	Cardinality	inner_rows_total;
+	Cardinality inner_rows_total;
 } JoinCostWorkspace;
 
 /*
diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out
index bb5b7c47a4..562bad6544 100644
--- a/src/test/regress/expected/partition_join.out
+++ b/src/test/regress/expected/partition_join.out
@@ -4866,14 +4866,42 @@ SELECT t1.*, t2.* FROM alpha t1 INNER JOIN beta t2 ON (t1.a = t2.a AND t1.b = t2
 CREATE TABLE fract_t (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
 CREATE TABLE fract_t0 PARTITION OF fract_t FOR VALUES FROM ('0') TO ('1000');
 CREATE TABLE fract_t1 PARTITION OF fract_t FOR VALUES FROM ('1000') TO ('2000');
+CREATE TABLE fract_x (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
+CREATE TABLE fract_x0 PARTITION OF fract_x FOR VALUES FROM ('0') TO ('1000');
+CREATE TABLE fract_x1 PARTITION OF fract_x FOR VALUES FROM ('1000') TO ('2000');
 -- insert data
 INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
 ANALYZE fract_t;
--- verify plan; nested index only scans
+INSERT INTO fract_x (id) (SELECT generate_series(0, 1999));
+ANALYZE fract_x;
+SET max_parallel_workers_per_gather = 0;
+SET enable_partitionwise_join = on;
+-- verify partition pruning
 SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+                           QUERY PLAN                            
+-----------------------------------------------------------------
+ Limit
+   ->  Append
+         ->  Index Only Scan using fract_x0_pkey on fract_x0 x_1
+         ->  Index Only Scan using fract_x1_pkey on fract_x1 x_2
+(4 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT x.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+                           QUERY PLAN                            
+-----------------------------------------------------------------
+ Limit
+   ->  Append
+         ->  Index Only Scan using fract_x0_pkey on fract_x0 x_1
+         ->  Index Only Scan using fract_x1_pkey on fract_x1 x_2
+(4 rows)
+
+-- verify plan; nested index only scans
+EXPLAIN (COSTS OFF)
+SELECT x.id, y.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
                               QUERY PLAN                               
 -----------------------------------------------------------------------
  Limit
@@ -4881,32 +4909,33 @@ SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
          Sort Key: x.id
          ->  Merge Left Join
                Merge Cond: (x_1.id = y_1.id)
-               ->  Index Only Scan using fract_t0_pkey on fract_t0 x_1
+               ->  Index Only Scan using fract_x0_pkey on fract_x0 x_1
                ->  Index Only Scan using fract_t0_pkey on fract_t0 y_1
          ->  Merge Left Join
                Merge Cond: (x_2.id = y_2.id)
-               ->  Index Only Scan using fract_t1_pkey on fract_t1 x_2
+               ->  Index Only Scan using fract_x1_pkey on fract_x1 x_2
                ->  Index Only Scan using fract_t1_pkey on fract_t1 y_2
 (11 rows)
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
                                    QUERY PLAN                                   
 --------------------------------------------------------------------------------
  Limit
    ->  Merge Append
          Sort Key: x.id DESC
          ->  Nested Loop Left Join
-               ->  Index Only Scan Backward using fract_t0_pkey on fract_t0 x_1
+               ->  Index Only Scan Backward using fract_x0_pkey on fract_x0 x_1
                ->  Index Only Scan using fract_t0_pkey on fract_t0 y_1
                      Index Cond: (id = x_1.id)
          ->  Nested Loop Left Join
-               ->  Index Only Scan Backward using fract_t1_pkey on fract_t1 x_2
+               ->  Index Only Scan Backward using fract_x1_pkey on fract_x1 x_2
                ->  Index Only Scan using fract_t1_pkey on fract_t1 y_2
                      Index Cond: (id = x_2.id)
 (11 rows)
 
 -- cleanup
 DROP TABLE fract_t;
+DROP TABLE fract_x;
 RESET max_parallel_workers_per_gather;
 RESET enable_partitionwise_join;
diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql
index 67f506361f..39e23911d0 100644
--- a/src/test/regress/sql/partition_join.sql
+++ b/src/test/regress/sql/partition_join.sql
@@ -1148,22 +1148,38 @@ CREATE TABLE fract_t (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
 CREATE TABLE fract_t0 PARTITION OF fract_t FOR VALUES FROM ('0') TO ('1000');
 CREATE TABLE fract_t1 PARTITION OF fract_t FOR VALUES FROM ('1000') TO ('2000');
 
+CREATE TABLE fract_x (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
+CREATE TABLE fract_x0 PARTITION OF fract_x FOR VALUES FROM ('0') TO ('1000');
+CREATE TABLE fract_x1 PARTITION OF fract_x FOR VALUES FROM ('1000') TO ('2000');
 -- insert data
 INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
 ANALYZE fract_t;
+INSERT INTO fract_x (id) (SELECT generate_series(0, 1999));
+ANALYZE fract_x;
 
--- verify plan; nested index only scans
 SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 
+-- verify partition pruning
+SET max_parallel_workers_per_gather = 0;
+SET enable_partitionwise_join = on;
+
+EXPLAIN (COSTS OFF)
+SELECT x.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+
+EXPLAIN (COSTS OFF)
+SELECT x.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+
+-- verify plan; nested index only scans
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id, y.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
 
 -- cleanup
 DROP TABLE fract_t;
+DROP TABLE fract_x;
 
 RESET max_parallel_workers_per_gather;
 RESET enable_partitionwise_join;

#17

A.Roland@index.de

over 3 years ago

In reply to: Arne Roland (#16)

1 attachment(s)

Re: missing indexes in indexlist with partitioned tables

Hi!

Attached a rebased version of the patch.

Regards
Arne

________________________________
From: Arne Roland
Sent: Monday, January 31, 2022 19:14
To: Amit Langote
Cc: Zhihong Yu; Alvaro Herrera; Julien Rouhaud; pgsql-hackers
Subject: Re: missing indexes in indexlist with partitioned tables

Hi!

From: Amit Langote <amitlangote09@gmail.com>
Sent: Tuesday, January 25, 2022 09:04
Subject: Re: missing indexes in indexlist with partitioned tables

[...]
"partindexlist" really made me think about a list of "partial indexes"
for some reason. I think maybe "partedindexlist" is what you are
looking for; "parted" is commonly used as short for "partitioned" when
naming variables.

The comment only mentions "further pruning" as to what partitioned
indexes are to be remembered in RelOptInfo, but it's not clear what
that means. It may help to be more specific.

Thanks for the feedback! I've changed that. The current version is attached.

Finally, I don't understand why we need a separate field to store
indexes found in partitioned base relations. AFAICS, nothing but the
sites you are interested in (relation_has_unique_index_for() and
rel_supports_distinctness()) would ever bother to look at a
partitioned base relation's indexlist. Do you think putting them into
in indexlist might break something?

Side note: I personally think the name inhparent is mildly confusing, since it's not really about inheritance. I don't have a significantly better idea though.

Partitioned tables are "inheritance parent", so share the same code as
what traditional inheritance parents have always used for planning.

Regards
Arne

Attachments:

0006-partIndexlistClean.patchtext/x-patch; name=0006-partIndexlistClean.patchDownload

diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 7d176e7b00..31cdc3a7f7 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -3502,7 +3502,7 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
 	Assert(list_length(exprlist) == list_length(oprlist));
 
 	/* Short-circuit if no indexes... */
-	if (rel->indexlist == NIL)
+	if (rel->indexlist == NIL && rel->partedIndexlist == NIL)
 		return false;
 
 	/*
@@ -3547,7 +3547,7 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel,
 		return false;
 
 	/* Examine each index of the relation ... */
-	foreach(ic, rel->indexlist)
+	foreach(ic, list_concat(rel->indexlist, rel->partedIndexlist))
 	{
 		IndexOptInfo *ind = (IndexOptInfo *) lfirst(ic);
 		int			c;
diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c
index 337f470d58..16ce443ec9 100644
--- a/src/backend/optimizer/plan/analyzejoins.c
+++ b/src/backend/optimizer/plan/analyzejoins.c
@@ -23,6 +23,7 @@
 #include "postgres.h"
 
 #include "nodes/nodeFuncs.h"
+#include "nodes/nodes.h"
 #include "optimizer/clauses.h"
 #include "optimizer/joininfo.h"
 #include "optimizer/optimizer.h"
@@ -598,7 +599,7 @@ rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel)
 		 */
 		ListCell   *lc;
 
-		foreach(lc, rel->indexlist)
+		foreach(lc, list_concat(rel->indexlist, rel->partedIndexlist))
 		{
 			IndexOptInfo *ind = (IndexOptInfo *) lfirst(lc);
 
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 5012bfe142..7cf85875d5 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -116,8 +116,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 {
 	Index		varno = rel->relid;
 	Relation	relation;
-	bool		hasindex;
 	List	   *indexinfos = NIL;
+	List	   *partedIndexinfos = NIL;
 
 	/*
 	 * We need not lock the relation since it was already locked, either by
@@ -154,17 +154,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	/* Retrieve the parallel_workers reloption, or -1 if not set. */
 	rel->rel_parallel_workers = RelationGetParallelWorkers(relation, -1);
 
-	/*
-	 * Make list of indexes.  Ignore indexes on system catalogs if told to.
-	 * Don't bother with indexes for an inheritance parent, either.
-	 */
-	if (inhparent ||
-		(IgnoreSystemIndexes && IsSystemRelation(relation)))
-		hasindex = false;
-	else
-		hasindex = relation->rd_rel->relhasindex;
-
-	if (hasindex)
+	/* Make list of indexes. Ignore indexes on system catalogs if told to. */
+	if (!(IgnoreSystemIndexes && IsSystemRelation(relation)) && relation->rd_rel->relhasindex)
 	{
 		List	   *indexoidlist;
 		LOCKMODE	lmode;
@@ -213,10 +204,13 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			}
 
 			/*
-			 * Ignore partitioned indexes, since they are not usable for
-			 * queries.
+			 * Don't add partitioned indexes to the indexlist, since they are
+			 * not usable by the executor. If they are unique add them to the
+			 * partedIndexlist instead, to use for further pruning. That is
+			 * relevant for the join pruning, if the outer relation is partitioned.
+			 * If they aren't that either, simply skip them.
 			 */
-			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+			if (inhparent && (!index->indisunique || indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX))
 			{
 				index_close(indexRelation, NoLock);
 				continue;
@@ -264,7 +258,40 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				info->indexcollations[i] = indexRelation->rd_indcollation[i];
 			}
 
-			info->relam = indexRelation->rd_rel->relam;
+			/*
+			 * Fetch the index expressions and predicate, if any.  We must
+			 * modify the copies we obtain from the relcache to have the
+			 * correct varno for the parent relation, so that they match up
+			 * correctly against qual clauses.
+			 */
+			info->indexprs = RelationGetIndexExpressions(indexRelation);
+			info->indpred = RelationGetIndexPredicate(indexRelation);
+			if (info->indexprs && varno != 1)
+				ChangeVarNodes((Node *) info->indexprs, 1, varno, 0);
+			if (info->indpred && varno != 1)
+				ChangeVarNodes((Node *) info->indpred, 1, varno, 0);
+
+			info->unique = index->indisunique;
+			info->immediate = index->indimmediate;
+
+			/*
+			 * Don't add partitioned indexes to the indexlist, add them to the
+			 * partedIndexlist instead, since they are not usable by the
+			 * executor.
+			 */
+			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
+			{
+				index_close(indexRelation, NoLock);
+				partedIndexinfos = lappend(partedIndexinfos, info);
+				continue;
+			}
+
+			info->hypothetical = false;
+			info->indrestrictinfo = NIL;	/* set later, in indxpath.c */
+			info->predOK = false;	/* set later, in indxpath.c */
+
+			/* Build targetlist using the completed indexprs data */
+			info->indextlist = build_index_tlist(root, info, relation);
 
 			/* We copy just the fields we need, not all of rd_indam */
 			amroutine = indexRelation->rd_indam;
@@ -284,6 +311,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			/* Fetch index opclass options */
 			info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
 
+			info->relam = indexRelation->rd_rel->relam;
+
 			/*
 			 * Fetch the ordering information for the index, if any.
 			 */
@@ -370,28 +399,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				info->nulls_first = NULL;
 			}
 
-			/*
-			 * Fetch the index expressions and predicate, if any.  We must
-			 * modify the copies we obtain from the relcache to have the
-			 * correct varno for the parent relation, so that they match up
-			 * correctly against qual clauses.
-			 */
-			info->indexprs = RelationGetIndexExpressions(indexRelation);
-			info->indpred = RelationGetIndexPredicate(indexRelation);
-			if (info->indexprs && varno != 1)
-				ChangeVarNodes((Node *) info->indexprs, 1, varno, 0);
-			if (info->indpred && varno != 1)
-				ChangeVarNodes((Node *) info->indpred, 1, varno, 0);
-
-			/* Build targetlist using the completed indexprs data */
-			info->indextlist = build_index_tlist(root, info, relation);
-
-			info->indrestrictinfo = NIL;	/* set later, in indxpath.c */
-			info->predOK = false;	/* set later, in indxpath.c */
-			info->unique = index->indisunique;
-			info->immediate = index->indimmediate;
-			info->hypothetical = false;
-
 			/*
 			 * Estimate the index size.  If it's not a partial index, we lock
 			 * the number-of-tuples estimate to equal the parent table; if it
@@ -441,6 +448,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 	}
 
 	rel->indexlist = indexinfos;
+	rel->partedIndexlist = partedIndexinfos;
 
 	rel->statlist = get_relation_statistics(rel, relation);
 
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 3b065139e6..b23bbe2097 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -871,6 +871,8 @@ typedef struct RelOptInfo
 	Relids		lateral_referencers;
 	/* list of IndexOptInfo */
 	List	   *indexlist;
+	/* list of IndexOptInfo for partioned indexes */
+	List	   *partedIndexlist;
 	/* list of StatisticExtInfo */
 	List	   *statlist;
 	/* size estimates derived from pg_class */
diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out
index 03926a8413..522cf3ec94 100644
--- a/src/test/regress/expected/partition_join.out
+++ b/src/test/regress/expected/partition_join.out
@@ -4855,14 +4855,42 @@ SELECT t1.*, t2.* FROM alpha t1 INNER JOIN beta t2 ON (t1.a = t2.a AND t1.b = t2
 CREATE TABLE fract_t (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
 CREATE TABLE fract_t0 PARTITION OF fract_t FOR VALUES FROM ('0') TO ('1000');
 CREATE TABLE fract_t1 PARTITION OF fract_t FOR VALUES FROM ('1000') TO ('2000');
+CREATE TABLE fract_x (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
+CREATE TABLE fract_x0 PARTITION OF fract_x FOR VALUES FROM ('0') TO ('1000');
+CREATE TABLE fract_x1 PARTITION OF fract_x FOR VALUES FROM ('1000') TO ('2000');
 -- insert data
 INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
 ANALYZE fract_t;
--- verify plan; nested index only scans
+INSERT INTO fract_x (id) (SELECT generate_series(0, 1999));
+ANALYZE fract_x;
+SET max_parallel_workers_per_gather = 0;
+SET enable_partitionwise_join = on;
+-- verify partition pruning
 SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+                           QUERY PLAN                            
+-----------------------------------------------------------------
+ Limit
+   ->  Append
+         ->  Index Only Scan using fract_x0_pkey on fract_x0 x_1
+         ->  Index Only Scan using fract_x1_pkey on fract_x1 x_2
+(4 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT x.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+                           QUERY PLAN                            
+-----------------------------------------------------------------
+ Limit
+   ->  Append
+         ->  Index Only Scan using fract_x0_pkey on fract_x0 x_1
+         ->  Index Only Scan using fract_x1_pkey on fract_x1 x_2
+(4 rows)
+
+-- verify plan; nested index only scans
+EXPLAIN (COSTS OFF)
+SELECT x.id, y.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
                               QUERY PLAN                               
 -----------------------------------------------------------------------
  Limit
@@ -4870,32 +4898,33 @@ SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
          Sort Key: x.id
          ->  Merge Left Join
                Merge Cond: (x_1.id = y_1.id)
-               ->  Index Only Scan using fract_t0_pkey on fract_t0 x_1
+               ->  Index Only Scan using fract_x0_pkey on fract_x0 x_1
                ->  Index Only Scan using fract_t0_pkey on fract_t0 y_1
          ->  Merge Left Join
                Merge Cond: (x_2.id = y_2.id)
-               ->  Index Only Scan using fract_t1_pkey on fract_t1 x_2
+               ->  Index Only Scan using fract_x1_pkey on fract_x1 x_2
                ->  Index Only Scan using fract_t1_pkey on fract_t1 y_2
 (11 rows)
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
                                    QUERY PLAN                                   
 --------------------------------------------------------------------------------
  Limit
    ->  Merge Append
          Sort Key: x.id DESC
          ->  Nested Loop Left Join
-               ->  Index Only Scan Backward using fract_t0_pkey on fract_t0 x_1
+               ->  Index Only Scan Backward using fract_x0_pkey on fract_x0 x_1
                ->  Index Only Scan using fract_t0_pkey on fract_t0 y_1
                      Index Cond: (id = x_1.id)
          ->  Nested Loop Left Join
-               ->  Index Only Scan Backward using fract_t1_pkey on fract_t1 x_2
+               ->  Index Only Scan Backward using fract_x1_pkey on fract_x1 x_2
                ->  Index Only Scan using fract_t1_pkey on fract_t1 y_2
                      Index Cond: (id = x_2.id)
 (11 rows)
 
 -- cleanup
 DROP TABLE fract_t;
+DROP TABLE fract_x;
 RESET max_parallel_workers_per_gather;
 RESET enable_partitionwise_join;
diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql
index 67f506361f..39e23911d0 100644
--- a/src/test/regress/sql/partition_join.sql
+++ b/src/test/regress/sql/partition_join.sql
@@ -1148,22 +1148,38 @@ CREATE TABLE fract_t (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
 CREATE TABLE fract_t0 PARTITION OF fract_t FOR VALUES FROM ('0') TO ('1000');
 CREATE TABLE fract_t1 PARTITION OF fract_t FOR VALUES FROM ('1000') TO ('2000');
 
+CREATE TABLE fract_x (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
+CREATE TABLE fract_x0 PARTITION OF fract_x FOR VALUES FROM ('0') TO ('1000');
+CREATE TABLE fract_x1 PARTITION OF fract_x FOR VALUES FROM ('1000') TO ('2000');
 -- insert data
 INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
 ANALYZE fract_t;
+INSERT INTO fract_x (id) (SELECT generate_series(0, 1999));
+ANALYZE fract_x;
 
--- verify plan; nested index only scans
 SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 
+-- verify partition pruning
+SET max_parallel_workers_per_gather = 0;
+SET enable_partitionwise_join = on;
+
+EXPLAIN (COSTS OFF)
+SELECT x.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+
+EXPLAIN (COSTS OFF)
+SELECT x.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+
+-- verify plan; nested index only scans
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id, y.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_x x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
 
 -- cleanup
 DROP TABLE fract_t;
+DROP TABLE fract_x;
 
 RESET max_parallel_workers_per_gather;
 RESET enable_partitionwise_join;

#18

dgrowleyml@gmail.com

over 3 years ago

In reply to: Arne Roland (#17)

1 attachment(s)

Re: missing indexes in indexlist with partitioned tables

On Wed, 3 Aug 2022 at 11:07, Arne Roland <A.Roland@index.de> wrote:

Attached a rebased version of the patch.

Firstly, I agree that we should fix the issue of join removals not
working with partitioned tables.

I had a quick look over this and the first thing that I thought was
the same as what Amit mentioned in:

On Tue, 25 Jan 2022 at 21:04, Amit Langote <amitlangote09@gmail.com> wrote:

Finally, I don't understand why we need a separate field to store
indexes found in partitioned base relations. AFAICS, nothing but the
sites you are interested in (relation_has_unique_index_for() and
rel_supports_distinctness()) would ever bother to look at a
partitioned base relation's indexlist. Do you think putting them into
in indexlist might break something?

I kinda disagree with Alvaro's fix in 05fb5d661. I think indexlist is
the place to store these details. That commit added the following
comment:

/*
* Ignore partitioned indexes, since they are not usable for
* queries.
*/

But neither are hypothetical indexes either, yet they're added to
RelOptInfo.indexlist.

I think the patch should be changed so that the existing list is used
and we find another fix for the problems Alvaro fixed in 05fb5d661.
Unfortunately, there was no discussion marked on that commit message,
so it's not quite clear what the problem was. I'm unsure if there was
anything other than CLUSTER that was broken. I see that cfdd03f45
added CLUSTER for partitioned tables in v15. I think the patch would
need to go over the usages of RelOptInfo.indexlist to make sure that
we don't need to add any further conditions to skip their usage for
partitioned tables.

I wrote the attached patch as I wanted to see what would break if we
did this. The only problem I got from running make check was in
get_actual_variable_range(), so I just changed that so it returns
false when the given rel is a partitioned table. I only quickly did
the changes to get_relation_info() and didn't give much thought to
what the can* bool flags should be set to. I just mostly skipped all
that code because it was crashing on
relation->rd_tableam->scan_bitmap_next_block due to the rd_tableam
being NULL.

Also, just a friendly tip, Arne; I saw you named your patch 0006 for
version 6. You'll see many 000n patches around the list, but those
are generally done with git format-patch. That number normally means
the patch in the patch series, not the version of the patch. I'm not
sure if it'll help any, but my workflow for writing new patches
against master tends to be:

git checkout master
git checkout -b some_feature_branch
# write some code
git commit -a
# maybe more code
git commit -a
git format-patch -v1 master

That'll create v1-0001 and v1-0002 patches. When I'm onto v2, I just
change the version number from -v1 to -v2.

David

Attachments:

v7_include_partitioned_indexes_in_indexlist.patchtext/plain; charset=US-ASCII; name=v7_include_partitioned_indexes_in_indexlist.patchDownload

diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 5012bfe142..a2b7667fc7 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -156,10 +156,10 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 
 	/*
 	 * Make list of indexes.  Ignore indexes on system catalogs if told to.
-	 * Don't bother with indexes for an inheritance parent, either.
+	 * Don't bother with indexes from traditional inheritance parents, either.
 	 */
-	if (inhparent ||
-		(IgnoreSystemIndexes && IsSystemRelation(relation)))
+	if ((inhparent && relation->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+		|| (IgnoreSystemIndexes && IsSystemRelation(relation)))
 		hasindex = false;
 	else
 		hasindex = relation->rd_rel->relhasindex;
@@ -212,16 +212,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				continue;
 			}
 
-			/*
-			 * Ignore partitioned indexes, since they are not usable for
-			 * queries.
-			 */
-			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
-			{
-				index_close(indexRelation, NoLock);
-				continue;
-			}
-
 			/*
 			 * If the index is valid, but cannot yet be used, ignore it; but
 			 * mark the plan we are generating as transient. See
@@ -266,108 +256,127 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 
 			info->relam = indexRelation->rd_rel->relam;
 
-			/* We copy just the fields we need, not all of rd_indam */
-			amroutine = indexRelation->rd_indam;
-			info->amcanorderbyop = amroutine->amcanorderbyop;
-			info->amoptionalkey = amroutine->amoptionalkey;
-			info->amsearcharray = amroutine->amsearcharray;
-			info->amsearchnulls = amroutine->amsearchnulls;
-			info->amcanparallel = amroutine->amcanparallel;
-			info->amhasgettuple = (amroutine->amgettuple != NULL);
-			info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
-				relation->rd_tableam->scan_bitmap_next_block != NULL;
-			info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
-								  amroutine->amrestrpos != NULL);
-			info->amcostestimate = amroutine->amcostestimate;
-			Assert(info->amcostestimate != NULL);
-
-			/* Fetch index opclass options */
-			info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
-
-			/*
-			 * Fetch the ordering information for the index, if any.
-			 */
-			if (info->relam == BTREE_AM_OID)
+			if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
 			{
+				/* We copy just the fields we need, not all of rd_indam */
+				amroutine = indexRelation->rd_indam;
+				info->amcanorderbyop = amroutine->amcanorderbyop;
+				info->amoptionalkey = amroutine->amoptionalkey;
+				info->amsearcharray = amroutine->amsearcharray;
+				info->amsearchnulls = amroutine->amsearchnulls;
+				info->amcanparallel = amroutine->amcanparallel;
+				info->amhasgettuple = (amroutine->amgettuple != NULL);
+				info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
+					relation->rd_tableam->scan_bitmap_next_block != NULL;
+				info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
+					amroutine->amrestrpos != NULL);
+				info->amcostestimate = amroutine->amcostestimate;
+				Assert(info->amcostestimate != NULL);
+
+				/* Fetch index opclass options */
+				info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
+
 				/*
-				 * If it's a btree index, we can use its opfamily OIDs
-				 * directly as the sort ordering opfamily OIDs.
+				 * Fetch the ordering information for the index, if any.
 				 */
-				Assert(amroutine->amcanorder);
-
-				info->sortopfamily = info->opfamily;
-				info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
-				info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
-
-				for (i = 0; i < nkeycolumns; i++)
+				if (info->relam == BTREE_AM_OID)
 				{
-					int16		opt = indexRelation->rd_indoption[i];
+					/*
+					 * If it's a btree index, we can use its opfamily OIDs
+					 * directly as the sort ordering opfamily OIDs.
+					 */
+					Assert(amroutine->amcanorder);
 
-					info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
-					info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
-				}
-			}
-			else if (amroutine->amcanorder)
-			{
-				/*
-				 * Otherwise, identify the corresponding btree opfamilies by
-				 * trying to map this index's "<" operators into btree.  Since
-				 * "<" uniquely defines the behavior of a sort order, this is
-				 * a sufficient test.
-				 *
-				 * XXX This method is rather slow and also requires the
-				 * undesirable assumption that the other index AM numbers its
-				 * strategies the same as btree.  It'd be better to have a way
-				 * to explicitly declare the corresponding btree opfamily for
-				 * each opfamily of the other index type.  But given the lack
-				 * of current or foreseeable amcanorder index types, it's not
-				 * worth expending more effort on now.
-				 */
-				info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns);
-				info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
-				info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
+					info->sortopfamily = info->opfamily;
+					info->reverse_sort = (bool *)palloc(sizeof(bool) * nkeycolumns);
+					info->nulls_first = (bool *)palloc(sizeof(bool) * nkeycolumns);
 
-				for (i = 0; i < nkeycolumns; i++)
-				{
-					int16		opt = indexRelation->rd_indoption[i];
-					Oid			ltopr;
-					Oid			btopfamily;
-					Oid			btopcintype;
-					int16		btstrategy;
-
-					info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
-					info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
-
-					ltopr = get_opfamily_member(info->opfamily[i],
-												info->opcintype[i],
-												info->opcintype[i],
-												BTLessStrategyNumber);
-					if (OidIsValid(ltopr) &&
-						get_ordering_op_properties(ltopr,
-												   &btopfamily,
-												   &btopcintype,
-												   &btstrategy) &&
-						btopcintype == info->opcintype[i] &&
-						btstrategy == BTLessStrategyNumber)
+					for (i = 0; i < nkeycolumns; i++)
 					{
-						/* Successful mapping */
-						info->sortopfamily[i] = btopfamily;
+						int16		opt = indexRelation->rd_indoption[i];
+
+						info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
+						info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
 					}
-					else
+				}
+				else if (amroutine->amcanorder)
+				{
+					/*
+					 * Otherwise, identify the corresponding btree opfamilies by
+					 * trying to map this index's "<" operators into btree.  Since
+					 * "<" uniquely defines the behavior of a sort order, this is
+					 * a sufficient test.
+					 *
+					 * XXX This method is rather slow and also requires the
+					 * undesirable assumption that the other index AM numbers its
+					 * strategies the same as btree.  It'd be better to have a way
+					 * to explicitly declare the corresponding btree opfamily for
+					 * each opfamily of the other index type.  But given the lack
+					 * of current or foreseeable amcanorder index types, it's not
+					 * worth expending more effort on now.
+					 */
+					info->sortopfamily = (Oid *)palloc(sizeof(Oid) * nkeycolumns);
+					info->reverse_sort = (bool *)palloc(sizeof(bool) * nkeycolumns);
+					info->nulls_first = (bool *)palloc(sizeof(bool) * nkeycolumns);
+
+					for (i = 0; i < nkeycolumns; i++)
 					{
-						/* Fail ... quietly treat index as unordered */
-						info->sortopfamily = NULL;
-						info->reverse_sort = NULL;
-						info->nulls_first = NULL;
-						break;
+						int16		opt = indexRelation->rd_indoption[i];
+						Oid			ltopr;
+						Oid			btopfamily;
+						Oid			btopcintype;
+						int16		btstrategy;
+
+						info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
+						info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
+
+						ltopr = get_opfamily_member(info->opfamily[i],
+							info->opcintype[i],
+							info->opcintype[i],
+							BTLessStrategyNumber);
+						if (OidIsValid(ltopr) &&
+							get_ordering_op_properties(ltopr,
+								&btopfamily,
+								&btopcintype,
+								&btstrategy) &&
+							btopcintype == info->opcintype[i] &&
+							btstrategy == BTLessStrategyNumber)
+						{
+							/* Successful mapping */
+							info->sortopfamily[i] = btopfamily;
+						}
+						else
+						{
+							/* Fail ... quietly treat index as unordered */
+							info->sortopfamily = NULL;
+							info->reverse_sort = NULL;
+							info->nulls_first = NULL;
+							break;
+						}
 					}
 				}
+				else
+				{
+					info->sortopfamily = NULL;
+					info->reverse_sort = NULL;
+					info->nulls_first = NULL;
+				}
 			}
 			else
 			{
 				info->sortopfamily = NULL;
 				info->reverse_sort = NULL;
 				info->nulls_first = NULL;
+
+				info->amcanorderbyop = false;
+				info->amoptionalkey = false;
+				info->amsearcharray = false;
+				info->amsearchnulls = false;
+				info->amcanparallel = false;
+				info->amhasgettuple = false;
+				info->amhasgetbitmap = false;
+				info->amcanmarkpos = false;
+				info->amcostestimate = NULL;
 			}
 
 			/*
@@ -414,7 +423,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 					info->tuples = rel->tuples;
 			}
 
-			if (info->relam == BTREE_AM_OID)
+			if (info->relam == BTREE_AM_OID &&
+				indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
 			{
 				/* For btrees, get tree height while we have the index open */
 				info->tree_height = _bt_getrootheight(indexRelation);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e898ffad7b..100b76c0da 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -2971,9 +2971,11 @@ RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
 		return smgrnblocks(RelationGetSmgr(relation), forkNum);
 	}
 	else
-		Assert(false);
+	{
+		Assert(relation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX);
+	}
 
-	return 0;					/* keep compiler quiet */
+	return 0;
 }
 
 /*
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index c746759eef..7d721755a2 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -5995,6 +5995,10 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
 	rte = root->simple_rte_array[rel->relid];
 	Assert(rte->rtekind == RTE_RELATION);
 
+	/* ignore partitioned tables.  Any indexes here are not real indexes */
+	if (rte->relkind == RELKIND_PARTITIONED_TABLE)
+		return false;
+
 	/* Search through the indexes to see if any match our problem */
 	foreach(lc, rel->indexlist)
 	{
diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out
index 03926a8413..c854228482 100644
--- a/src/test/regress/expected/partition_join.out
+++ b/src/test/regress/expected/partition_join.out
@@ -4852,46 +4852,46 @@ SELECT t1.*, t2.* FROM alpha t1 INNER JOIN beta t2 ON (t1.a = t2.a AND t1.b = t2
 (8 rows)
 
 -- partitionwise join with fractional paths
-CREATE TABLE fract_t (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
+CREATE TABLE fract_t (id BIGINT, c INT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
 CREATE TABLE fract_t0 PARTITION OF fract_t FOR VALUES FROM ('0') TO ('1000');
 CREATE TABLE fract_t1 PARTITION OF fract_t FOR VALUES FROM ('1000') TO ('2000');
 -- insert data
-INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
+INSERT INTO fract_t (id,c) SELECT i,i FROM generate_series(0, 1999) i;
 ANALYZE fract_t;
--- verify plan; nested index only scans
+-- verify plan; nested index scans
 SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 EXPLAIN (COSTS OFF)
 SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
-                              QUERY PLAN                               
------------------------------------------------------------------------
+                            QUERY PLAN                            
+------------------------------------------------------------------
  Limit
    ->  Merge Append
          Sort Key: x.id
          ->  Merge Left Join
                Merge Cond: (x_1.id = y_1.id)
-               ->  Index Only Scan using fract_t0_pkey on fract_t0 x_1
-               ->  Index Only Scan using fract_t0_pkey on fract_t0 y_1
+               ->  Index Scan using fract_t0_pkey on fract_t0 x_1
+               ->  Index Scan using fract_t0_pkey on fract_t0 y_1
          ->  Merge Left Join
                Merge Cond: (x_2.id = y_2.id)
-               ->  Index Only Scan using fract_t1_pkey on fract_t1 x_2
-               ->  Index Only Scan using fract_t1_pkey on fract_t1 y_2
+               ->  Index Scan using fract_t1_pkey on fract_t1 x_2
+               ->  Index Scan using fract_t1_pkey on fract_t1 y_2
 (11 rows)
 
 EXPLAIN (COSTS OFF)
 SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
-                                   QUERY PLAN                                   
---------------------------------------------------------------------------------
+                                QUERY PLAN                                 
+---------------------------------------------------------------------------
  Limit
    ->  Merge Append
          Sort Key: x.id DESC
          ->  Nested Loop Left Join
-               ->  Index Only Scan Backward using fract_t0_pkey on fract_t0 x_1
-               ->  Index Only Scan using fract_t0_pkey on fract_t0 y_1
+               ->  Index Scan Backward using fract_t0_pkey on fract_t0 x_1
+               ->  Index Scan using fract_t0_pkey on fract_t0 y_1
                      Index Cond: (id = x_1.id)
          ->  Nested Loop Left Join
-               ->  Index Only Scan Backward using fract_t1_pkey on fract_t1 x_2
-               ->  Index Only Scan using fract_t1_pkey on fract_t1 y_2
+               ->  Index Scan Backward using fract_t1_pkey on fract_t1 x_2
+               ->  Index Scan using fract_t1_pkey on fract_t1 y_2
                      Index Cond: (id = x_2.id)
 (11 rows)
 
diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql
index 67f506361f..009184d348 100644
--- a/src/test/regress/sql/partition_join.sql
+++ b/src/test/regress/sql/partition_join.sql
@@ -1144,15 +1144,15 @@ SELECT t1.*, t2.* FROM alpha t1 INNER JOIN beta t2 ON (t1.a = t2.a AND t1.b = t2
 SELECT t1.*, t2.* FROM alpha t1 INNER JOIN beta t2 ON (t1.a = t2.a AND t1.b = t2.b AND t1.c = t2.c) WHERE ((t1.b >= 100 AND t1.b < 110) OR (t1.b >= 200 AND t1.b < 210)) AND ((t2.b >= 100 AND t2.b < 110) OR (t2.b >= 200 AND t2.b < 210)) AND t1.c IN ('0004', '0009') ORDER BY t1.a, t1.b;
 
 -- partitionwise join with fractional paths
-CREATE TABLE fract_t (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
+CREATE TABLE fract_t (id BIGINT, c INT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
 CREATE TABLE fract_t0 PARTITION OF fract_t FOR VALUES FROM ('0') TO ('1000');
 CREATE TABLE fract_t1 PARTITION OF fract_t FOR VALUES FROM ('1000') TO ('2000');
 
 -- insert data
-INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
+INSERT INTO fract_t (id,c) SELECT i,i FROM generate_series(0, 1999) i;
 ANALYZE fract_t;
 
--- verify plan; nested index only scans
+-- verify plan; nested index scans
 SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;

#19

alvherre@alvh.no-ip.org

over 3 years ago

In reply to: David Rowley (#18)

Re: missing indexes in indexlist with partitioned tables

On 2022-Sep-16, David Rowley wrote:

I kinda disagree with Alvaro's fix in 05fb5d661. I think indexlist is
the place to store these details. That commit added the following
comment:

/*
* Ignore partitioned indexes, since they are not usable for
* queries.
*/

But neither are hypothetical indexes either, yet they're added to
RelOptInfo.indexlist.

I think the patch should be changed so that the existing list is used
and we find another fix for the problems Alvaro fixed in 05fb5d661.
Unfortunately, there was no discussion marked on that commit message,
so it's not quite clear what the problem was. I'm unsure if there was
anything other than CLUSTER that was broken.

After a bit of trawling through the archives, I found it here:
/messages/by-id/20180124162006.pmapfiznhgngwtjf@alvherre.pgsql
I think there was insufficient discussion and you're probably right that
it wasn't the best fix. I don't object to finding another fix.

--
Álvaro Herrera 48°01'N 7°57'E — https://www.EnterpriseDB.com/

#20

Tom Lane

tgl@sss.pgh.pa.us

over 3 years ago

In reply to: Alvaro Herrera (#19)

Re: missing indexes in indexlist with partitioned tables

Alvaro Herrera <alvherre@alvh.no-ip.org> writes:

On 2022-Sep-16, David Rowley wrote:

I kinda disagree with Alvaro's fix in 05fb5d661. I think indexlist is
the place to store these details. That commit added the following
comment:

/*
* Ignore partitioned indexes, since they are not usable for
* queries.
*/

But neither are hypothetical indexes either, yet they're added to
RelOptInfo.indexlist.

I think the patch should be changed so that the existing list is used
and we find another fix for the problems Alvaro fixed in 05fb5d661.
Unfortunately, there was no discussion marked on that commit message,
so it's not quite clear what the problem was. I'm unsure if there was
anything other than CLUSTER that was broken.

After a bit of trawling through the archives, I found it here:
/messages/by-id/20180124162006.pmapfiznhgngwtjf@alvherre.pgsql
I think there was insufficient discussion and you're probably right that
it wasn't the best fix. I don't object to finding another fix.

FWIW, I don't see any big problem with what you did. We'd need to
do something more like what David suggests if the planner ever has
a reason to consider partitioned indexes. But as long as it does
not, why expend the time to build data structures representing them?
And we'd have to add code in quite a few places to ignore them,
once they're in indexlist.

regards, tom lane

#21

[1]: /messages/by-id/2641568c18de40e8b1528fc9d4d80127@index.de

dgrowleyml@gmail.com

over 3 years ago

In reply to: Tom Lane (#20)

Re: missing indexes in indexlist with partitioned tables

On Sun, 18 Sept 2022 at 07:00, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Alvaro Herrera <alvherre@alvh.no-ip.org> writes:

After a bit of trawling through the archives, I found it here:
/messages/by-id/20180124162006.pmapfiznhgngwtjf@alvherre.pgsql
I think there was insufficient discussion and you're probably right that
it wasn't the best fix. I don't object to finding another fix.

FWIW, I don't see any big problem with what you did. We'd need to
do something more like what David suggests if the planner ever has
a reason to consider partitioned indexes. But as long as it does
not, why expend the time to build data structures representing them?

Did you miss the report about left join removals not working with
partitioned tables due to lack of unique proofs? That seems like a
good enough reason to me.

And we'd have to add code in quite a few places to ignore them,
once they're in indexlist.

I think the same is true for "hypothetical" indexes. Maybe that would
be a good field to grep on to find the places that need to be
addressed.

David

#22

Amit Langote

amitlangote09@gmail.com

over 3 years ago

In reply to: David Rowley (#18)

Re: missing indexes in indexlist with partitioned tables

On Fri, Sep 16, 2022 at 1:08 PM David Rowley <dgrowleyml@gmail.com> wrote:

On Wed, 3 Aug 2022 at 11:07, Arne Roland <A.Roland@index.de> wrote:

Attached a rebased version of the patch.

Firstly, I agree that we should fix the issue of join removals not
working with partitioned tables.

Agreed, though the patch's changes to tests does not seem to have to
do with join removal? I don't really understand what the test changes
are all about. I wonder why the patch doesn't instead add the test
case that Arne showed in the file he attached with [1]/messages/by-id/2641568c18de40e8b1528fc9d4d80127@index.de.

I think the patch should be changed so that the existing list is used
and we find another fix for the problems Alvaro fixed in 05fb5d661.
Unfortunately, there was no discussion marked on that commit message,
so it's not quite clear what the problem was. I'm unsure if there was
anything other than CLUSTER that was broken. I see that cfdd03f45
added CLUSTER for partitioned tables in v15. I think the patch would
need to go over the usages of RelOptInfo.indexlist to make sure that
we don't need to add any further conditions to skip their usage for
partitioned tables.

I wrote the attached patch as I wanted to see what would break if we
did this. The only problem I got from running make check was in
get_actual_variable_range(), so I just changed that so it returns
false when the given rel is a partitioned table. I only quickly did
the changes to get_relation_info() and didn't give much thought to
what the can* bool flags should be set to. I just mostly skipped all
that code because it was crashing on
relation->rd_tableam->scan_bitmap_next_block due to the rd_tableam
being NULL.

Yeah, it makes sense to just skip the portion of the code that reads
from rd_indam, as your patch does.

+           if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
            {
+               /* We copy just the fields we need, not all of rd_indam */
+               amroutine = indexRelation->rd_indam;

Maybe you're intending to add one before committing but there should
be a comment mentioning why the am* initializations are being skipped
over for partitioned index IndexOptInfos. I'd think that's because we
are not going to be building any paths using them for now.

The following portion of the top comment of get_relation_info()
perhaps needs an update.

* If inhparent is true, all we need to do is set up the attr arrays:
* the RelOptInfo actually represents the appendrel formed by an inheritance
* tree, and so the parent rel's physical size and index information isn't
* important for it.
*/

--
Thanks, Amit Langote
EDB: http://www.enterprisedb.com

#23

dgrowleyml@gmail.com

over 3 years ago

In reply to: Amit Langote (#22)

Re: missing indexes in indexlist with partitioned tables

Thank you for having a look at the patch.

On Tue, 20 Sept 2022 at 18:41, Amit Langote <amitlangote09@gmail.com> wrote:

Agreed, though the patch's changes to tests does not seem to have to
do with join removal? I don't really understand what the test changes
are all about. I wonder why the patch doesn't instead add the test
case that Arne showed in the file he attached with [1].
[1] /messages/by-id/2641568c18de40e8b1528fc9d4d80127@index.de

I adjusted a test in partition_join.sql to add an additional column to
the fract_t table. Before the change that table only had a single
column and due to the query's join condition being USING(id), none of
the columns from the left joined table were being used. That resulted
in the updated code performing a left join removal as it was passing
the checks for no columns being used in the left joined table in
analyzejoins.c. The test in partition_join.sql claims to be testing
"partitionwise join with fractional paths", so I thought we'd better
not have a query that the planner removes the join when we're meant to
be testing joins.

It probably wouldn't hurt to have a new test to ensure left join
removals work with a partitioned table. That should go in join.sql
along with the other join removal tests. I didn't study Arne's patch
to see what test he added. I was only interested in writing enough
code so I could check there was no good reason not to add the
partitioned index into RelOptInfo.indexlist. Arne sent me an off-list
message to say he's planning on working on the patch that uses the
existing field instead of the new one he originally added. Let's hold
off for that patch.

David

#24

Amit Langote

amitlangote09@gmail.com

over 3 years ago

In reply to: David Rowley (#23)

Re: missing indexes in indexlist with partitioned tables

On Tue, Sep 20, 2022 at 4:53 PM David Rowley <dgrowleyml@gmail.com> wrote:

Thank you for having a look at the patch.

On Tue, 20 Sept 2022 at 18:41, Amit Langote <amitlangote09@gmail.com> wrote:

Agreed, though the patch's changes to tests does not seem to have to
do with join removal? I don't really understand what the test changes
are all about. I wonder why the patch doesn't instead add the test
case that Arne showed in the file he attached with [1].
[1] /messages/by-id/2641568c18de40e8b1528fc9d4d80127@index.de

I adjusted a test in partition_join.sql to add an additional column to
the fract_t table. Before the change that table only had a single
column and due to the query's join condition being USING(id), none of
the columns from the left joined table were being used. That resulted
in the updated code performing a left join removal as it was passing
the checks for no columns being used in the left joined table in
analyzejoins.c. The test in partition_join.sql claims to be testing
"partitionwise join with fractional paths", so I thought we'd better
not have a query that the planner removes the join when we're meant to
be testing joins.

Ah, got it, thanks for the explanation.

It probably wouldn't hurt to have a new test to ensure left join
removals work with a partitioned table. That should go in join.sql
along with the other join removal tests.

Makes sense.

I didn't study Arne's patch
to see what test he added. I was only interested in writing enough
code so I could check there was no good reason not to add the
partitioned index into RelOptInfo.indexlist. Arne sent me an off-list
message to say he's planning on working on the patch that uses the
existing field instead of the new one he originally added. Let's hold
off for that patch.

Ok, sure.

--
Thanks, Amit Langote
EDB: http://www.enterprisedb.com

#25

A.Roland@index.de

over 3 years ago

In reply to: Amit Langote (#24)

1 attachment(s)

Re: missing indexes in indexlist with partitioned tables

On Tue, Sep 20, 2022 at 4:53 PM David Rowley <dgrowleyml@gmail.com> wrote:

Arne sent me an off-list
message to say he's planning on working on the patch that uses the
existing field instead of the new one he originally added. Let's hold
off for that patch.

I wouldn't say, I explicitly stated that. But I ended up doing something, that resulted in the attached patch. :)

For my own sanity I greped one last time for the usage of indexlist.

Most of the (untouched) usages have comments that, they are only called for baserels/plain tables.

Namely all but the cluster of partitioned tables. I had to reread that section. There we are just traversing the tree and omitting partitioned tables.

There is now a test section in join.sql for partitioned tables, that tests very similar to the

baserel case. That's more thorough, than what I originally went for.

Further feedback would be appreciated!

Regards
Arne

Attachments:

v8_indexlist_contains_partitioned_indexes-0001-v8.patchtext/x-patch; name=v8_indexlist_contains_partitioned_indexes-0001-v8.patchDownload

From 290252bab5837c1a6f42bd53cf788c8696d5d0ec Mon Sep 17 00:00:00 2001
From: Arne Roland <arne.roland@index.de>
Date: Sat, 1 Oct 2022 18:14:34 +0200
Subject: [PATCH v8_indexlist_contains_partitioned_indexes] v8

---
 src/backend/optimizer/util/plancat.c         | 226 ++++++++++---------
 src/backend/storage/buffer/bufmgr.c          |   6 +-
 src/backend/utils/adt/selfuncs.c             |   4 +
 src/test/regress/expected/inherit.out        |   6 +
 src/test/regress/expected/join.out           | 124 +++++++++-
 src/test/regress/expected/partition_join.out |  30 +--
 src/test/regress/sql/inherit.sql             |   2 +
 src/test/regress/sql/join.sql                |  69 +++++-
 src/test/regress/sql/partition_join.sql      |   6 +-
 9 files changed, 348 insertions(+), 125 deletions(-)

diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 6d5718ee4c..7545ae862d 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -106,10 +106,12 @@ static void set_baserel_partition_constraint(Relation relation,
  * cases these are left as zeroes, but sometimes we need to compute attr
  * widths here, and we may as well cache the results for costsize.c.
  *
- * If inhparent is true, all we need to do is set up the attr arrays:
- * the RelOptInfo actually represents the appendrel formed by an inheritance
- * tree, and so the parent rel's physical size and index information isn't
- * important for it.
+ * If inhparent is true, we don't care about physical size, index am
+ * and estimates.
+ * We still need index uniqueness for join removal.
+ * When it comes to the path, the RelOptInfo actually represents
+ * the appendrel formed by an inheritance tree, and so the parent
+ * rel's physical size isn't important for it.
  */
 void
 get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
@@ -157,10 +159,12 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 
 	/*
 	 * Make list of indexes.  Ignore indexes on system catalogs if told to.
-	 * Don't bother with indexes for an inheritance parent, either.
+	 * Don't bother with indexes from traditional inheritance parents.
+	 * We care about partitioned indexes for join pruning,
+	 * even if we don't have any am for them.
 	 */
-	if (inhparent ||
-		(IgnoreSystemIndexes && IsSystemRelation(relation)))
+	if ((inhparent && relation->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+		|| (IgnoreSystemIndexes && IsSystemRelation(relation)))
 		hasindex = false;
 	else
 		hasindex = relation->rd_rel->relhasindex;
@@ -191,8 +195,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			IndexAmRoutine *amroutine;
 			IndexOptInfo *info;
 			int			ncolumns,
-						nkeycolumns;
-			int			i;
+						nkeycolumns,
+			            i;
 
 			/*
 			 * Extract info from the relation descriptor for the index.
@@ -213,16 +217,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				continue;
 			}
 
-			/*
-			 * Ignore partitioned indexes, since they are not usable for
-			 * queries.
-			 */
-			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
-			{
-				index_close(indexRelation, NoLock);
-				continue;
-			}
-
 			/*
 			 * If the index is valid, but cannot yet be used, ignore it; but
 			 * mark the plan we are generating as transient. See
@@ -267,108 +261,133 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 
 			info->relam = indexRelation->rd_rel->relam;
 
-			/* We copy just the fields we need, not all of rd_indam */
-			amroutine = indexRelation->rd_indam;
-			info->amcanorderbyop = amroutine->amcanorderbyop;
-			info->amoptionalkey = amroutine->amoptionalkey;
-			info->amsearcharray = amroutine->amsearcharray;
-			info->amsearchnulls = amroutine->amsearchnulls;
-			info->amcanparallel = amroutine->amcanparallel;
-			info->amhasgettuple = (amroutine->amgettuple != NULL);
-			info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
-				relation->rd_tableam->scan_bitmap_next_block != NULL;
-			info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
-								  amroutine->amrestrpos != NULL);
-			info->amcostestimate = amroutine->amcostestimate;
-			Assert(info->amcostestimate != NULL);
-
-			/* Fetch index opclass options */
-			info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
-
 			/*
-			 * Fetch the ordering information for the index, if any.
+			 * We don't have am for partitioned indexes, therefore we skip
+			 * gathering the info. This is ok, because we don't use
+			 * partitioned indexes in paths. We resolve inheritance before
+			 * generating paths.
 			 */
-			if (info->relam == BTREE_AM_OID)
+			if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
 			{
+				/* We copy just the fields we need, not all of rd_indam */
+				amroutine = indexRelation->rd_indam;
+				info->amcanorderbyop = amroutine->amcanorderbyop;
+				info->amoptionalkey = amroutine->amoptionalkey;
+				info->amsearcharray = amroutine->amsearcharray;
+				info->amsearchnulls = amroutine->amsearchnulls;
+				info->amcanparallel = amroutine->amcanparallel;
+				info->amhasgettuple = (amroutine->amgettuple != NULL);
+				info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
+					relation->rd_tableam->scan_bitmap_next_block != NULL;
+				info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
+					amroutine->amrestrpos != NULL);
+				info->amcostestimate = amroutine->amcostestimate;
+				Assert(info->amcostestimate != NULL);
+
+				/* Fetch index opclass options */
+				info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
+
 				/*
-				 * If it's a btree index, we can use its opfamily OIDs
-				 * directly as the sort ordering opfamily OIDs.
+				 * Fetch the ordering information for the index, if any.
 				 */
-				Assert(amroutine->amcanorder);
-
-				info->sortopfamily = info->opfamily;
-				info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
-				info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
-
-				for (i = 0; i < nkeycolumns; i++)
+				if (info->relam == BTREE_AM_OID)
 				{
-					int16		opt = indexRelation->rd_indoption[i];
+					/*
+					 * If it's a btree index, we can use its opfamily OIDs
+					 * directly as the sort ordering opfamily OIDs.
+					 */
+					Assert(amroutine->amcanorder);
 
-					info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
-					info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
-				}
-			}
-			else if (amroutine->amcanorder)
-			{
-				/*
-				 * Otherwise, identify the corresponding btree opfamilies by
-				 * trying to map this index's "<" operators into btree.  Since
-				 * "<" uniquely defines the behavior of a sort order, this is
-				 * a sufficient test.
-				 *
-				 * XXX This method is rather slow and also requires the
-				 * undesirable assumption that the other index AM numbers its
-				 * strategies the same as btree.  It'd be better to have a way
-				 * to explicitly declare the corresponding btree opfamily for
-				 * each opfamily of the other index type.  But given the lack
-				 * of current or foreseeable amcanorder index types, it's not
-				 * worth expending more effort on now.
-				 */
-				info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns);
-				info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
-				info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
+					info->sortopfamily = info->opfamily;
+					info->reverse_sort = (bool *)palloc(sizeof(bool) * nkeycolumns);
+					info->nulls_first = (bool *)palloc(sizeof(bool) * nkeycolumns);
 
-				for (i = 0; i < nkeycolumns; i++)
-				{
-					int16		opt = indexRelation->rd_indoption[i];
-					Oid			ltopr;
-					Oid			btopfamily;
-					Oid			btopcintype;
-					int16		btstrategy;
-
-					info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
-					info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
-
-					ltopr = get_opfamily_member(info->opfamily[i],
-												info->opcintype[i],
-												info->opcintype[i],
-												BTLessStrategyNumber);
-					if (OidIsValid(ltopr) &&
-						get_ordering_op_properties(ltopr,
-												   &btopfamily,
-												   &btopcintype,
-												   &btstrategy) &&
-						btopcintype == info->opcintype[i] &&
-						btstrategy == BTLessStrategyNumber)
+					for (i = 0; i < nkeycolumns; i++)
 					{
-						/* Successful mapping */
-						info->sortopfamily[i] = btopfamily;
+						int16		opt = indexRelation->rd_indoption[i];
+
+						info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
+						info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
 					}
-					else
+				}
+				else if (amroutine->amcanorder)
+				{
+					/*
+					 * Otherwise, identify the corresponding btree opfamilies by
+					 * trying to map this index's "<" operators into btree.  Since
+					 * "<" uniquely defines the behavior of a sort order, this is
+					 * a sufficient test.
+					 *
+					 * XXX This method is rather slow and also requires the
+					 * undesirable assumption that the other index AM numbers its
+					 * strategies the same as btree.  It'd be better to have a way
+					 * to explicitly declare the corresponding btree opfamily for
+					 * each opfamily of the other index type.  But given the lack
+					 * of current or foreseeable amcanorder index types, it's not
+					 * worth expending more effort on now.
+					 */
+					info->sortopfamily = (Oid *)palloc(sizeof(Oid) * nkeycolumns);
+					info->reverse_sort = (bool *)palloc(sizeof(bool) * nkeycolumns);
+					info->nulls_first = (bool *)palloc(sizeof(bool) * nkeycolumns);
+
+					for (i = 0; i < nkeycolumns; i++)
 					{
-						/* Fail ... quietly treat index as unordered */
-						info->sortopfamily = NULL;
-						info->reverse_sort = NULL;
-						info->nulls_first = NULL;
-						break;
+						int16		opt = indexRelation->rd_indoption[i];
+						Oid			ltopr;
+						Oid			btopfamily;
+						Oid			btopcintype;
+						int16		btstrategy;
+
+						info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
+						info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
+
+						ltopr = get_opfamily_member(info->opfamily[i],
+							info->opcintype[i],
+							info->opcintype[i],
+							BTLessStrategyNumber);
+						if (OidIsValid(ltopr) &&
+							get_ordering_op_properties(ltopr,
+								&btopfamily,
+								&btopcintype,
+								&btstrategy) &&
+							btopcintype == info->opcintype[i] &&
+							btstrategy == BTLessStrategyNumber)
+						{
+							/* Successful mapping */
+							info->sortopfamily[i] = btopfamily;
+						}
+						else
+						{
+							/* Fail ... quietly treat index as unordered */
+							info->sortopfamily = NULL;
+							info->reverse_sort = NULL;
+							info->nulls_first = NULL;
+							break;
+						}
 					}
 				}
+				else
+				{
+					info->sortopfamily = NULL;
+					info->reverse_sort = NULL;
+					info->nulls_first = NULL;
+				}
 			}
 			else
 			{
 				info->sortopfamily = NULL;
 				info->reverse_sort = NULL;
 				info->nulls_first = NULL;
+
+				info->amcanorderbyop = false;
+				info->amoptionalkey = false;
+				info->amsearcharray = false;
+				info->amsearchnulls = false;
+				info->amcanparallel = false;
+				info->amhasgettuple = false;
+				info->amhasgetbitmap = false;
+				info->amcanmarkpos = false;
+				info->amcostestimate = NULL;
 			}
 
 			/*
@@ -415,7 +434,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 					info->tuples = rel->tuples;
 			}
 
-			if (info->relam == BTREE_AM_OID)
+			if (info->relam == BTREE_AM_OID &&
+				indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
 			{
 				/* For btrees, get tree height while we have the index open */
 				info->tree_height = _bt_getrootheight(indexRelation);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 5b0e531f97..91df65414b 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -2971,9 +2971,11 @@ RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
 		return smgrnblocks(RelationGetSmgr(relation), forkNum);
 	}
 	else
-		Assert(false);
+	{
+		Assert(relation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX);
+	}
 
-	return 0;					/* keep compiler quiet */
+	return 0;
 }
 
 /*
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 1808388397..f1a97132bd 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -5995,6 +5995,10 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
 	rte = root->simple_rte_array[rel->relid];
 	Assert(rte->rtekind == RTE_RELATION);
 
+	/* ignore partitioned tables.  Any indexes here are not real indexes */
+	if (rte->relkind == RELKIND_PARTITIONED_TABLE)
+		return false;
+
 	/* Search through the indexes to see if any match our problem */
 	foreach(lc, rel->indexlist)
 	{
diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out
index 2d49e765de..c1619754a7 100644
--- a/src/test/regress/expected/inherit.out
+++ b/src/test/regress/expected/inherit.out
@@ -721,6 +721,12 @@ select * from d;
  32 | one | two | three
 (1 row)
 
+DROP TABLE a CASCADE;
+NOTICE:  drop cascades to 4 other objects
+DETAIL:  drop cascades to table b
+drop cascades to table c
+drop cascades to table d
+drop cascades to table z
 -- The above verified that we can change the type of a multiply-inherited
 -- column; but we should reject that if any definition was inherited from
 -- an unrelated parent.
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 2ed2e542a4..a5f1aa577b 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -4588,7 +4588,8 @@ select a.q2, b.q1
 reset enable_hashjoin;
 reset enable_nestloop;
 --
--- test join removal
+-- test join removal for plain tables
+-- we have almost the smare tests for partitioned tables below
 --
 begin;
 CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int);
@@ -4722,6 +4723,127 @@ select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
          Filter: (a.id = i)
 (4 rows)
 
+rollback;
+--
+-- test join removal for partitioned tables
+-- this is analog to the non partitioned case above
+--
+begin;
+CREATE TABLE a (id int PRIMARY KEY, b_id int) partition by range(id);
+CREATE TABLE a_m partition of a for values from (0) to (10) partition by range(id);
+CREATE TABLE a_c partition of a_m for values from (0) to (10);
+CREATE TABLE b (id int PRIMARY KEY, c_id int) partition by range(id);
+CREATE TABLE b_m partition of b for values from (0) to (10) partition by range(id);
+CREATE TABLE b_c partition of b_m for values from (0) to (10);
+CREATE TABLE c (id int PRIMARY KEY) partition by range(id);
+CREATE TABLE c_m partition of c for values from (0) to (10) partition by range(id);
+CREATE TABLE c_c partition of c_m for values from (0) to (10);
+CREATE TABLE d (a int, b int) partition by range(a);
+CREATE TABLE d_c partition of d for values from (0) to (10);
+INSERT INTO a VALUES (0, 0), (1, NULL);
+INSERT INTO b VALUES (0, 0), (1, NULL);
+INSERT INTO c VALUES (0), (1);
+INSERT INTO d VALUES (1,3), (2,2), (3,1);
+-- all three cases should be optimizable into a simple seqscan
+explain (costs off) SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id;
+    QUERY PLAN     
+-------------------
+ Seq Scan on a_c a
+(1 row)
+
+explain (costs off) SELECT b.* FROM b LEFT JOIN c ON b.c_id = c.id;
+    QUERY PLAN     
+-------------------
+ Seq Scan on b_c b
+(1 row)
+
+explain (costs off)
+  SELECT a.* FROM a LEFT JOIN (b left join c on b.c_id = c.id)
+  ON (a.b_id = b.id);
+    QUERY PLAN     
+-------------------
+ Seq Scan on a_c a
+(1 row)
+
+-- check optimization of outer join within another special join
+explain (costs off)
+select id from a where id in (
+	select b.id from b left join c on b.id = c.id
+);
+          QUERY PLAN           
+-------------------------------
+ Hash Join
+   Hash Cond: (a.id = b.id)
+   ->  Seq Scan on a_c a
+   ->  Hash
+         ->  Seq Scan on b_c b
+(5 rows)
+
+-- check that join removal works for a left join when joining a subquery
+-- that is guaranteed to be unique by its GROUP BY clause
+explain (costs off)
+select d.* from d left join (select * from b group by b.id, b.c_id) s
+  on d.a = s.id and d.b = s.c_id;
+    QUERY PLAN     
+-------------------
+ Seq Scan on d_c d
+(1 row)
+
+-- similarly, but keying off a DISTINCT clause
+explain (costs off)
+select d.* from d left join (select distinct * from b) s
+  on d.a = s.id and d.b = s.c_id;
+    QUERY PLAN     
+-------------------
+ Seq Scan on d_c d
+(1 row)
+
+-- join removal is not possible when the GROUP BY contains a column that is
+-- not in the join condition.  See above for further notes about this.
+explain (costs off)
+select d.* from d left join (select * from b group by b.id, b.c_id) s
+  on d.a = s.id;
+                   QUERY PLAN                   
+------------------------------------------------
+ Merge Right Join
+   Merge Cond: (b.id = d.a)
+   ->  Group
+         Group Key: b.id
+         ->  Index Scan using b_c_pkey on b_c b
+   ->  Sort
+         Sort Key: d.a
+         ->  Seq Scan on d_c d
+(8 rows)
+
+-- similarly, but keying off a DISTINCT clause
+explain (costs off)
+select d.* from d left join (select distinct * from b) s
+  on d.a = s.id;
+                 QUERY PLAN                  
+---------------------------------------------
+ Merge Left Join
+   Merge Cond: (d.a = s.id)
+   ->  Sort
+         Sort Key: d.a
+         ->  Seq Scan on d_c d
+   ->  Sort
+         Sort Key: s.id
+         ->  Subquery Scan on s
+               ->  HashAggregate
+                     Group Key: b.id, b.c_id
+                     ->  Seq Scan on b_c b
+(11 rows)
+
+-- check join removal works when uniqueness of the join condition is enforced
+-- by a UNION
+explain (costs off)
+select d.* from d left join (select id from a union select id from b) s
+  on d.a = s.id;
+    QUERY PLAN     
+-------------------
+ Seq Scan on d_c d
+(1 row)
+
 rollback;
 create temp table parent (k int primary key, pd int);
 create temp table child (k int unique, cd int);
diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out
index 03926a8413..c854228482 100644
--- a/src/test/regress/expected/partition_join.out
+++ b/src/test/regress/expected/partition_join.out
@@ -4852,46 +4852,46 @@ SELECT t1.*, t2.* FROM alpha t1 INNER JOIN beta t2 ON (t1.a = t2.a AND t1.b = t2
 (8 rows)
 
 -- partitionwise join with fractional paths
-CREATE TABLE fract_t (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
+CREATE TABLE fract_t (id BIGINT, c INT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
 CREATE TABLE fract_t0 PARTITION OF fract_t FOR VALUES FROM ('0') TO ('1000');
 CREATE TABLE fract_t1 PARTITION OF fract_t FOR VALUES FROM ('1000') TO ('2000');
 -- insert data
-INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
+INSERT INTO fract_t (id,c) SELECT i,i FROM generate_series(0, 1999) i;
 ANALYZE fract_t;
--- verify plan; nested index only scans
+-- verify plan; nested index scans
 SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 EXPLAIN (COSTS OFF)
 SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
-                              QUERY PLAN                               
------------------------------------------------------------------------
+                            QUERY PLAN                            
+------------------------------------------------------------------
  Limit
    ->  Merge Append
          Sort Key: x.id
          ->  Merge Left Join
                Merge Cond: (x_1.id = y_1.id)
-               ->  Index Only Scan using fract_t0_pkey on fract_t0 x_1
-               ->  Index Only Scan using fract_t0_pkey on fract_t0 y_1
+               ->  Index Scan using fract_t0_pkey on fract_t0 x_1
+               ->  Index Scan using fract_t0_pkey on fract_t0 y_1
          ->  Merge Left Join
                Merge Cond: (x_2.id = y_2.id)
-               ->  Index Only Scan using fract_t1_pkey on fract_t1 x_2
-               ->  Index Only Scan using fract_t1_pkey on fract_t1 y_2
+               ->  Index Scan using fract_t1_pkey on fract_t1 x_2
+               ->  Index Scan using fract_t1_pkey on fract_t1 y_2
 (11 rows)
 
 EXPLAIN (COSTS OFF)
 SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
-                                   QUERY PLAN                                   
---------------------------------------------------------------------------------
+                                QUERY PLAN                                 
+---------------------------------------------------------------------------
  Limit
    ->  Merge Append
          Sort Key: x.id DESC
          ->  Nested Loop Left Join
-               ->  Index Only Scan Backward using fract_t0_pkey on fract_t0 x_1
-               ->  Index Only Scan using fract_t0_pkey on fract_t0 y_1
+               ->  Index Scan Backward using fract_t0_pkey on fract_t0 x_1
+               ->  Index Scan using fract_t0_pkey on fract_t0 y_1
                      Index Cond: (id = x_1.id)
          ->  Nested Loop Left Join
-               ->  Index Only Scan Backward using fract_t1_pkey on fract_t1 x_2
-               ->  Index Only Scan using fract_t1_pkey on fract_t1 y_2
+               ->  Index Scan Backward using fract_t1_pkey on fract_t1 x_2
+               ->  Index Scan using fract_t1_pkey on fract_t1 y_2
                      Index Cond: (id = x_2.id)
 (11 rows)
 
diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql
index 195aedb5ff..ee55fc1ec8 100644
--- a/src/test/regress/sql/inherit.sql
+++ b/src/test/regress/sql/inherit.sql
@@ -208,6 +208,8 @@ insert into d values('test','one','two','three');
 alter table a alter column aa type integer using bit_length(aa);
 select * from d;
 
+DROP TABLE a CASCADE;
+
 -- The above verified that we can change the type of a multiply-inherited
 -- column; but we should reject that if any definition was inherited from
 -- an unrelated parent.
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index 27e7e741a1..df7fad22bc 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -1579,7 +1579,8 @@ reset enable_hashjoin;
 reset enable_nestloop;
 
 --
--- test join removal
+-- test join removal for plain tables
+-- we have almost the smare tests for partitioned tables below
 --
 
 begin;
@@ -1648,6 +1649,72 @@ select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
 
 rollback;
 
+--
+-- test join removal for partitioned tables
+-- this is analog to the non partitioned case above
+--
+
+begin;
+
+CREATE TABLE a (id int PRIMARY KEY, b_id int) partition by range(id);
+CREATE TABLE a_m partition of a for values from (0) to (10) partition by range(id);
+CREATE TABLE a_c partition of a_m for values from (0) to (10);
+CREATE TABLE b (id int PRIMARY KEY, c_id int) partition by range(id);
+CREATE TABLE b_m partition of b for values from (0) to (10) partition by range(id);
+CREATE TABLE b_c partition of b_m for values from (0) to (10);
+CREATE TABLE c (id int PRIMARY KEY) partition by range(id);
+CREATE TABLE c_m partition of c for values from (0) to (10) partition by range(id);
+CREATE TABLE c_c partition of c_m for values from (0) to (10);
+CREATE TABLE d (a int, b int) partition by range(a);
+CREATE TABLE d_c partition of d for values from (0) to (10);
+INSERT INTO a VALUES (0, 0), (1, NULL);
+INSERT INTO b VALUES (0, 0), (1, NULL);
+INSERT INTO c VALUES (0), (1);
+INSERT INTO d VALUES (1,3), (2,2), (3,1);
+
+-- all three cases should be optimizable into a simple seqscan
+explain (costs off) SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id;
+explain (costs off) SELECT b.* FROM b LEFT JOIN c ON b.c_id = c.id;
+explain (costs off)
+  SELECT a.* FROM a LEFT JOIN (b left join c on b.c_id = c.id)
+  ON (a.b_id = b.id);
+
+-- check optimization of outer join within another special join
+explain (costs off)
+select id from a where id in (
+	select b.id from b left join c on b.id = c.id
+);
+
+-- check that join removal works for a left join when joining a subquery
+-- that is guaranteed to be unique by its GROUP BY clause
+explain (costs off)
+select d.* from d left join (select * from b group by b.id, b.c_id) s
+  on d.a = s.id and d.b = s.c_id;
+
+-- similarly, but keying off a DISTINCT clause
+explain (costs off)
+select d.* from d left join (select distinct * from b) s
+  on d.a = s.id and d.b = s.c_id;
+
+-- join removal is not possible when the GROUP BY contains a column that is
+-- not in the join condition.  See above for further notes about this.
+explain (costs off)
+select d.* from d left join (select * from b group by b.id, b.c_id) s
+  on d.a = s.id;
+
+-- similarly, but keying off a DISTINCT clause
+explain (costs off)
+select d.* from d left join (select distinct * from b) s
+  on d.a = s.id;
+
+-- check join removal works when uniqueness of the join condition is enforced
+-- by a UNION
+explain (costs off)
+select d.* from d left join (select id from a union select id from b) s
+  on d.a = s.id;
+
+rollback;
+
 create temp table parent (k int primary key, pd int);
 create temp table child (k int unique, cd int);
 insert into parent values (1, 10), (2, 20), (3, 30);
diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql
index 67f506361f..009184d348 100644
--- a/src/test/regress/sql/partition_join.sql
+++ b/src/test/regress/sql/partition_join.sql
@@ -1144,15 +1144,15 @@ SELECT t1.*, t2.* FROM alpha t1 INNER JOIN beta t2 ON (t1.a = t2.a AND t1.b = t2
 SELECT t1.*, t2.* FROM alpha t1 INNER JOIN beta t2 ON (t1.a = t2.a AND t1.b = t2.b AND t1.c = t2.c) WHERE ((t1.b >= 100 AND t1.b < 110) OR (t1.b >= 200 AND t1.b < 210)) AND ((t2.b >= 100 AND t2.b < 110) OR (t2.b >= 200 AND t2.b < 210)) AND t1.c IN ('0004', '0009') ORDER BY t1.a, t1.b;
 
 -- partitionwise join with fractional paths
-CREATE TABLE fract_t (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
+CREATE TABLE fract_t (id BIGINT, c INT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
 CREATE TABLE fract_t0 PARTITION OF fract_t FOR VALUES FROM ('0') TO ('1000');
 CREATE TABLE fract_t1 PARTITION OF fract_t FOR VALUES FROM ('1000') TO ('2000');
 
 -- insert data
-INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
+INSERT INTO fract_t (id,c) SELECT i,i FROM generate_series(0, 1999) i;
 ANALYZE fract_t;
 
--- verify plan; nested index only scans
+-- verify plan; nested index scans
 SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 
-- 
2.35.3

#26

dgrowleyml@gmail.com

over 3 years ago

In reply to: Arne Roland (#25)

Re: missing indexes in indexlist with partitioned tables

On Sun, 2 Oct 2022 at 05:34, Arne Roland <A.Roland@index.de> wrote:

On Tue, Sep 20, 2022 at 4:53 PM David Rowley <dgrowleyml@gmail.com> wrote:

Arne sent me an off-list
message to say he's planning on working on the patch that uses the
existing field instead of the new one he originally added. Let's hold
off for that patch.

I wouldn't say, I explicitly stated that. But I ended up doing something, that resulted in the attached patch. :)

I stand corrected. You said you'd think about it, not do it. Anyway,
thanks for doing it :)

Further feedback would be appreciated!

I had a quick look through the patch. Here are a few things that would
be good to adjust. I understand that some of these things were how I
left them in the patch I sent. In my defence, I mostly did that very
quickly just so I could see if there was some issue with having the
partitioned indexes in indexlist. I didn't actually put much effort
into addressing the fine details of how that should be done.

* In the header comment in get_relation_info(), I don't think we need
to mention join removals explicitly. At a stretch, maybe mentioning
"unique proofs" might be ok, but "various optimizations" might be
better. If you mention "join removal", I fear that will just become
outdated too quickly as further optimisations are added. Likewise for
the comment about "join pruning" you've added in the function body.
FWIW, we call these "join removals" anyway.

* I think we should put RelationGetNumberOfBlocks() back to what it
was and just ensure we don't call that for partitioned indexes in
get_relation_info(). (Yes, I know that was my change)

* I can't quite figure out why you're doing "DROP TABLE a CASCADE;" in
inherits.sql. You've not changed anything else in that file. Did you
mean to do this in join.sql?

* The new test in join.sql. I understand that you've mostly copied the
test from another place in the file and adjusted it to use a
partitioned table. However, I don't really think you need to INSERT
any data into those tables. I also think that using the table name of
"a" is dangerous as it could conflict with another table by the same
name in a parallel run of the tests. The non-partitioned version is a
TEMP table. Also, it's slightly painful to look at the inconsistent
capitalisation of SQL keywords in those queries you've added, again, I
understand those are copied from above, but I see no need to duplicate
the inconsistencies. Perhaps it's fine to copy the upper case
keywords in the DDL and keep all lowercase in the queries. Also, I
think you probably should just add a single simple join removal test
for partitioned tables. You're not exercising any code that the
non-partitioned case isn't by adding any additional tests. All that I
think is worth testing here is ensuring nobody thinks that partitioned
tables can get away with an empty indexlist again.

* I had a bit of a 2nd thought on the test change in
partition_join.sql. I know I added the "c" column so that join
removals didn't apply. I'm now thinking it's probably better to just
change the queries to use JOIN ON rather than JOIN USING. Also, apply
the correct alias to the ORDER BY. This method should save from
slowing the test down due to the additional column. We have some
pretty slow buildfarm machines that this might actually make a
meaningful difference to.

Thanks again for working on this.

David

#27

Tom Lane

tgl@sss.pgh.pa.us

over 3 years ago

In reply to: David Rowley (#26)

Re: missing indexes in indexlist with partitioned tables

David Rowley <dgrowleyml@gmail.com> writes:

* I can't quite figure out why you're doing "DROP TABLE a CASCADE;" in
inherits.sql. You've not changed anything else in that file. Did you
mean to do this in join.sql?

Doing that would be a bad idea no matter where it's done. IIRC,
those tables are intentionally set up to stress later dump/restore
tests (with issues like inheritance children having column order
different from their parents).

regards, tom lane

#28

https://api.cirrus-ci.com/v1/task/6190372803051520/logs/cores.log

A.Roland@index.de

about 3 years ago

In reply to: David Rowley (#26)

1 attachment(s)

Re: missing indexes in indexlist with partitioned tables

Hi!

I mainly changed the comments, the Assert and some casing.

From: David Rowley <dgrowleyml@gmail.com>
Sent: Monday, October 3, 2022 00:51

* In the header comment in get_relation_info(), I don't think we need
to mention join removals explicitly. At a stretch, maybe mentioning
"unique proofs" might be ok, but "various optimizations" might be
better. If you mention "join removal", I fear that will just become
outdated too quickly as further optimisations are added. Likewise for
the comment about "join pruning" you've added in the function body.
FWIW, we call these "join removals" anyway.

I made them a little bit more vague. I also updated the description of indexlist in general.

* I think we should put RelationGetNumberOfBlocks() back to what it
was and just ensure we don't call that for partitioned indexes in
get_relation_info(). (Yes, I know that was my change)

I don't think it's relevant who did it. I don't see much importance either way. I reverted it to the old state.

* I can't quite figure out why you're doing "DROP TABLE a CASCADE;" in
inherits.sql. You've not changed anything else in that file. Did you
mean to do this in join.sql?

The problem I encountered, was that simple copy of the test wasn't possible, because the tables were named the same way. It seemed intuitive to me to make the tests , such that there are no side-effects. I added a comment to the creation of those tables to make clear, that there are intended side effects by not dropping those tables.

* The new test in join.sql. I understand that you've mostly copied the
test from another place in the file and adjusted it to use a
partitioned table. However, I don't really think you need to INSERT
any data into those tables. I also think that using the table name of
"a" is dangerous as it could conflict with another table by the same
name in a parallel run of the tests. The non-partitioned version is a
TEMP table. Also, it's slightly painful to look at the inconsistent
capitalisation of SQL keywords in those queries you've added, again, I
understand those are copied from above, but I see no need to duplicate
the inconsistencies. Perhaps it's fine to copy the upper case
keywords in the DDL and keep all lowercase in the queries. Also, I
think you probably should just add a single simple join removal test
for partitioned tables. You're not exercising any code that the
non-partitioned case isn't by adding any additional tests. All that I
think is worth testing here is ensuring nobody thinks that partitioned
tables can get away with an empty indexlist again.

I am not sure, how thorough the tests on partitioned tables need to be. I guess, I will turn up more issues in production, than any test will be able to cover.

As a general sentiment, I wouldn't agree. The empty indexlist isn't the only interesting thing to test. The more we add optimizations, the more non trivial intersections of those start to break things again, we have fixed. A notable part of the complexity of the optimizer stems from the fact, that we apply most transformations in a fixed order. We obviously have to do that for performance reasons. But as long as we have that, we are prone to have cases where the ordering breaks part. Partitioned tables are a prominent cases here, because we always have the appendrel.

I removed some test cases here to half the amount of partitioned tables needed here. I don't see the value in having one simple explain less. But I do not have strong feelings about this. Are there any further opinions?

* I had a bit of a 2nd thought on the test change in
partition_join.sql. I know I added the "c" column so that join
removals didn't apply. I'm now thinking it's probably better to just
change the queries to use JOIN ON rather than JOIN USING. Also, apply
the correct alias to the ORDER BY. This method should save from
slowing the test down due to the additional column. We have some
pretty slow buildfarm machines that this might actually make a
meaningful difference to.

There is no real point in changing this, because we can just access the column that is hidden anyways to make the join removal impossible.
That is sort of the tick my version v6 was going for. Tbh I don't care much either way as long as the test still tests for the fractional merge append. I just switched back.

Afaiac creating and dropping a table is sort of the worst thing we can do, when thinking about tests. There is just so much work involved there. If I am concerned about test runtimes, I'd try to minimize that. This is even more true regarding tests for partitioned tables. To get a parted table with two partitions, we always have to create three tables. I do think there is a lot of potential to speed up the test times with that, but I'd suggest to handle that in a different thread.

Thanks again for working on this.

Thank you for your input!

Arne

Attachments:

v9_indexlist_contains_partitioned_indexes-0001-indexlist.patchtext/x-patch; name=v9_indexlist_contains_partitioned_indexes-0001-indexlist.patchDownload

From 0f8f13e1fcc3c6c8f0cb812801d21547666a2229 Mon Sep 17 00:00:00 2001
From: Arne <aro@index.de>
Date: Mon, 31 Oct 2022 20:24:00 +0100
Subject: [PATCH v9_indexlist_contains_partitioned_indexes] indexlist

---
 src/backend/optimizer/util/plancat.c         | 226 ++++++++++---------
 src/backend/utils/adt/selfuncs.c             |   4 +
 src/include/nodes/pathnodes.h                |  11 +-
 src/test/regress/expected/inherit.out        |   2 +
 src/test/regress/expected/join.out           |  52 ++++-
 src/test/regress/expected/partition_join.out |   6 +-
 src/test/regress/sql/inherit.sql             |   4 +
 src/test/regress/sql/join.sql                |  34 ++-
 src/test/regress/sql/partition_join.sql      |   6 +-
 9 files changed, 231 insertions(+), 114 deletions(-)

diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 9defe37836..f231c5abbb 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -106,10 +106,12 @@ static void set_baserel_partition_constraint(Relation relation,
  * cases these are left as zeroes, but sometimes we need to compute attr
  * widths here, and we may as well cache the results for costsize.c.
  *
- * If inhparent is true, all we need to do is set up the attr arrays:
- * the RelOptInfo actually represents the appendrel formed by an inheritance
- * tree, and so the parent rel's physical size and index information isn't
- * important for it.
+ * If inhparent is true, we don't care about physical size, index am
+ * and estimates.
+ * We still need index uniqueness for join removal.
+ * When it comes to the path, the RelOptInfo actually represents
+ * the appendrel formed by an inheritance tree, and so the parent
+ * rel's physical size isn't important for it.
  */
 void
 get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
@@ -175,10 +177,12 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 
 	/*
 	 * Make list of indexes.  Ignore indexes on system catalogs if told to.
-	 * Don't bother with indexes for an inheritance parent, either.
+	 * Don't bother with indexes from traditional inheritance parents.
+	 * We care about partitioned indexes for further optimisations, like join
+	 * removal, even if we don't have any am for them.
 	 */
-	if (inhparent ||
-		(IgnoreSystemIndexes && IsSystemRelation(relation)))
+	if ((inhparent && relation->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+		|| (IgnoreSystemIndexes && IsSystemRelation(relation)))
 		hasindex = false;
 	else
 		hasindex = relation->rd_rel->relhasindex;
@@ -209,8 +213,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			IndexAmRoutine *amroutine;
 			IndexOptInfo *info;
 			int			ncolumns,
-						nkeycolumns;
-			int			i;
+						nkeycolumns,
+						i;
 
 			/*
 			 * Extract info from the relation descriptor for the index.
@@ -231,16 +235,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				continue;
 			}
 
-			/*
-			 * Ignore partitioned indexes, since they are not usable for
-			 * queries.
-			 */
-			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
-			{
-				index_close(indexRelation, NoLock);
-				continue;
-			}
-
 			/*
 			 * If the index is valid, but cannot yet be used, ignore it; but
 			 * mark the plan we are generating as transient. See
@@ -285,108 +279,133 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 
 			info->relam = indexRelation->rd_rel->relam;
 
-			/* We copy just the fields we need, not all of rd_indam */
-			amroutine = indexRelation->rd_indam;
-			info->amcanorderbyop = amroutine->amcanorderbyop;
-			info->amoptionalkey = amroutine->amoptionalkey;
-			info->amsearcharray = amroutine->amsearcharray;
-			info->amsearchnulls = amroutine->amsearchnulls;
-			info->amcanparallel = amroutine->amcanparallel;
-			info->amhasgettuple = (amroutine->amgettuple != NULL);
-			info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
-				relation->rd_tableam->scan_bitmap_next_block != NULL;
-			info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
-								  amroutine->amrestrpos != NULL);
-			info->amcostestimate = amroutine->amcostestimate;
-			Assert(info->amcostestimate != NULL);
-
-			/* Fetch index opclass options */
-			info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
-
 			/*
-			 * Fetch the ordering information for the index, if any.
+			 * We don't have am for partitioned indexes, therefore we skip
+			 * gathering the info. This is ok, because we don't use
+			 * partitioned indexes in paths. We resolve inheritance before
+			 * generating paths.
 			 */
-			if (info->relam == BTREE_AM_OID)
+			if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
 			{
+				/* We copy just the fields we need, not all of rd_indam */
+				amroutine = indexRelation->rd_indam;
+				info->amcanorderbyop = amroutine->amcanorderbyop;
+				info->amoptionalkey = amroutine->amoptionalkey;
+				info->amsearcharray = amroutine->amsearcharray;
+				info->amsearchnulls = amroutine->amsearchnulls;
+				info->amcanparallel = amroutine->amcanparallel;
+				info->amhasgettuple = (amroutine->amgettuple != NULL);
+				info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
+					relation->rd_tableam->scan_bitmap_next_block != NULL;
+				info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
+					amroutine->amrestrpos != NULL);
+				info->amcostestimate = amroutine->amcostestimate;
+				Assert(info->amcostestimate != NULL);
+
+				/* Fetch index opclass options */
+				info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
+
 				/*
-				 * If it's a btree index, we can use its opfamily OIDs
-				 * directly as the sort ordering opfamily OIDs.
+				 * Fetch the ordering information for the index, if any.
 				 */
-				Assert(amroutine->amcanorder);
-
-				info->sortopfamily = info->opfamily;
-				info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
-				info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
-
-				for (i = 0; i < nkeycolumns; i++)
+				if (info->relam == BTREE_AM_OID)
 				{
-					int16		opt = indexRelation->rd_indoption[i];
+					/*
+					 * If it's a btree index, we can use its opfamily OIDs
+					 * directly as the sort ordering opfamily OIDs.
+					 */
+					Assert(amroutine->amcanorder);
 
-					info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
-					info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
-				}
-			}
-			else if (amroutine->amcanorder)
-			{
-				/*
-				 * Otherwise, identify the corresponding btree opfamilies by
-				 * trying to map this index's "<" operators into btree.  Since
-				 * "<" uniquely defines the behavior of a sort order, this is
-				 * a sufficient test.
-				 *
-				 * XXX This method is rather slow and also requires the
-				 * undesirable assumption that the other index AM numbers its
-				 * strategies the same as btree.  It'd be better to have a way
-				 * to explicitly declare the corresponding btree opfamily for
-				 * each opfamily of the other index type.  But given the lack
-				 * of current or foreseeable amcanorder index types, it's not
-				 * worth expending more effort on now.
-				 */
-				info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns);
-				info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
-				info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
+					info->sortopfamily = info->opfamily;
+					info->reverse_sort = (bool *)palloc(sizeof(bool) * nkeycolumns);
+					info->nulls_first = (bool *)palloc(sizeof(bool) * nkeycolumns);
 
-				for (i = 0; i < nkeycolumns; i++)
-				{
-					int16		opt = indexRelation->rd_indoption[i];
-					Oid			ltopr;
-					Oid			btopfamily;
-					Oid			btopcintype;
-					int16		btstrategy;
-
-					info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
-					info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
-
-					ltopr = get_opfamily_member(info->opfamily[i],
-												info->opcintype[i],
-												info->opcintype[i],
-												BTLessStrategyNumber);
-					if (OidIsValid(ltopr) &&
-						get_ordering_op_properties(ltopr,
-												   &btopfamily,
-												   &btopcintype,
-												   &btstrategy) &&
-						btopcintype == info->opcintype[i] &&
-						btstrategy == BTLessStrategyNumber)
+					for (i = 0; i < nkeycolumns; i++)
 					{
-						/* Successful mapping */
-						info->sortopfamily[i] = btopfamily;
+						int16		opt = indexRelation->rd_indoption[i];
+
+						info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
+						info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
 					}
-					else
+				}
+				else if (amroutine->amcanorder)
+				{
+					/*
+					 * Otherwise, identify the corresponding btree opfamilies by
+					 * trying to map this index's "<" operators into btree.  Since
+					 * "<" uniquely defines the behavior of a sort order, this is
+					 * a sufficient test.
+					 *
+					 * XXX This method is rather slow and also requires the
+					 * undesirable assumption that the other index AM numbers its
+					 * strategies the same as btree.  It'd be better to have a way
+					 * to explicitly declare the corresponding btree opfamily for
+					 * each opfamily of the other index type.  But given the lack
+					 * of current or foreseeable amcanorder index types, it's not
+					 * worth expending more effort on now.
+					 */
+					info->sortopfamily = (Oid *)palloc(sizeof(Oid) * nkeycolumns);
+					info->reverse_sort = (bool *)palloc(sizeof(bool) * nkeycolumns);
+					info->nulls_first = (bool *)palloc(sizeof(bool) * nkeycolumns);
+
+					for (i = 0; i < nkeycolumns; i++)
 					{
-						/* Fail ... quietly treat index as unordered */
-						info->sortopfamily = NULL;
-						info->reverse_sort = NULL;
-						info->nulls_first = NULL;
-						break;
+						int16		opt = indexRelation->rd_indoption[i];
+						Oid			ltopr;
+						Oid			btopfamily;
+						Oid			btopcintype;
+						int16		btstrategy;
+
+						info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
+						info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
+
+						ltopr = get_opfamily_member(info->opfamily[i],
+							info->opcintype[i],
+							info->opcintype[i],
+							BTLessStrategyNumber);
+						if (OidIsValid(ltopr) &&
+							get_ordering_op_properties(ltopr,
+								&btopfamily,
+								&btopcintype,
+								&btstrategy) &&
+							btopcintype == info->opcintype[i] &&
+							btstrategy == BTLessStrategyNumber)
+						{
+							/* Successful mapping */
+							info->sortopfamily[i] = btopfamily;
+						}
+						else
+						{
+							/* Fail ... quietly treat index as unordered */
+							info->sortopfamily = NULL;
+							info->reverse_sort = NULL;
+							info->nulls_first = NULL;
+							break;
+						}
 					}
 				}
+				else
+				{
+					info->sortopfamily = NULL;
+					info->reverse_sort = NULL;
+					info->nulls_first = NULL;
+				}
 			}
 			else
 			{
 				info->sortopfamily = NULL;
 				info->reverse_sort = NULL;
 				info->nulls_first = NULL;
+
+				info->amcanorderbyop = false;
+				info->amoptionalkey = false;
+				info->amsearcharray = false;
+				info->amsearchnulls = false;
+				info->amcanparallel = false;
+				info->amhasgettuple = false;
+				info->amhasgetbitmap = false;
+				info->amcanmarkpos = false;
+				info->amcostestimate = NULL;
 			}
 
 			/*
@@ -433,7 +452,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 					info->tuples = rel->tuples;
 			}
 
-			if (info->relam == BTREE_AM_OID)
+			if (info->relam == BTREE_AM_OID &&
+				indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
 			{
 				/* For btrees, get tree height while we have the index open */
 				info->tree_height = _bt_getrootheight(indexRelation);
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 69e0fb98f5..2b543b7199 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -5963,6 +5963,10 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
 	rte = root->simple_rte_array[rel->relid];
 	Assert(rte->rtekind == RTE_RELATION);
 
+	/* ignore partitioned tables.  Any indexes here are not real indexes */
+	if (rte->relkind == RELKIND_PARTITIONED_TABLE)
+		return false;
+
 	/* Search through the indexes to see if any match our problem */
 	foreach(lc, rel->indexlist)
 	{
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 09342d128d..a6ec525da7 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -628,7 +628,7 @@ typedef struct PartitionSchemeData *PartitionScheme;
  *		lateral_relids - required outer rels for LATERAL, as a Relids set
  *			(includes both direct and indirect lateral references)
  *
- * If the relation is a base relation it will have these fields set:
+ * If the relation is a base relation, it will have these fields set:
  *
  *		relid - RTE index (this is redundant with the relids field, but
  *				is provided for convenience of access)
@@ -643,8 +643,6 @@ typedef struct PartitionSchemeData *PartitionScheme;
  *					   Vars and PlaceHolderVars)
  *		lateral_referencers - relids of rels that reference this one laterally
  *				(includes both direct and indirect lateral references)
- *		indexlist - list of IndexOptInfo nodes for relation's indexes
- *					(always NIL if it's not a table)
  *		pages - number of disk pages in relation (zero if not a table)
  *		tuples - number of tuples in relation (not considering restrictions)
  *		allvisfrac - fraction of disk pages that are marked all-visible
@@ -660,6 +658,13 @@ typedef struct PartitionSchemeData *PartitionScheme;
  *		For otherrels that are appendrel members, these fields are filled
  *		in just as for a baserel, except we don't bother with lateral_vars.
  *
+ * If the relation is either a base relation or a partitioned table, it will have a set:
+ *		indexlist - list of IndexOptInfo nodes for relation's indexes
+ *		                        (for patitioned indexes not all of the
+ *		                            IndexOptInfo fields are set, since they
+ *		                            can't be accessed)
+ *					(always NIL if it's not a table)
+ *
  * If the relation is either a foreign table or a join of foreign tables that
  * all belong to the same foreign server and are assigned to the same user to
  * check access permissions as (cf checkAsUser), these fields will be set:
diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out
index 2d49e765de..80fc76ed32 100644
--- a/src/test/regress/expected/inherit.out
+++ b/src/test/regress/expected/inherit.out
@@ -1,6 +1,8 @@
 --
 -- Test inheritance features
 --
+-- XXX These tables are not dropped. Why?
+-- XXX Are they needed for later tests? For dump-restore? Dropping them doesn't fail any test from check-world.
 CREATE TABLE a (aa TEXT);
 CREATE TABLE b (bb TEXT) INHERITS (a);
 CREATE TABLE c (cc TEXT) INHERITS (a);
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index 9b69a8c122..ff23808013 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -4588,7 +4588,8 @@ select a.q2, b.q1
 reset enable_hashjoin;
 reset enable_nestloop;
 --
--- test join removal
+-- test join removal for plain tables
+-- we have almost the smare tests for partitioned tables below
 --
 begin;
 CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int);
@@ -4720,6 +4721,55 @@ select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
          Filter: (a.id = i)
 (4 rows)
 
+rollback;
+--
+-- test join removal for partitioned tables
+-- this is analog to the non partitioned case above
+--
+begin;
+CREATE TABLE b_p (id int PRIMARY KEY, c_id int) partition by range(id);
+CREATE TABLE b_m partition of b_p for values from (0) to (10) partition by range(id);
+CREATE TABLE b_c partition of b_m for values from (0) to (10);
+CREATE TABLE c_p (id int PRIMARY KEY) partition by range(id);
+CREATE TABLE c_m partition of c_p for values from (0) to (10) partition by range(id);
+CREATE TABLE c_c partition of c_m for values from (0) to (10);
+INSERT INTO b_p VALUES (0, 0), (1, NULL);
+INSERT INTO c_p VALUES (0), (1);
+-- all three cases should be optimizable into a simple seqscan
+EXPLAIN (COSTS OFF) SELECT b0.* FROM b_p b0 LEFT JOIN b_p b ON b0.c_id = b.id;
+     QUERY PLAN     
+--------------------
+ Seq Scan on b_c b0
+(1 row)
+
+EXPLAIN (COSTS OFF) SELECT b.* FROM b_p b LEFT JOIN c_p c ON b.c_id = c.id;
+    QUERY PLAN     
+-------------------
+ Seq Scan on b_c b
+(1 row)
+
+EXPLAIN (COSTS OFF)
+  SELECT b0.* FROM b_p b0 LEFT JOIN (b_p b LEFT JOIN c_p c on b.c_id = c.id)
+  ON (b0.c_id = b.id);
+     QUERY PLAN     
+--------------------
+ Seq Scan on b_c b0
+(1 row)
+
+-- check optimization of outer join within another special join
+EXPLAIN (COSTS OFF)
+select id from b_p b0 WHERE id in (
+	SELECT b.id FROM b_p b LEFT JOIN c_p c on b.id = c.id
+);
+          QUERY PLAN           
+-------------------------------
+ Hash Join
+   Hash Cond: (b0.id = b.id)
+   ->  Seq Scan on b_c b0
+   ->  Hash
+         ->  Seq Scan on b_c b
+(5 rows)
+
 rollback;
 create temp table parent (k int primary key, pd int);
 create temp table child (k int unique, cd int);
diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out
index b20facc19f..dd8ad1c71a 100644
--- a/src/test/regress/expected/partition_join.out
+++ b/src/test/regress/expected/partition_join.out
@@ -4867,13 +4867,13 @@ CREATE TABLE fract_t (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
 CREATE TABLE fract_t0 PARTITION OF fract_t FOR VALUES FROM ('0') TO ('1000');
 CREATE TABLE fract_t1 PARTITION OF fract_t FOR VALUES FROM ('1000') TO ('2000');
 -- insert data
-INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
+INSERT INTO fract_t (id) SELECT i FROM generate_series(0, 1999) i;
 ANALYZE fract_t;
 -- verify plan; nested index only scans
 SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
                               QUERY PLAN                               
 -----------------------------------------------------------------------
  Limit
@@ -4890,7 +4890,7 @@ SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
 (11 rows)
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
                                    QUERY PLAN                                   
 --------------------------------------------------------------------------------
  Limit
diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql
index 195aedb5ff..69600bb068 100644
--- a/src/test/regress/sql/inherit.sql
+++ b/src/test/regress/sql/inherit.sql
@@ -1,6 +1,10 @@
 --
 -- Test inheritance features
 --
+
+-- XXX These tables are not dropped. Why?
+-- XXX Are they needed for later tests? For dump-restore? Dropping them doesn't fail any test from check-world.
+
 CREATE TABLE a (aa TEXT);
 CREATE TABLE b (bb TEXT) INHERITS (a);
 CREATE TABLE c (cc TEXT) INHERITS (a);
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index 27e7e741a1..24a18d83b8 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -1579,7 +1579,8 @@ reset enable_hashjoin;
 reset enable_nestloop;
 
 --
--- test join removal
+-- test join removal for plain tables
+-- we have almost the smare tests for partitioned tables below
 --
 
 begin;
@@ -1648,6 +1649,37 @@ select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
 
 rollback;
 
+--
+-- test join removal for partitioned tables
+-- this is analog to the non partitioned case above
+--
+
+begin;
+
+CREATE TABLE b_p (id int PRIMARY KEY, c_id int) partition by range(id);
+CREATE TABLE b_m partition of b_p for values from (0) to (10) partition by range(id);
+CREATE TABLE b_c partition of b_m for values from (0) to (10);
+CREATE TABLE c_p (id int PRIMARY KEY) partition by range(id);
+CREATE TABLE c_m partition of c_p for values from (0) to (10) partition by range(id);
+CREATE TABLE c_c partition of c_m for values from (0) to (10);
+INSERT INTO b_p VALUES (0, 0), (1, NULL);
+INSERT INTO c_p VALUES (0), (1);
+
+-- all three cases should be optimizable into a simple seqscan
+EXPLAIN (COSTS OFF) SELECT b0.* FROM b_p b0 LEFT JOIN b_p b ON b0.c_id = b.id;
+EXPLAIN (COSTS OFF) SELECT b.* FROM b_p b LEFT JOIN c_p c ON b.c_id = c.id;
+EXPLAIN (COSTS OFF)
+  SELECT b0.* FROM b_p b0 LEFT JOIN (b_p b LEFT JOIN c_p c on b.c_id = c.id)
+  ON (b0.c_id = b.id);
+
+-- check optimization of outer join within another special join
+EXPLAIN (COSTS OFF)
+select id from b_p b0 WHERE id in (
+	SELECT b.id FROM b_p b LEFT JOIN c_p c on b.id = c.id
+);
+
+rollback;
+
 create temp table parent (k int primary key, pd int);
 create temp table child (k int unique, cd int);
 insert into parent values (1, 10), (2, 20), (3, 30);
diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql
index 67f506361f..94f48a7c51 100644
--- a/src/test/regress/sql/partition_join.sql
+++ b/src/test/regress/sql/partition_join.sql
@@ -1149,7 +1149,7 @@ CREATE TABLE fract_t0 PARTITION OF fract_t FOR VALUES FROM ('0') TO ('1000');
 CREATE TABLE fract_t1 PARTITION OF fract_t FOR VALUES FROM ('1000') TO ('2000');
 
 -- insert data
-INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
+INSERT INTO fract_t (id) SELECT i FROM generate_series(0, 1999) i;
 ANALYZE fract_t;
 
 -- verify plan; nested index only scans
@@ -1157,10 +1157,10 @@ SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
 
 -- cleanup
 DROP TABLE fract_t;
-- 
2.35.3

#29

Andres Freund

andres@anarazel.de

about 3 years ago

In reply to: Arne Roland (#28)

Re: missing indexes in indexlist with partitioned tables

Hi,

On 2022-11-02 01:50:38 +0000, Arne Roland wrote:

I mainly changed the comments, the Assert and some casing.

The tests have been failing for a while
https://cirrus-ci.com/github/postgresql-cfbot/postgresql/commitfest/40/3452

#2 0x00005645dff192f6 in ExceptionalCondition (conditionName=conditionName@entry=0x5645e014b167 "false", fileName=fileName@entry=0x5645e0196b08 "../src/backend/storage/buffer/bufmgr.c", lineNumber=lineNumber@entry=2971) at ../src/backend/utils/error/assert.c:66
No locals.
#3 0x00005645dfc13823 in RelationGetNumberOfBlocksInFork (relation=relation@entry=0x7fb54d54e470, forkNum=forkNum@entry=MAIN_FORKNUM) at ../src/backend/storage/buffer/bufmgr.c:2971
No locals.
#4 0x00005645dfa9ac5e in get_relation_info (root=root@entry=0x5645e1ed9840, relationObjectId=16660, inhparent=<optimized out>, rel=rel@entry=0x5645e2086b38) at ../src/backend/optimizer/util/plancat.c:442
indexoid = <optimized out>
info = 0x5645e2083b28
i = <optimized out>
indexRelation = 0x7fb54d54e470
index = 0x7fb54d548c48
amroutine = <optimized out>
ncolumns = 1
nkeycolumns = 1
l__state = {l = <optimized out>, i = <optimized out>}
indexoidlist = 0x5645e2088a98
lmode = 1
l = <optimized out>
varno = 1
relation = 0x7fb54d54e680
hasindex = <optimized out>
indexinfos = 0x0
__func__ = "get_relation_info"
#5 0x00005645dfaa5e25 in build_simple_rel (root=0x5645e1ed9840, relid=1, parent=parent@entry=0x0) at ../src/backend/optimizer/util/relnode.c:293
rel = 0x5645e2086b38
rte = 0x5645e1ed8fc8
__func__ = "build_simple_rel"
...

Greetings,

Andres Freund

#30

https://api.cirrus-ci.com/v1/task/6190372803051520/logs/cores.log

A.Roland@index.de

about 3 years ago

In reply to: Andres Freund (#29)

1 attachment(s)

Re: missing indexes in indexlist with partitioned tables

Thank you!

Sadly I didn't manage how to reproduce that locally. check-world doesn't seem to fail at my end.

That being said, attached patch should fix the issue reported below.

I'll have another look at the log later this week.

Regards

Arne

________________________________
From: Andres Freund <andres@anarazel.de>
Sent: Tuesday, November 22, 2022 2:36:59 AM
To: Arne Roland
Cc: David Rowley; Amit Langote; pgsql-hackers; Zhihong Yu; Alvaro Herrera; Julien Rouhaud
Subject: Re: missing indexes in indexlist with partitioned tables

Hi,

On 2022-11-02 01:50:38 +0000, Arne Roland wrote:

I mainly changed the comments, the Assert and some casing.

The tests have been failing for a while
https://cirrus-ci.com/github/postgresql-cfbot/postgresql/commitfest/40/3452
Cirrus CI<https://cirrus-ci.com/github/postgresql-cfbot/postgresql/commitfest/40/3452>
cirrus-ci.com
Cirrus CI makes your development cycle fast, efficient, and secure by leveraging modern cloud technologies.

Greetings,

Andres Freund

Attachments:

v10-0001-indexlist.patchtext/x-patch; name=v10-0001-indexlist.patchDownload

From e2c0667eb3ad86db2f13369e9692065d57aa7b1c Mon Sep 17 00:00:00 2001
From: Arne Roland <arne.roland@index.de>
Date: Tue, 6 Dec 2022 01:40:21 +0100
Subject: [PATCH v10] v10

---
 src/backend/optimizer/util/plancat.c         | 289 ++++++++++---------
 src/backend/utils/adt/selfuncs.c             |   4 +
 src/include/nodes/pathnodes.h                |  11 +-
 src/test/regress/expected/inherit.out        |   2 +
 src/test/regress/expected/join.out           |  52 +++-
 src/test/regress/expected/partition_join.out |   6 +-
 src/test/regress/sql/inherit.sql             |   4 +
 src/test/regress/sql/join.sql                |  34 ++-
 src/test/regress/sql/partition_join.sql      |   6 +-
 9 files changed, 264 insertions(+), 144 deletions(-)

diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 9defe37836..008bd38785 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -106,10 +106,12 @@ static void set_baserel_partition_constraint(Relation relation,
  * cases these are left as zeroes, but sometimes we need to compute attr
  * widths here, and we may as well cache the results for costsize.c.
  *
- * If inhparent is true, all we need to do is set up the attr arrays:
- * the RelOptInfo actually represents the appendrel formed by an inheritance
- * tree, and so the parent rel's physical size and index information isn't
- * important for it.
+ * If inhparent is true, we don't care about physical size, index am
+ * and estimates.
+ * We still need index uniqueness for join removal.
+ * When it comes to the path, the RelOptInfo actually represents
+ * the appendrel formed by an inheritance tree, and so the parent
+ * rel's physical size isn't important for it.
  */
 void
 get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
@@ -175,10 +177,12 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 
 	/*
 	 * Make list of indexes.  Ignore indexes on system catalogs if told to.
-	 * Don't bother with indexes for an inheritance parent, either.
+	 * Don't bother with indexes from traditional inheritance parents.
+	 * We care about partitioned indexes for further optimisations, like join
+	 * removal, even if we don't have any am for them.
 	 */
-	if (inhparent ||
-		(IgnoreSystemIndexes && IsSystemRelation(relation)))
+	if ((inhparent && relation->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+		|| (IgnoreSystemIndexes && IsSystemRelation(relation)))
 		hasindex = false;
 	else
 		hasindex = relation->rd_rel->relhasindex;
@@ -209,8 +213,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			IndexAmRoutine *amroutine;
 			IndexOptInfo *info;
 			int			ncolumns,
-						nkeycolumns;
-			int			i;
+						nkeycolumns,
+						i;
 
 			/*
 			 * Extract info from the relation descriptor for the index.
@@ -231,16 +235,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 				continue;
 			}
 
-			/*
-			 * Ignore partitioned indexes, since they are not usable for
-			 * queries.
-			 */
-			if (indexRelation->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
-			{
-				index_close(indexRelation, NoLock);
-				continue;
-			}
-
 			/*
 			 * If the index is valid, but cannot yet be used, ignore it; but
 			 * mark the plan we are generating as transient. See
@@ -285,108 +279,133 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 
 			info->relam = indexRelation->rd_rel->relam;
 
-			/* We copy just the fields we need, not all of rd_indam */
-			amroutine = indexRelation->rd_indam;
-			info->amcanorderbyop = amroutine->amcanorderbyop;
-			info->amoptionalkey = amroutine->amoptionalkey;
-			info->amsearcharray = amroutine->amsearcharray;
-			info->amsearchnulls = amroutine->amsearchnulls;
-			info->amcanparallel = amroutine->amcanparallel;
-			info->amhasgettuple = (amroutine->amgettuple != NULL);
-			info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
-				relation->rd_tableam->scan_bitmap_next_block != NULL;
-			info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
-								  amroutine->amrestrpos != NULL);
-			info->amcostestimate = amroutine->amcostestimate;
-			Assert(info->amcostestimate != NULL);
-
-			/* Fetch index opclass options */
-			info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
-
 			/*
-			 * Fetch the ordering information for the index, if any.
+			 * We don't have am for partitioned indexes, therefore we skip
+			 * gathering the info. This is ok, because we don't use
+			 * partitioned indexes in paths. We resolve inheritance before
+			 * generating paths.
 			 */
-			if (info->relam == BTREE_AM_OID)
+			if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
 			{
+				/* We copy just the fields we need, not all of rd_indam */
+				amroutine = indexRelation->rd_indam;
+				info->amcanorderbyop = amroutine->amcanorderbyop;
+				info->amoptionalkey = amroutine->amoptionalkey;
+				info->amsearcharray = amroutine->amsearcharray;
+				info->amsearchnulls = amroutine->amsearchnulls;
+				info->amcanparallel = amroutine->amcanparallel;
+				info->amhasgettuple = (amroutine->amgettuple != NULL);
+				info->amhasgetbitmap = amroutine->amgetbitmap != NULL &&
+					relation->rd_tableam->scan_bitmap_next_block != NULL;
+				info->amcanmarkpos = (amroutine->ammarkpos != NULL &&
+					amroutine->amrestrpos != NULL);
+				info->amcostestimate = amroutine->amcostestimate;
+				Assert(info->amcostestimate != NULL);
+
+				/* Fetch index opclass options */
+				info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true);
+
 				/*
-				 * If it's a btree index, we can use its opfamily OIDs
-				 * directly as the sort ordering opfamily OIDs.
+				 * Fetch the ordering information for the index, if any.
 				 */
-				Assert(amroutine->amcanorder);
-
-				info->sortopfamily = info->opfamily;
-				info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
-				info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
-
-				for (i = 0; i < nkeycolumns; i++)
+				if (info->relam == BTREE_AM_OID)
 				{
-					int16		opt = indexRelation->rd_indoption[i];
+					/*
+					 * If it's a btree index, we can use its opfamily OIDs
+					 * directly as the sort ordering opfamily OIDs.
+					 */
+					Assert(amroutine->amcanorder);
 
-					info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
-					info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
-				}
-			}
-			else if (amroutine->amcanorder)
-			{
-				/*
-				 * Otherwise, identify the corresponding btree opfamilies by
-				 * trying to map this index's "<" operators into btree.  Since
-				 * "<" uniquely defines the behavior of a sort order, this is
-				 * a sufficient test.
-				 *
-				 * XXX This method is rather slow and also requires the
-				 * undesirable assumption that the other index AM numbers its
-				 * strategies the same as btree.  It'd be better to have a way
-				 * to explicitly declare the corresponding btree opfamily for
-				 * each opfamily of the other index type.  But given the lack
-				 * of current or foreseeable amcanorder index types, it's not
-				 * worth expending more effort on now.
-				 */
-				info->sortopfamily = (Oid *) palloc(sizeof(Oid) * nkeycolumns);
-				info->reverse_sort = (bool *) palloc(sizeof(bool) * nkeycolumns);
-				info->nulls_first = (bool *) palloc(sizeof(bool) * nkeycolumns);
+					info->sortopfamily = info->opfamily;
+					info->reverse_sort = (bool *)palloc(sizeof(bool) * nkeycolumns);
+					info->nulls_first = (bool *)palloc(sizeof(bool) * nkeycolumns);
 
-				for (i = 0; i < nkeycolumns; i++)
-				{
-					int16		opt = indexRelation->rd_indoption[i];
-					Oid			ltopr;
-					Oid			btopfamily;
-					Oid			btopcintype;
-					int16		btstrategy;
-
-					info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
-					info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
-
-					ltopr = get_opfamily_member(info->opfamily[i],
-												info->opcintype[i],
-												info->opcintype[i],
-												BTLessStrategyNumber);
-					if (OidIsValid(ltopr) &&
-						get_ordering_op_properties(ltopr,
-												   &btopfamily,
-												   &btopcintype,
-												   &btstrategy) &&
-						btopcintype == info->opcintype[i] &&
-						btstrategy == BTLessStrategyNumber)
+					for (i = 0; i < nkeycolumns; i++)
 					{
-						/* Successful mapping */
-						info->sortopfamily[i] = btopfamily;
+						int16		opt = indexRelation->rd_indoption[i];
+
+						info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
+						info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
 					}
-					else
+				}
+				else if (amroutine->amcanorder)
+				{
+					/*
+					 * Otherwise, identify the corresponding btree opfamilies by
+					 * trying to map this index's "<" operators into btree.  Since
+					 * "<" uniquely defines the behavior of a sort order, this is
+					 * a sufficient test.
+					 *
+					 * XXX This method is rather slow and also requires the
+					 * undesirable assumption that the other index AM numbers its
+					 * strategies the same as btree.  It'd be better to have a way
+					 * to explicitly declare the corresponding btree opfamily for
+					 * each opfamily of the other index type.  But given the lack
+					 * of current or foreseeable amcanorder index types, it's not
+					 * worth expending more effort on now.
+					 */
+					info->sortopfamily = (Oid *)palloc(sizeof(Oid) * nkeycolumns);
+					info->reverse_sort = (bool *)palloc(sizeof(bool) * nkeycolumns);
+					info->nulls_first = (bool *)palloc(sizeof(bool) * nkeycolumns);
+
+					for (i = 0; i < nkeycolumns; i++)
 					{
-						/* Fail ... quietly treat index as unordered */
-						info->sortopfamily = NULL;
-						info->reverse_sort = NULL;
-						info->nulls_first = NULL;
-						break;
+						int16		opt = indexRelation->rd_indoption[i];
+						Oid			ltopr;
+						Oid			btopfamily;
+						Oid			btopcintype;
+						int16		btstrategy;
+
+						info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0;
+						info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0;
+
+						ltopr = get_opfamily_member(info->opfamily[i],
+							info->opcintype[i],
+							info->opcintype[i],
+							BTLessStrategyNumber);
+						if (OidIsValid(ltopr) &&
+							get_ordering_op_properties(ltopr,
+								&btopfamily,
+								&btopcintype,
+								&btstrategy) &&
+							btopcintype == info->opcintype[i] &&
+							btstrategy == BTLessStrategyNumber)
+						{
+							/* Successful mapping */
+							info->sortopfamily[i] = btopfamily;
+						}
+						else
+						{
+							/* Fail ... quietly treat index as unordered */
+							info->sortopfamily = NULL;
+							info->reverse_sort = NULL;
+							info->nulls_first = NULL;
+							break;
+						}
 					}
 				}
+				else
+				{
+					info->sortopfamily = NULL;
+					info->reverse_sort = NULL;
+					info->nulls_first = NULL;
+				}
 			}
 			else
 			{
 				info->sortopfamily = NULL;
 				info->reverse_sort = NULL;
 				info->nulls_first = NULL;
+
+				info->amcanorderbyop = false;
+				info->amoptionalkey = false;
+				info->amsearcharray = false;
+				info->amsearchnulls = false;
+				info->amcanparallel = false;
+				info->amhasgettuple = false;
+				info->amhasgetbitmap = false;
+				info->amcanmarkpos = false;
+				info->amcostestimate = NULL;
 			}
 
 			/*
@@ -411,38 +430,42 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
 			info->immediate = index->indimmediate;
 			info->hypothetical = false;
 
-			/*
-			 * Estimate the index size.  If it's not a partial index, we lock
-			 * the number-of-tuples estimate to equal the parent table; if it
-			 * is partial then we have to use the same methods as we would for
-			 * a table, except we can be sure that the index is not larger
-			 * than the table.
-			 */
-			if (info->indpred == NIL)
-			{
-				info->pages = RelationGetNumberOfBlocks(indexRelation);
-				info->tuples = rel->tuples;
-			}
-			else
-			{
-				double		allvisfrac; /* dummy */
-
-				estimate_rel_size(indexRelation, NULL,
-								  &info->pages, &info->tuples, &allvisfrac);
-				if (info->tuples > rel->tuples)
-					info->tuples = rel->tuples;
-			}
-
-			if (info->relam == BTREE_AM_OID)
+			/* Skip size and btree calculation if there isn't really a btree. */
+			if (indexRelation->rd_rel->relkind != RELKIND_PARTITIONED_INDEX)
 			{
-				/* For btrees, get tree height while we have the index open */
-				info->tree_height = _bt_getrootheight(indexRelation);
-			}
-			else
-			{
-				/* For other index types, just set it to "unknown" for now */
-				info->tree_height = -1;
-			}
+                            /*
+                             * Estimate the index size.  If it's not a partial index, we lock
+                             * the number-of-tuples estimate to equal the parent table; if it
+                             * is partial then we have to use the same methods as we would for
+                             * a table, except we can be sure that the index is not larger
+                             * than the table.
+                             */
+                            if (info->indpred == NIL)
+                            {
+                                    info->pages = RelationGetNumberOfBlocks(indexRelation);
+                                    info->tuples = rel->tuples;
+                            }
+                            else
+                            {
+                                    double		allvisfrac; /* dummy */
+
+                                    estimate_rel_size(indexRelation, NULL,
+                                                                      &info->pages, &info->tuples, &allvisfrac);
+                                    if (info->tuples > rel->tuples)
+                                            info->tuples = rel->tuples;
+                            }
+
+                            if (info->relam == BTREE_AM_OID)
+                            {
+                                    /* For btrees, get tree height while we have the index open */
+                                    info->tree_height = _bt_getrootheight(indexRelation);
+                            }
+                            else
+                            {
+                                    /* For other index types, just set it to "unknown" for now */
+                                    info->tree_height = -1;
+                            }
+                        }
 
 			index_close(indexRelation, NoLock);
 
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 48858a871a..878e6e2f2b 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -5993,6 +5993,10 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
 	rte = root->simple_rte_array[rel->relid];
 	Assert(rte->rtekind == RTE_RELATION);
 
+	/* ignore partitioned tables.  Any indexes here are not real indexes */
+	if (rte->relkind == RELKIND_PARTITIONED_TABLE)
+		return false;
+
 	/* Search through the indexes to see if any match our problem */
 	foreach(lc, rel->indexlist)
 	{
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 12624e6adb..b3ccff1017 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -634,7 +634,7 @@ typedef struct PartitionSchemeData *PartitionScheme;
  *		lateral_relids - required outer rels for LATERAL, as a Relids set
  *			(includes both direct and indirect lateral references)
  *
- * If the relation is a base relation it will have these fields set:
+ * If the relation is a base relation, it will have these fields set:
  *
  *		relid - RTE index (this is redundant with the relids field, but
  *				is provided for convenience of access)
@@ -649,8 +649,6 @@ typedef struct PartitionSchemeData *PartitionScheme;
  *					   Vars and PlaceHolderVars)
  *		lateral_referencers - relids of rels that reference this one laterally
  *				(includes both direct and indirect lateral references)
- *		indexlist - list of IndexOptInfo nodes for relation's indexes
- *					(always NIL if it's not a table)
  *		pages - number of disk pages in relation (zero if not a table)
  *		tuples - number of tuples in relation (not considering restrictions)
  *		allvisfrac - fraction of disk pages that are marked all-visible
@@ -666,6 +664,13 @@ typedef struct PartitionSchemeData *PartitionScheme;
  *		For otherrels that are appendrel members, these fields are filled
  *		in just as for a baserel, except we don't bother with lateral_vars.
  *
+ * If the relation is either a base relation or a partitioned table, it will have a set:
+ *		indexlist - list of IndexOptInfo nodes for relation's indexes
+ *		                        (for patitioned indexes not all of the
+ *		                            IndexOptInfo fields are set, since they
+ *		                            can't be accessed)
+ *					(always NIL if it's not a table)
+ *
  * If the relation is either a foreign table or a join of foreign tables that
  * all belong to the same foreign server and are assigned to the same user to
  * check access permissions as (cf checkAsUser), these fields will be set:
diff --git a/src/test/regress/expected/inherit.out b/src/test/regress/expected/inherit.out
index 2d49e765de..80fc76ed32 100644
--- a/src/test/regress/expected/inherit.out
+++ b/src/test/regress/expected/inherit.out
@@ -1,6 +1,8 @@
 --
 -- Test inheritance features
 --
+-- XXX These tables are not dropped. Why?
+-- XXX Are they needed for later tests? For dump-restore? Dropping them doesn't fail any test from check-world.
 CREATE TABLE a (aa TEXT);
 CREATE TABLE b (bb TEXT) INHERITS (a);
 CREATE TABLE c (cc TEXT) INHERITS (a);
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
index b8d43e4c14..822230da3a 100644
--- a/src/test/regress/expected/join.out
+++ b/src/test/regress/expected/join.out
@@ -4699,7 +4699,8 @@ select a.unique1, b.unique2
 (1 row)
 
 --
--- test join removal
+-- test join removal for plain tables
+-- we have almost the smare tests for partitioned tables below
 --
 begin;
 CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int);
@@ -4831,6 +4832,55 @@ select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
          Filter: (a.id = i)
 (4 rows)
 
+rollback;
+--
+-- test join removal for partitioned tables
+-- this is analog to the non partitioned case above
+--
+begin;
+CREATE TABLE b_p (id int PRIMARY KEY, c_id int) partition by range(id);
+CREATE TABLE b_m partition of b_p for values from (0) to (10) partition by range(id);
+CREATE TABLE b_c partition of b_m for values from (0) to (10);
+CREATE TABLE c_p (id int PRIMARY KEY) partition by range(id);
+CREATE TABLE c_m partition of c_p for values from (0) to (10) partition by range(id);
+CREATE TABLE c_c partition of c_m for values from (0) to (10);
+INSERT INTO b_p VALUES (0, 0), (1, NULL);
+INSERT INTO c_p VALUES (0), (1);
+-- all three cases should be optimizable into a simple seqscan
+EXPLAIN (COSTS OFF) SELECT b0.* FROM b_p b0 LEFT JOIN b_p b ON b0.c_id = b.id;
+     QUERY PLAN     
+--------------------
+ Seq Scan on b_c b0
+(1 row)
+
+EXPLAIN (COSTS OFF) SELECT b.* FROM b_p b LEFT JOIN c_p c ON b.c_id = c.id;
+    QUERY PLAN     
+-------------------
+ Seq Scan on b_c b
+(1 row)
+
+EXPLAIN (COSTS OFF)
+  SELECT b0.* FROM b_p b0 LEFT JOIN (b_p b LEFT JOIN c_p c on b.c_id = c.id)
+  ON (b0.c_id = b.id);
+     QUERY PLAN     
+--------------------
+ Seq Scan on b_c b0
+(1 row)
+
+-- check optimization of outer join within another special join
+EXPLAIN (COSTS OFF)
+select id from b_p b0 WHERE id in (
+	SELECT b.id FROM b_p b LEFT JOIN c_p c on b.id = c.id
+);
+          QUERY PLAN           
+-------------------------------
+ Hash Join
+   Hash Cond: (b0.id = b.id)
+   ->  Seq Scan on b_c b0
+   ->  Hash
+         ->  Seq Scan on b_c b
+(5 rows)
+
 rollback;
 create temp table parent (k int primary key, pd int);
 create temp table child (k int unique, cd int);
diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out
index b20facc19f..dd8ad1c71a 100644
--- a/src/test/regress/expected/partition_join.out
+++ b/src/test/regress/expected/partition_join.out
@@ -4867,13 +4867,13 @@ CREATE TABLE fract_t (id BIGINT, PRIMARY KEY (id)) PARTITION BY RANGE (id);
 CREATE TABLE fract_t0 PARTITION OF fract_t FOR VALUES FROM ('0') TO ('1000');
 CREATE TABLE fract_t1 PARTITION OF fract_t FOR VALUES FROM ('1000') TO ('2000');
 -- insert data
-INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
+INSERT INTO fract_t (id) SELECT i FROM generate_series(0, 1999) i;
 ANALYZE fract_t;
 -- verify plan; nested index only scans
 SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
                               QUERY PLAN                               
 -----------------------------------------------------------------------
  Limit
@@ -4890,7 +4890,7 @@ SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
 (11 rows)
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
                                    QUERY PLAN                                   
 --------------------------------------------------------------------------------
  Limit
diff --git a/src/test/regress/sql/inherit.sql b/src/test/regress/sql/inherit.sql
index 195aedb5ff..69600bb068 100644
--- a/src/test/regress/sql/inherit.sql
+++ b/src/test/regress/sql/inherit.sql
@@ -1,6 +1,10 @@
 --
 -- Test inheritance features
 --
+
+-- XXX These tables are not dropped. Why?
+-- XXX Are they needed for later tests? For dump-restore? Dropping them doesn't fail any test from check-world.
+
 CREATE TABLE a (aa TEXT);
 CREATE TABLE b (bb TEXT) INHERITS (a);
 CREATE TABLE c (cc TEXT) INHERITS (a);
diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql
index 65aab85c35..8424473630 100644
--- a/src/test/regress/sql/join.sql
+++ b/src/test/regress/sql/join.sql
@@ -1627,7 +1627,8 @@ select a.unique1, b.unique2
   where b.unique2 = any (select q1 from int8_tbl c where c.q1 < b.unique1);
 
 --
--- test join removal
+-- test join removal for plain tables
+-- we have almost the smare tests for partitioned tables below
 --
 
 begin;
@@ -1696,6 +1697,37 @@ select 1 from (select a.id FROM a left join b on a.b_id = b.id) q,
 
 rollback;
 
+--
+-- test join removal for partitioned tables
+-- this is analog to the non partitioned case above
+--
+
+begin;
+
+CREATE TABLE b_p (id int PRIMARY KEY, c_id int) partition by range(id);
+CREATE TABLE b_m partition of b_p for values from (0) to (10) partition by range(id);
+CREATE TABLE b_c partition of b_m for values from (0) to (10);
+CREATE TABLE c_p (id int PRIMARY KEY) partition by range(id);
+CREATE TABLE c_m partition of c_p for values from (0) to (10) partition by range(id);
+CREATE TABLE c_c partition of c_m for values from (0) to (10);
+INSERT INTO b_p VALUES (0, 0), (1, NULL);
+INSERT INTO c_p VALUES (0), (1);
+
+-- all three cases should be optimizable into a simple seqscan
+EXPLAIN (COSTS OFF) SELECT b0.* FROM b_p b0 LEFT JOIN b_p b ON b0.c_id = b.id;
+EXPLAIN (COSTS OFF) SELECT b.* FROM b_p b LEFT JOIN c_p c ON b.c_id = c.id;
+EXPLAIN (COSTS OFF)
+  SELECT b0.* FROM b_p b0 LEFT JOIN (b_p b LEFT JOIN c_p c on b.c_id = c.id)
+  ON (b0.c_id = b.id);
+
+-- check optimization of outer join within another special join
+EXPLAIN (COSTS OFF)
+select id from b_p b0 WHERE id in (
+	SELECT b.id FROM b_p b LEFT JOIN c_p c on b.id = c.id
+);
+
+rollback;
+
 create temp table parent (k int primary key, pd int);
 create temp table child (k int unique, cd int);
 insert into parent values (1, 10), (2, 20), (3, 30);
diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql
index 67f506361f..94f48a7c51 100644
--- a/src/test/regress/sql/partition_join.sql
+++ b/src/test/regress/sql/partition_join.sql
@@ -1149,7 +1149,7 @@ CREATE TABLE fract_t0 PARTITION OF fract_t FOR VALUES FROM ('0') TO ('1000');
 CREATE TABLE fract_t1 PARTITION OF fract_t FOR VALUES FROM ('1000') TO ('2000');
 
 -- insert data
-INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
+INSERT INTO fract_t (id) SELECT i FROM generate_series(0, 1999) i;
 ANALYZE fract_t;
 
 -- verify plan; nested index only scans
@@ -1157,10 +1157,10 @@ SET max_parallel_workers_per_gather = 0;
 SET enable_partitionwise_join = on;
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id ASC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
 
 EXPLAIN (COSTS OFF)
-SELECT * FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY id DESC LIMIT 10;
+SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
 
 -- cleanup
 DROP TABLE fract_t;
-- 
2.35.3

#31