From 3d846ff4f4f311ade1e8c7afe1e7481249ca6d89 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 7 Jun 2020 16:58:42 -0500
Subject: [PATCH v10 2/8] Implement CLUSTER of partitioned table..

This requires either specification of a partitioned index on which to cluster,
or that an partitioned index was previously set clustered.

VACUUM (including vacuum full) has recursed into partition hierarchies since
partitions were introduced in v10 (3c3bb9933).
---
 doc/src/sgml/ref/cluster.sgml         |   7 ++
 src/backend/commands/cluster.c        | 173 +++++++++++++++++++-------
 src/bin/psql/tab-complete.c           |   1 +
 src/include/commands/cluster.h        |   1 +
 src/test/regress/expected/cluster.out |  58 ++++++++-
 src/test/regress/sql/cluster.sql      |  24 +++-
 6 files changed, 209 insertions(+), 55 deletions(-)

diff --git a/doc/src/sgml/ref/cluster.sgml b/doc/src/sgml/ref/cluster.sgml
index 0d9720fd8e..17509b35eb 100644
--- a/doc/src/sgml/ref/cluster.sgml
+++ b/doc/src/sgml/ref/cluster.sgml
@@ -197,6 +197,13 @@ CLUSTER [VERBOSE]
     in the <structname>pg_stat_progress_cluster</structname> view. See
     <xref linkend="cluster-progress-reporting"/> for details.
   </para>
+
+   <para>
+    Clustering a partitioned table clusters each of its partitions using the
+    partition of the specified partitioned index or (if not specified) the
+    partitioned index marked as clustered.
+   </para>
+
  </refsect1>
 
  <refsect1>
diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index 096a06f7b3..0c08ac56dc 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -32,7 +32,9 @@
 #include "catalog/index.h"
 #include "catalog/namespace.h"
 #include "catalog/objectaccess.h"
+#include "catalog/partition.h"
 #include "catalog/pg_am.h"
+#include "catalog/pg_inherits.h"
 #include "catalog/toasting.h"
 #include "commands/cluster.h"
 #include "commands/defrem.h"
@@ -73,6 +75,9 @@ static void copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex,
 							bool verbose, bool *pSwapToastByContent,
 							TransactionId *pFreezeXid, MultiXactId *pCutoffMulti);
 static List *get_tables_to_cluster(MemoryContext cluster_context);
+static List *get_tables_to_cluster_partitioned(MemoryContext cluster_context,
+		Oid indexOid);
+static void cluster_multiple_rels(List *rvs, int options);
 
 
 /*---------------------------------------------------------------------------
@@ -135,7 +140,7 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel)
 											AccessExclusiveLock,
 											0,
 											RangeVarCallbackOwnsTable, NULL);
-		rel = table_open(tableOid, NoLock);
+		rel = table_open(tableOid, ShareUpdateExclusiveLock);
 
 		/*
 		 * Reject clustering a remote temp table ... their local buffer
@@ -146,14 +151,6 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel)
 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 					 errmsg("cannot cluster temporary tables of other sessions")));
 
-		/*
-		 * Reject clustering a partitioned table.
-		 */
-		if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
-			ereport(ERROR,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("cannot cluster a partitioned table")));
-
 		if (stmt->indexname == NULL)
 		{
 			ListCell   *index;
@@ -189,10 +186,34 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel)
 		}
 
 		/* close relation, keep lock till commit */
-		table_close(rel, NoLock);
+		table_close(rel, ShareUpdateExclusiveLock);
 
-		/* Do the job. */
-		cluster_rel(tableOid, indexOid, &params);
+		if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE)
+		{
+			/* Do the job. */
+			cluster_rel(tableOid, indexOid, &params);
+		}
+		else
+		{
+			List	   *rvs;
+			MemoryContext cluster_context;
+
+			/* Refuse to hold strong locks in a user transaction */
+			PreventInTransactionBlock(isTopLevel, "CLUSTER");
+
+			cluster_context = AllocSetContextCreate(PortalContext,
+												"Cluster",
+												ALLOCSET_DEFAULT_SIZES);
+
+			rvs = get_tables_to_cluster_partitioned(cluster_context, indexOid);
+			cluster_multiple_rels(rvs, params.options);
+
+			/* Start a new transaction for the cleanup work. */
+			StartTransactionCommand();
+
+			/* Clean up working storage */
+			MemoryContextDelete(cluster_context);
+		}
 	}
 	else
 	{
@@ -202,7 +223,6 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel)
 		 */
 		MemoryContext cluster_context;
 		List	   *rvs;
-		ListCell   *rv;
 
 		/*
 		 * We cannot run this form of CLUSTER inside a user transaction block;
@@ -225,28 +245,7 @@ cluster(ParseState *pstate, ClusterStmt *stmt, bool isTopLevel)
 		 * cluster_context.
 		 */
 		rvs = get_tables_to_cluster(cluster_context);
-
-		/* Commit to get out of starting transaction */
-		PopActiveSnapshot();
-		CommitTransactionCommand();
-
-		/* Ok, now that we've got them all, cluster them one by one */
-		foreach(rv, rvs)
-		{
-			RelToCluster *rvtc = (RelToCluster *) lfirst(rv);
-			ClusterParams cluster_params = params;
-
-			/* Start a new transaction for each relation. */
-			StartTransactionCommand();
-			/* functions in indexes may want a snapshot set */
-			PushActiveSnapshot(GetTransactionSnapshot());
-			/* Do the job. */
-			cluster_params.options |= CLUOPT_RECHECK;
-			cluster_rel(rvtc->tableOid, rvtc->indexOid,
-						&cluster_params);
-			PopActiveSnapshot();
-			CommitTransactionCommand();
-		}
+		cluster_multiple_rels(rvs, params.options | CLUOPT_RECHECK_ISCLUSTERED);
 
 		/* Start a new transaction for the cleanup work. */
 		StartTransactionCommand();
@@ -352,9 +351,10 @@ cluster_rel(Oid tableOid, Oid indexOid, ClusterParams *params)
 			}
 
 			/*
-			 * Check that the index is still the one with indisclustered set.
+			 * Check that the index is still the one with indisclustered set, if needed.
 			 */
-			if (!get_index_isclustered(indexOid))
+			if ((params->options & CLUOPT_RECHECK_ISCLUSTERED) != 0 &&
+					!get_index_isclustered(indexOid))
 			{
 				relation_close(OldHeap, AccessExclusiveLock);
 				pgstat_progress_end_command();
@@ -398,8 +398,13 @@ cluster_rel(Oid tableOid, Oid indexOid, ClusterParams *params)
 
 	/* Check heap and index are valid to cluster on */
 	if (OidIsValid(indexOid))
+	{
 		check_index_is_clusterable(OldHeap, indexOid, recheck, AccessExclusiveLock);
 
+		/* Mark the index as clustered */
+		mark_index_clustered(OldHeap, indexOid, true);
+	}
+
 	/*
 	 * Quietly ignore the request if this is a materialized view which has not
 	 * been populated from its query. No harm is done because there is no data
@@ -415,6 +420,14 @@ cluster_rel(Oid tableOid, Oid indexOid, ClusterParams *params)
 		return;
 	}
 
+	/* For a partitioned rel, we're done. */
+	if (!RELKIND_HAS_STORAGE(get_rel_relkind(tableOid)))
+	{
+		relation_close(OldHeap, AccessExclusiveLock);
+		pgstat_progress_end_command();
+		return;
+	}
+
 	/*
 	 * All predicate locks on the tuples or pages are about to be made
 	 * invalid, because we move tuples around.  Promote them to relation
@@ -483,6 +496,9 @@ check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMOD
 	 * the worst consequence of following broken HOT chains would be that we
 	 * might put recently-dead tuples out-of-order in the new table, and there
 	 * is little harm in that.)
+	 *
+	 * This also refuses to cluster on an "incomplete" partitioned index
+	 * created with "ON ONLY".
 	 */
 	if (!OldIndex->rd_index->indisvalid)
 		ereport(ERROR,
@@ -507,12 +523,6 @@ mark_index_clustered(Relation rel, Oid indexOid, bool is_internal)
 	Relation	pg_index;
 	ListCell   *index;
 
-	/* Disallow applying to a partitioned table */
-	if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("cannot mark index clustered in partitioned table")));
-
 	/*
 	 * If the index is already marked clustered, no need to do anything.
 	 */
@@ -584,10 +594,6 @@ rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose)
 	TransactionId frozenXid;
 	MultiXactId cutoffMulti;
 
-	/* Mark the correct index as clustered */
-	if (OidIsValid(indexOid))
-		mark_index_clustered(OldHeap, indexOid, true);
-
 	/* Remember info about rel before closing OldHeap */
 	relpersistence = OldHeap->rd_rel->relpersistence;
 	is_system_catalog = IsSystemRelation(OldHeap);
@@ -1582,3 +1588,76 @@ get_tables_to_cluster(MemoryContext cluster_context)
 
 	return rvs;
 }
+
+/*
+ * Return a List of tables and associated index, where each index is a
+ * partition of the given index
+ */
+static List *
+get_tables_to_cluster_partitioned(MemoryContext cluster_context, Oid indexOid)
+{
+	List		*inhoids;
+	ListCell	*lc;
+	List		*rvs = NIL;
+	MemoryContext	old_context;
+
+	inhoids = find_all_inheritors(indexOid, NoLock, NULL);
+
+	/*
+	 * We have to build the list in a different memory context so it will
+	 * survive the cross-transaction processing
+	 */
+	old_context = MemoryContextSwitchTo(cluster_context);
+
+	foreach(lc, inhoids)
+	{
+		Oid		indexrelid = lfirst_oid(lc);
+		Oid		relid = IndexGetRelation(indexrelid, false);
+		RelToCluster	*rvtc;
+
+		/*
+		 * Partitioned rels (including the top indexOid) are included,
+		 * so as to be processed by cluster_rel, which calls
+		 * check_index_is_clusterable() and mark_index_clustered().
+		 */
+		rvtc = (RelToCluster *) palloc(sizeof(RelToCluster));
+		rvtc->tableOid = relid;
+		rvtc->indexOid = indexrelid;
+		rvs = lappend(rvs, rvtc);
+	}
+
+	MemoryContextSwitchTo(old_context);
+	return rvs;
+}
+
+/* Cluster each relation in a separate transaction */
+static void
+cluster_multiple_rels(List *rvs, int options)
+{
+	ListCell *lc;
+
+	/* Commit to get out of starting transaction */
+	PopActiveSnapshot();
+	CommitTransactionCommand();
+
+	/* Ok, now that we've got them all, cluster them one by one */
+	foreach(lc, rvs)
+	{
+		RelToCluster *rvtc = (RelToCluster *) lfirst(lc);
+		ClusterParams cluster_params = { .options = options, };
+
+		/* Start a new transaction for each relation. */
+		StartTransactionCommand();
+
+		/* functions in indexes may want a snapshot set */
+		PushActiveSnapshot(GetTransactionSnapshot());
+
+		/* Do the job. */
+		cluster_params.options |= CLUOPT_RECHECK;
+		cluster_rel(rvtc->tableOid, rvtc->indexOid,
+					&cluster_params);
+
+		PopActiveSnapshot();
+		CommitTransactionCommand();
+	}
+}
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index a053bc1e45..3fd192b8d2 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -588,6 +588,7 @@ static const SchemaQuery Query_for_list_of_clusterables = {
 	.catname = "pg_catalog.pg_class c",
 	.selcondition =
 	"c.relkind IN (" CppAsString2(RELKIND_RELATION) ", "
+	CppAsString2(RELKIND_PARTITIONED_TABLE) ", "
 	CppAsString2(RELKIND_MATVIEW) ")",
 	.viscondition = "pg_catalog.pg_table_is_visible(c.oid)",
 	.namespace = "c.relnamespace",
diff --git a/src/include/commands/cluster.h b/src/include/commands/cluster.h
index a941f2accd..c30ca01726 100644
--- a/src/include/commands/cluster.h
+++ b/src/include/commands/cluster.h
@@ -22,6 +22,7 @@
 /* flag bits for ClusterParams->flags */
 #define CLUOPT_RECHECK 0x01		/* recheck relation state */
 #define CLUOPT_VERBOSE 0x02		/* print progress info */
+#define CLUOPT_RECHECK_ISCLUSTERED 0x04	/* recheck relation state for indisclustered */
 
 /* options for CLUSTER */
 typedef struct ClusterParams
diff --git a/src/test/regress/expected/cluster.out b/src/test/regress/expected/cluster.out
index e46a66952f..8fb496434e 100644
--- a/src/test/regress/expected/cluster.out
+++ b/src/test/regress/expected/cluster.out
@@ -444,14 +444,62 @@ DROP TABLE clustertest;
 CREATE TABLE clustertest (f1 int PRIMARY KEY);
 CLUSTER clustertest USING clustertest_pkey;
 CLUSTER clustertest;
--- Check that partitioned tables cannot be clustered
+-- Check that partitioned tables can be clustered
 CREATE TABLE clstrpart (a int) PARTITION BY RANGE (a);
+CREATE TABLE clstrpart1 PARTITION OF clstrpart FOR VALUES FROM (1)TO(10) PARTITION BY RANGE (a);
+CREATE TABLE clstrpart11 PARTITION OF clstrpart1 FOR VALUES FROM (1)TO(10);
+CREATE TABLE clstrpart12 PARTITION OF clstrpart1 FOR VALUES FROM (10)TO(20) PARTITION BY RANGE(a);
+CREATE TABLE clstrpart2 PARTITION OF clstrpart FOR VALUES FROM (20)TO(30);
+CREATE TABLE clstrpart3 PARTITION OF clstrpart DEFAULT PARTITION BY RANGE(a);
+CREATE TABLE clstrpart33 PARTITION OF clstrpart3 DEFAULT;
+ALTER TABLE clstrpart SET WITHOUT CLUSTER;
+CREATE INDEX clstrpart_only_idx ON ONLY clstrpart (a);
+CLUSTER clstrpart USING clstrpart_only_idx; -- fails
+ERROR:  cannot cluster on invalid index "clstrpart_only_idx"
+DROP INDEX clstrpart_only_idx;
 CREATE INDEX clstrpart_idx ON clstrpart (a);
-ALTER TABLE clstrpart CLUSTER ON clstrpart_idx;
-ERROR:  cannot mark index clustered in partitioned table
+-- Check that clustering sets new relfilenodes:
+CREATE TEMP TABLE old_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ;
 CLUSTER clstrpart USING clstrpart_idx;
-ERROR:  cannot cluster a partitioned table
-DROP TABLE clstrpart;
+CREATE TEMP TABLE new_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ;
+SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM old_cluster_info AS old JOIN new_cluster_info AS new USING (relname) ORDER BY relname COLLATE "C";
+   relname   | level | relkind | ?column? 
+-------------+-------+---------+----------
+ clstrpart   |     0 | p       | t
+ clstrpart1  |     1 | p       | t
+ clstrpart11 |     2 | r       | f
+ clstrpart12 |     2 | p       | t
+ clstrpart2  |     1 | r       | f
+ clstrpart3  |     1 | p       | t
+ clstrpart33 |     2 | r       | f
+(7 rows)
+
+-- Check that clustering sets new indisclustered:
+SELECT relname, relkind, indisclustered FROM pg_partition_tree('clstrpart_idx'::regclass) AS tree JOIN pg_index i ON i.indexrelid=tree.relid JOIN pg_class c ON c.oid=indexrelid ORDER BY relname COLLATE "C";
+      relname      | relkind | indisclustered 
+-------------------+---------+----------------
+ clstrpart11_a_idx | i       | t
+ clstrpart12_a_idx | I       | t
+ clstrpart1_a_idx  | I       | t
+ clstrpart2_a_idx  | i       | t
+ clstrpart33_a_idx | i       | t
+ clstrpart3_a_idx  | I       | t
+ clstrpart_idx     | I       | t
+(7 rows)
+
+CLUSTER clstrpart1 USING clstrpart1_a_idx; -- partition which is itself partitioned
+CLUSTER clstrpart12 USING clstrpart12_a_idx; -- partition which is itself partitioned, no childs
+CLUSTER clstrpart2 USING clstrpart2_a_idx; -- leaf
+\d clstrpart
+       Partitioned table "public.clstrpart"
+ Column |  Type   | Collation | Nullable | Default 
+--------+---------+-----------+----------+---------
+ a      | integer |           |          | 
+Partition key: RANGE (a)
+Indexes:
+    "clstrpart_idx" btree (a) CLUSTER
+Number of partitions: 3 (Use \d+ to list them.)
+
 -- Test CLUSTER with external tuplesorting
 create table clstr_4 as select * from tenk1;
 create index cluster_sort on clstr_4 (hundred, thousand, tenthous);
diff --git a/src/test/regress/sql/cluster.sql b/src/test/regress/sql/cluster.sql
index aee9cf83e0..57fc661863 100644
--- a/src/test/regress/sql/cluster.sql
+++ b/src/test/regress/sql/cluster.sql
@@ -202,12 +202,30 @@ CREATE TABLE clustertest (f1 int PRIMARY KEY);
 CLUSTER clustertest USING clustertest_pkey;
 CLUSTER clustertest;
 
--- Check that partitioned tables cannot be clustered
+-- Check that partitioned tables can be clustered
 CREATE TABLE clstrpart (a int) PARTITION BY RANGE (a);
+CREATE TABLE clstrpart1 PARTITION OF clstrpart FOR VALUES FROM (1)TO(10) PARTITION BY RANGE (a);
+CREATE TABLE clstrpart11 PARTITION OF clstrpart1 FOR VALUES FROM (1)TO(10);
+CREATE TABLE clstrpart12 PARTITION OF clstrpart1 FOR VALUES FROM (10)TO(20) PARTITION BY RANGE(a);
+CREATE TABLE clstrpart2 PARTITION OF clstrpart FOR VALUES FROM (20)TO(30);
+CREATE TABLE clstrpart3 PARTITION OF clstrpart DEFAULT PARTITION BY RANGE(a);
+CREATE TABLE clstrpart33 PARTITION OF clstrpart3 DEFAULT;
+ALTER TABLE clstrpart SET WITHOUT CLUSTER;
+CREATE INDEX clstrpart_only_idx ON ONLY clstrpart (a);
+CLUSTER clstrpart USING clstrpart_only_idx; -- fails
+DROP INDEX clstrpart_only_idx;
 CREATE INDEX clstrpart_idx ON clstrpart (a);
-ALTER TABLE clstrpart CLUSTER ON clstrpart_idx;
+-- Check that clustering sets new relfilenodes:
+CREATE TEMP TABLE old_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ;
 CLUSTER clstrpart USING clstrpart_idx;
-DROP TABLE clstrpart;
+CREATE TEMP TABLE new_cluster_info AS SELECT relname, level, relfilenode, relkind FROM pg_partition_tree('clstrpart'::regclass) AS tree JOIN pg_class c ON c.oid=tree.relid ;
+SELECT relname, old.level, old.relkind, old.relfilenode = new.relfilenode FROM old_cluster_info AS old JOIN new_cluster_info AS new USING (relname) ORDER BY relname COLLATE "C";
+-- Check that clustering sets new indisclustered:
+SELECT relname, relkind, indisclustered FROM pg_partition_tree('clstrpart_idx'::regclass) AS tree JOIN pg_index i ON i.indexrelid=tree.relid JOIN pg_class c ON c.oid=indexrelid ORDER BY relname COLLATE "C";
+CLUSTER clstrpart1 USING clstrpart1_a_idx; -- partition which is itself partitioned
+CLUSTER clstrpart12 USING clstrpart12_a_idx; -- partition which is itself partitioned, no childs
+CLUSTER clstrpart2 USING clstrpart2_a_idx; -- leaf
+\d clstrpart
 
 -- Test CLUSTER with external tuplesorting
 
-- 
2.17.0

