From 0c48cf2daad6fba92623c4b9b7be60a60e1e3fab Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Tue, 23 Jul 2024 15:46:38 +0530 Subject: [PATCH v1 1/4] Catalog for (globalindexid, reloid) to PartitionId to mapping This is a base patch required for global indexes. Basically, global indexes stores tuple from multiple partitions of a partitioned relation so TIDs alone are insufficient for uniquely identifying tuples. We might consider storing the relation OID along with TID, as the combination of relation OID and TID is the most straightforward way to uniquely identify a heap tuple. However, this approach has its drawbacks. For instance, if a partition is detached, the global index would not know how to ignore the tuples from that detached partition unless it cleans out all tuples from that partition in the global index. Therefore, we need an identifier for each partition that is only valid while the partition is attached and becomes invalid once the partition is detached. We call this identifier a partition ID. However, when accessing tuples from the index, we still need to convert the partition ID to the heap OID at some point. One might assume a 1-to-1 mapping between partition IDs and relation OIDs, but this isn't feasible. In a multi-level partition hierarchy, detaching a partitioned table from a higher-level partitioned table invalidates the underlying leaf relation's partition ID for the global indexes of the parent from which it was detached. However, that partition ID should remain valid for the global indexes prersent at the lower level partitioned tables where the leaf partition is still attached. Therefore, for simplicity, we maintain a partition ID for each global index and leaf relation OID pair. This way, detaching a partition only requires invalidating the specific global index and partition ID combinations from which the leaf partition is being detached. This patch provides a catalog for storing these mappings and a cache for faster access. It also includes mechanisms for allocating partition IDs and managing insertions and deletions in the catalog. --- src/backend/catalog/Makefile | 4 +- src/backend/catalog/pg_index_partitions.c | 342 ++++++++++++++++++++++ src/include/c.h | 16 + src/include/catalog/Makefile | 3 +- src/include/catalog/pg_index_partitions.h | 84 ++++++ src/include/postgres.h | 19 ++ src/include/utils/rel.h | 7 + src/test/regress/expected/oidjoins.out | 2 + 8 files changed, 474 insertions(+), 3 deletions(-) create mode 100644 src/backend/catalog/pg_index_partitions.c create mode 100644 src/include/catalog/pg_index_partitions.h diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index c090094ed0..275910eda6 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -46,8 +46,8 @@ OBJS = \ pg_subscription.o \ pg_type.o \ storage.o \ - toasting.o - + toasting.o \ + pg_index_partitions.o include $(top_srcdir)/src/backend/common.mk .PHONY: install-data diff --git a/src/backend/catalog/pg_index_partitions.c b/src/backend/catalog/pg_index_partitions.c new file mode 100644 index 0000000000..e637feb453 --- /dev/null +++ b/src/backend/catalog/pg_index_partitions.c @@ -0,0 +1,342 @@ +/*------------------------------------------------------------------------- + * + * pg_index_partitions.c + * + * routines to support manipulation of the pg_index_partitions relation and + * also provide cache over this relation for faster mapping from partition + * id to reloid for a global index. + * + * Notes(why we need pg_index_partitions relation): + * + * This mapping is required for global indexes. Basically, global indexes + * stores tuple from multiple partitions of a partitioned relation so TIDs + * alone are insufficient for uniquely identifying tuples. We might consider + * storing the relation OID along with TID, as the combination of relation OID + * and TID is the most straightforward way to uniquely identify a heap tuple. + * + * However, this approach has its drawbacks. For instance, if a partition is + * detached, the global index would not know how to ignore the tuples from that + * detached partition unless it cleans out all tuples from that partition in + * the global index. + * + * Therefore, we need an identifier for each partition that is only valid while + * the partition is attached and becomes invalid once the partition is + * detached. We call this identifier a partition ID. However, when accessing + * tuples from the index, we still need to convert the partition ID to the heap + * OID at some point. One might assume a 1-to-1 mapping between partition IDs + * and relation OIDs, but this isn't feasible. In a multi-level partition + * hierarchy, detaching a partitioned table from a higher-level partitioned + * table invalidates the underlying leaf relation's partition ID for the global + * indexes of the parent from which it was detached. However, that partition + * ID should remain valid for the global indexes prersent at the lower level + * partitioned tables where the leaf partition is still attached. Therefore, + * for simplicity, we maintain a partition ID for each global index and leaf + * relation OID pair. This way, detaching a partition only requires + * invalidating the specific global index and partition ID combinations from + * which the leaf partition is being detached. + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/catalog/pg_index_partitions.c + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/stratnum.h" +#include "access/table.h" +#include "catalog/indexing.h" +#include "catalog/catalog.h" +#include "catalog/pg_index_partitions.h" +#include "partitioning/partdesc.h" +#include "utils/fmgroids.h" +#include "utils/inval.h" +#include "utils/rel.h" + +/* + * InsertIndexPartitionEntry - Insert parition id to reloid mapping + * + * Insert (indexoid, partid) to reloid mapping into pg_index_partitions table. + */ +void +InsertIndexPartitionEntry(Relation irel, Oid reloid, PartitionId partid) +{ + Datum values[Natts_pg_index_partitions]; + bool nulls[Natts_pg_index_partitions]; + HeapTuple tuple; + Relation rel; + Oid indexoid = RelationGetRelid(irel); + + rel = table_open(IndexPartitionsRelationId, RowExclusiveLock); + + /* Make the pg_index_partitions entry. */ + values[Anum_pg_index_partitions_indexoid - 1] = ObjectIdGetDatum(indexoid); + values[Anum_pg_index_partitions_reloid - 1] = ObjectIdGetDatum(reloid); + values[Anum_pg_index_partitions_partid - 1] = PartitionIdGetDatum(partid); + + memset(nulls, 0, sizeof(nulls)); + + tuple = heap_form_tuple(RelationGetDescr(rel), values, nulls); + + CatalogTupleInsert(rel, tuple); + + heap_freetuple(tuple); + + table_close(rel, RowExclusiveLock); +} + +/* + * DeleteIndexPartitionEntries - Delete all index partition entries. + * + * This will delete all the entires for given global index id from + * pg_index_partitions table. This should only be called when global index + * is being dropped. + */ +void +DeleteIndexPartitionEntries(Oid indrelid) +{ + Relation catalogRelation; + ScanKeyData key; + SysScanDesc scan; + HeapTuple tuple; + + /* Find pg_index_partitions entries by indrelid. */ + catalogRelation = table_open(IndexPartitionsRelationId, RowExclusiveLock); + ScanKeyInit(&key, + Anum_pg_index_partitions_indexoid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(indrelid)); + + scan = systable_beginscan(catalogRelation, IndexPartitionsIndexId, true, + NULL, 1, &key); + while (HeapTupleIsValid(tuple = systable_getnext(scan))) + CatalogTupleDelete(catalogRelation, &tuple->t_self); + + /* Done */ + systable_endscan(scan); + table_close(catalogRelation, RowExclusiveLock); +} + +/* + * BuildIndexPartitionInfo - Cache for parittion id to reloid mapping + * + * Build a cache for faster access to the mappping from partition id to the + * relation oid. For more detail on this mapping refer to the comments in + * pg_index_partition.h and also atop PartitionId declaration in c.h. + */ +void +BuildIndexPartitionInfo(Relation relation, MemoryContext context) +{ + SysScanDesc scan; + ScanKeyData key; + HeapTuple tuple; + Relation rel; + PartitionId maxpartid = InvalidPartitionId; + IndexPartitionInfo map; + MemoryContext oldcontext; + HASHCTL ctl; + + /* + * Open pg_index_partition table for getting the partition id to reloid + * mapping for the input index relation. + */ + rel = table_open(IndexPartitionsRelationId, AccessShareLock); + + oldcontext = MemoryContextSwitchTo(context); + map = (IndexPartitionInfoData *) palloc0(sizeof(IndexPartitionInfoData)); + map->context = context; + + /* Make a new hash table for the cache */ + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(IndexPartitionInfoEntry); + ctl.hcxt = context; + + map->pdir_hash = hash_create("index partition directory", 256, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + MemoryContextSwitchTo(oldcontext); + + ScanKeyInit(&key, + Anum_pg_index_partitions_indexoid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(RelationGetRelid(relation))); + + scan = systable_beginscan(rel, IndexPartitionsIndexId, true, + NULL, 1, &key); + + while ((tuple = systable_getnext(scan)) != NULL) + { + Form_pg_index_partitions form = (Form_pg_index_partitions) GETSTRUCT(tuple); + IndexPartitionInfoEntry *entry; + bool found; + + /* + * We need to consider the partition id of the detached partitioned as + * well while computing the maxpartid so that we do not repeat the + * value. + */ + if (form->partid > maxpartid) + maxpartid = form->partid; + + if (!OidIsValid(form->reloid)) + continue; + + entry = hash_search(map->pdir_hash, &form->partid, HASH_ENTER, &found); + Assert(!found); + entry->reloid = form->reloid; + } + + map->max_partid = maxpartid; + relation->rd_indexpartinfo = map; + systable_endscan(scan); + + table_close(rel, AccessShareLock); +} + +/* + * IndexGetRelationPartitionId - Get partition id for the reloid + * + * Get the partition ID for the given partition relation OID for the specified + * global index relation. + */ +PartitionId +IndexGetRelationPartitionId(Relation irel, Oid reloid) +{ + IndexPartitionInfo map; + HASH_SEQ_STATUS hash_seq; + PartitionId partid = InvalidPartitionId; + IndexPartitionInfoEntry *entry; + + if (irel->rd_indexpartinfo == NULL) + BuildIndexPartitionInfo(irel, CurrentMemoryContext); + + map = irel->rd_indexpartinfo; + + hash_seq_init(&hash_seq, map->pdir_hash); + + while ((entry = hash_seq_search(&hash_seq)) != NULL) + { + if (entry->reloid == reloid) + { + partid = entry->partid; + hash_seq_term(&hash_seq); + break; + } + } + + return partid; +} + +/* + * IndexGetPartitionReloid - Get relation oid for the paritionid + * + * Get the relation OID for the given partition ID for the specified global + * index relation. + */ +Oid +IndexGetPartitionReloid(Relation irel, PartitionId partid) +{ + IndexPartitionInfo map = irel->rd_indexpartinfo; + IndexPartitionInfoEntry *entry; + bool found; + + entry = hash_search(map->pdir_hash, &partid, HASH_FIND, &found); + if (!found) + return InvalidOid; + + return entry->reloid; +} + +/* + * InvalidateIndexPartitionEntries - Invalidate pg_index_partitions entries + * + * Set reloid as Invalid in pg_index_partitions entries with respect to the + * given reloid. If a valid global indexoids list is given then only + * invalidate the reloid entires which are related to the input global index + * oids. + */ +void +InvalidateIndexPartitionEntries(List *reloids, Oid indexoid) +{ + Relation catalogRelation; + SysScanDesc scan; + ScanKeyData key; + HeapTuple tuple; + + /* + * Find pg_inherits entries by inhparent. (We need to scan them all in + * order to verify that no other partition is pending detach.) + */ + catalogRelation = table_open(IndexPartitionsRelationId, RowExclusiveLock); + + ScanKeyInit(&key, + Anum_pg_index_partitions_indexoid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(indexoid)); + + scan = systable_beginscan(catalogRelation, IndexPartitionsIndexId, true, + NULL, 1, &key); + + while ((tuple = systable_getnext(scan)) != NULL) + { + Form_pg_index_partitions form = (Form_pg_index_partitions) GETSTRUCT(tuple); + HeapTuple newtup; + + if (!list_member_oid(reloids, form->reloid)) + continue; + + newtup = heap_copytuple(tuple); + ((Form_pg_index_partitions) GETSTRUCT(newtup))->reloid = InvalidOid; + + CatalogTupleUpdate(catalogRelation, + &tuple->t_self, + newtup); + heap_freetuple(newtup); + } + + /* Done */ + systable_endscan(scan); + table_close(catalogRelation, RowExclusiveLock); +} + +/* + * IndexGetNextPartitionID - Get the next partition ID of the global index + * + * Obtain the next partition ID to be allocated for the specified global index + * relation. Also update this value in the cache for the next allocation. + */ +PartitionId +IndexGetNextPartitionID(Relation irel) +{ + PartitionId partid; + + /* + * If the cache is not already build then do it first so that we know what + * is the maximum partition ID value and then we can generate the next + * value. + */ + if (irel->rd_indexpartinfo == NULL) + BuildIndexPartitionInfo(irel, CurrentMemoryContext); + + /* Use the max_partid + 1 value as the next parition id. */ + partid = irel->rd_indexpartinfo->max_partid + 1; + + /* + * If partitionID is wraparound then give error. + * XXX here we might consider reusing the unused partition IDs. + */ + if (!PartIdIsValid(partid)) + elog(ERROR, "could not allocate new PartitionID because limit is exhausted"); + + /* + * Store the new value in the cache, in case the cache is invalidated we + * will get the max value again from the system catalog, so there should + * not be any issue. + */ + irel->rd_indexpartinfo->max_partid = partid; + + return partid; +} diff --git a/src/include/c.h b/src/include/c.h index 8cdc16a0f4..797891abb6 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -636,6 +636,22 @@ typedef uint32 MultiXactOffset; typedef uint32 CommandId; +/* + * This is a new type used for global indexes. Global indexes store data + * from multiple partitions, so along with the heap TID, we also need a new + * identifier to identify the heap. We could store the reloid as well, but + * if the partition is detached, it becomes very hard to identify that we + * don't need to access data from this particular relation. It becomes even + * more problematic if the partition is dropped and later reloid is reused by + * another relation, making it difficult to distinguish whether a particular + * index tuple belongs to the old relation or the new relation. To handle + * this, we use a new identifier called partition id. Whenever a relation is + * detached from the global index, the partition id is invalidated, allowing + * us to easily identify that we don't need to access the tuple of this + * partition id. + */ +typedef uint32 PartitionId; + #define FirstCommandId ((CommandId) 0) #define InvalidCommandId (~(CommandId)0) diff --git a/src/include/catalog/Makefile b/src/include/catalog/Makefile index 2bbc7805fe..2b76dcdc16 100644 --- a/src/include/catalog/Makefile +++ b/src/include/catalog/Makefile @@ -81,7 +81,8 @@ CATALOG_HEADERS := \ pg_publication_namespace.h \ pg_publication_rel.h \ pg_subscription.h \ - pg_subscription_rel.h + pg_subscription_rel.h \ + pg_index_partitions.h GENERATED_HEADERS := $(CATALOG_HEADERS:%.h=%_d.h) diff --git a/src/include/catalog/pg_index_partitions.h b/src/include/catalog/pg_index_partitions.h new file mode 100644 index 0000000000..2dcc8ca3fc --- /dev/null +++ b/src/include/catalog/pg_index_partitions.h @@ -0,0 +1,84 @@ +/*------------------------------------------------------------------------- + * + * pg_index_partitions.h + * definition of the system catalog (pg_index_partitions) + * + * + * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/catalog/pg_index_partitions.h + * + * NOTES + * The Catalog.pm module reads this file and derives schema + * information. + * + *------------------------------------------------------------------------- + */ +#ifndef PG_INDEX_PARTITIONS_H +#define PG_INDEX_PARTITIONS_H + +#include "catalog/genbki.h" +#include "catalog/pg_attribute.h" +#include "catalog/pg_index_partitions_d.h" +#include "catalog/pg_type_d.h" +#include "utils/hsearch.h" +#include "utils/relcache.h" + +/* ---------------- + * pg_index_partitions definition. cpp turns this into + * typedef struct FormData_pg_index_partitions + * ---------------- + */ +CATALOG(pg_index_partitions,6015,IndexPartitionsRelationId) +{ + Oid indexoid BKI_LOOKUP(pg_class); + Oid reloid BKI_LOOKUP(pg_class); + int32 partid; +} FormData_pg_index_partitions; + +/* ---------------- + * Form_pg_index_partitions corresponds to a pointer to a tuple with + * the format of pg_index_partitions relation. + * ---------------- + */ +typedef FormData_pg_index_partitions *Form_pg_index_partitions; + +DECLARE_UNIQUE_INDEX_PKEY(pg_index_partitions_indexoid_partid_index, 6018, IndexPartitionsIndexId, pg_index_partitions, btree(indexoid oid_ops, partid int4_ops)); + +/* + * Map over the pg_index_partitions table for a particular global index. This + * will be used for faster lookup of the next partid to be used for this global + * index and also for finding out the partition relation for a give partid of + * a global index. + */ +typedef struct IndexPartitionInfoData +{ + MemoryContext context; /* memory context for storing the cache data */ + PartitionId max_partid; /* max value of used partid */ + HTAB *pdir_hash; /* partid to reloid lookup hash */ +} IndexPartitionInfoData; + +typedef IndexPartitionInfoData *IndexPartitionInfo; + +/* + * TODO we might think of storing the RelationDesc along with reloid? + */ +typedef struct IndexPartitionInfoEntry +{ + PartitionId partid; /* key */ + Oid reloid; /* payload */ +} IndexPartitionInfoEntry; + +#define InvalidPartitionId 0 +#define FirstValidPartitionId 1 +#define PartIdIsValid(partid) ((bool) ((partid) != InvalidPartitionId)) + +extern void BuildIndexPartitionInfo(Relation relation, MemoryContext context); +extern PartitionId IndexGetRelationPartitionId(Relation irel, Oid reloid); +extern Oid IndexGetPartitionReloid(Relation irel, PartitionId partid); +extern PartitionId IndexGetNextPartitionID(Relation irel); +extern void DeleteIndexPartitionEntries(Oid indrelid); +extern void InsertIndexPartitionEntry(Relation irel, Oid reloid, PartitionId partid); +extern void InvalidateIndexPartitionEntries(List *reloids, Oid indexoid); +#endif /* PG_INDEX_PARTITIONS_H */ diff --git a/src/include/postgres.h b/src/include/postgres.h index 8a41a66868..a0a24e2bca 100644 --- a/src/include/postgres.h +++ b/src/include/postgres.h @@ -536,6 +536,25 @@ Float8GetDatum(float8 X) extern Datum Float8GetDatum(float8 X); #endif +/* + * DatumGetPartitionId + * Returns partition identifier value of a datum. + */ +static inline PartitionId +DatumGetPartitionId(Datum X) +{ + return (PartitionId) X; +} + +/* + * PartitionIdGetDatum + * Returns datum representation for a partition identifier. + */ +static inline Datum +PartitionIdGetDatum(PartitionId X) +{ + return (Datum) X; +} /* * Int64GetDatumFast diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index b552359915..35270fdc05 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -19,6 +19,7 @@ #include "catalog/catalog.h" #include "catalog/pg_class.h" #include "catalog/pg_index.h" +#include "catalog/pg_index_partitions.h" #include "catalog/pg_publication.h" #include "nodes/bitmapset.h" #include "partitioning/partdefs.h" @@ -217,6 +218,12 @@ typedef struct RelationData Oid *rd_indcollation; /* OIDs of index collations */ bytea **rd_opcoptions; /* parsed opclass-specific options */ + /* + * Cache for mapping reloid to partition id for the global index. For more + * details refer comments in pg_index_partitions.h. + */ + IndexPartitionInfo rd_indexpartinfo; + /* * rd_amcache is available for index and table AMs to cache private data * about the relation. This must be just a cache since it may get reset diff --git a/src/test/regress/expected/oidjoins.out b/src/test/regress/expected/oidjoins.out index 215eb899be..7588e23d6a 100644 --- a/src/test/regress/expected/oidjoins.out +++ b/src/test/regress/expected/oidjoins.out @@ -266,3 +266,5 @@ NOTICE: checking pg_subscription {subdbid} => pg_database {oid} NOTICE: checking pg_subscription {subowner} => pg_authid {oid} NOTICE: checking pg_subscription_rel {srsubid} => pg_subscription {oid} NOTICE: checking pg_subscription_rel {srrelid} => pg_class {oid} +NOTICE: checking pg_index_partitions {indexoid} => pg_class {oid} +NOTICE: checking pg_index_partitions {reloid} => pg_class {oid} -- 2.49.0