From 0b162ec9707a2790f92e2e237cb4fa807157c19c Mon Sep 17 00:00:00 2001 From: Rishu Bagga Date: Thu, 15 Sep 2022 00:55:25 +0000 Subject: [PATCH] slru to buffercache with page headers, and upgrade logic --- contrib/amcheck/verify_nbtree.c | 2 +- contrib/bloom/blinsert.c | 6 +- contrib/pg_prewarm/autoprewarm.c | 2 +- contrib/pg_prewarm/pg_prewarm.c | 4 +- contrib/pg_visibility/pg_visibility.c | 5 +- src/backend/access/gist/gistbuild.c | 11 +- src/backend/access/hash/hashpage.c | 2 +- src/backend/access/heap/heapam_handler.c | 32 +- src/backend/access/heap/rewriteheap.c | 6 +- src/backend/access/heap/visibilitymap.c | 41 +- src/backend/access/nbtree/nbtree.c | 6 +- src/backend/access/nbtree/nbtsort.c | 8 +- src/backend/access/spgist/spginsert.c | 17 +- src/backend/access/table/tableam.c | 4 +- src/backend/access/transam/clog.c | 225 +-- src/backend/access/transam/commit_ts.c | 199 +-- src/backend/access/transam/multixact.c | 508 +++---- src/backend/access/transam/slru.c | 1484 ++----------------- src/backend/access/transam/subtrans.c | 114 +- src/backend/access/transam/xact.c | 2 + src/backend/access/transam/xlog.c | 15 +- src/backend/access/transam/xlogprefetcher.c | 28 +- src/backend/access/transam/xlogutils.c | 17 +- src/backend/catalog/catalog.c | 53 +- src/backend/catalog/index.c | 4 +- src/backend/catalog/storage.c | 205 ++- src/backend/commands/async.c | 97 +- src/backend/commands/dbcommands.c | 8 +- src/backend/commands/sequence.c | 8 +- src/backend/commands/tablecmds.c | 19 +- src/backend/storage/buffer/buf_init.c | 17 +- src/backend/storage/buffer/bufmgr.c | 379 ++--- src/backend/storage/buffer/localbuf.c | 25 +- src/backend/storage/freespace/freespace.c | 41 +- src/backend/storage/ipc/ipci.c | 4 - src/backend/storage/lmgr/predicate.c | 72 +- src/backend/storage/page/bufpage.c | 23 +- src/backend/storage/smgr/md.c | 366 ++--- src/backend/storage/smgr/smgr.c | 399 +++-- src/backend/storage/sync/sync.c | 20 +- src/backend/utils/cache/inval.c | 17 +- 
src/backend/utils/cache/relcache.c | 27 +- src/backend/utils/mmgr/mcxt.c | 13 +- src/bin/pg_upgrade/file.c | 175 ++- src/bin/pg_upgrade/function.c | 66 + src/bin/pg_upgrade/pg_upgrade.c | 76 +- src/bin/pg_upgrade/pg_upgrade.h | 19 +- src/common/relpath.c | 31 +- src/include/access/clog.h | 6 - src/include/access/commit_ts.h | 3 - src/include/access/multixact.h | 3 - src/include/access/slru.h | 181 +-- src/include/access/slrudefs.h | 19 + src/include/access/slrulist.h | 30 + src/include/access/subtrans.h | 3 - src/include/catalog/catversion.h | 4 +- src/include/catalog/storage.h | 11 +- src/include/common/relpath.h | 15 +- src/include/storage/buf_internals.h | 11 +- src/include/storage/bufmgr.h | 38 +- src/include/storage/bufpage.h | 17 + src/include/storage/md.h | 28 +- src/include/storage/relfilelocator.h | 29 +- src/include/storage/smgr.h | 99 +- src/include/utils/inval.h | 2 +- src/include/utils/rel.h | 33 +- 66 files changed, 2175 insertions(+), 3259 deletions(-) create mode 100644 src/include/access/slrudefs.h create mode 100644 src/include/access/slrulist.h diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 9021d156eb7..2722f121219 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -324,7 +324,7 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, bool heapkeyspace, allequalimage; - if (!smgrexists(RelationGetSmgr(indrel), MAIN_FORKNUM)) + if (!smgrexists(RelationGetSmgr(indrel, MAIN_FORKNUM))) ereport(ERROR, (errcode(ERRCODE_INDEX_CORRUPTED), errmsg("index \"%s\" lacks a main relation fork", diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c index dd26d6ac29a..d068a243e8f 100644 --- a/contrib/bloom/blinsert.c +++ b/contrib/bloom/blinsert.c @@ -177,9 +177,9 @@ blbuildempty(Relation index) * this even when wal_level=minimal. 
*/ PageSetChecksumInplace(metapage, BLOOM_METAPAGE_BLKNO); - smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, BLOOM_METAPAGE_BLKNO, + smgrwrite(RelationGetSmgr(index, INIT_FORKNUM), BLOOM_METAPAGE_BLKNO, (char *) metapage, true); - log_newpage(&(RelationGetSmgr(index))->smgr_rlocator.locator, INIT_FORKNUM, + log_newpage(&index->rd_locator, INIT_FORKNUM, BLOOM_METAPAGE_BLKNO, metapage, true); /* @@ -187,7 +187,7 @@ blbuildempty(Relation index) * write did not go through shared_buffers and therefore a concurrent * checkpoint may have moved the redo pointer past our xlog record. */ - smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM); + smgrimmedsync(RelationGetSmgr(index, INIT_FORKNUM)); } /* diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index d02a6a1ba04..de07bc593c7 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -535,7 +535,7 @@ autoprewarm_database_main(Datum main_arg) */ if (blk->forknum > InvalidForkNumber && blk->forknum <= MAX_FORKNUM && - smgrexists(RelationGetSmgr(rel), blk->forknum)) + smgrexists(RelationGetSmgr(rel, blk->forknum))) nblocks = RelationGetNumberOfBlocksInFork(rel, blk->forknum); else nblocks = 0; diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c index caff5c4a80f..1e57481ef92 100644 --- a/contrib/pg_prewarm/pg_prewarm.c +++ b/contrib/pg_prewarm/pg_prewarm.c @@ -109,7 +109,7 @@ pg_prewarm(PG_FUNCTION_ARGS) aclcheck_error(aclresult, get_relkind_objtype(rel->rd_rel->relkind), get_rel_name(relOid)); /* Check that the fork exists. 
*/ - if (!smgrexists(RelationGetSmgr(rel), forkNumber)) + if (!smgrexists(RelationGetSmgr(rel, forkNumber))) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("fork \"%s\" does not exist for this relation", @@ -177,7 +177,7 @@ pg_prewarm(PG_FUNCTION_ARGS) for (block = first_block; block <= last_block; ++block) { CHECK_FOR_INTERRUPTS(); - smgrread(RelationGetSmgr(rel), forkNumber, block, blockbuffer.data); + smgrread(RelationGetSmgr(rel, forkNumber), block, blockbuffer.data); ++blocks_done; } } diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c index a95f73ec796..38cb56c9bce 100644 --- a/contrib/pg_visibility/pg_visibility.c +++ b/contrib/pg_visibility/pg_visibility.c @@ -388,13 +388,14 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS) check_relation_relkind(rel); /* Forcibly reset cached file size */ - RelationGetSmgr(rel)->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber; + RelationGetSmgr(rel, VISIBILITYMAP_FORKNUM)->smgr_cached_nblocks = InvalidBlockNumber; block = visibilitymap_prepare_truncate(rel, 0); if (BlockNumberIsValid(block)) { fork = VISIBILITYMAP_FORKNUM; - smgrtruncate(RelationGetSmgr(rel), &fork, 1, &block); + DropRelationBuffers(rel->rd_locator, rel->rd_backend, &fork, 1, &block); + smgrtruncate_multi(rel->rd_locator, rel->rd_backend, &fork, 1, &block); } if (RelationNeedsWAL(rel)) diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index fb0f466708c..a78527769e6 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -416,7 +416,7 @@ gist_indexsortbuild(GISTBuildState *state) * replaced with the real root page at the end. 
*/ page = palloc0(BLCKSZ); - smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, + smgrextend(RelationGetSmgr(state->indexrel, MAIN_FORKNUM), GIST_ROOT_BLKNO, page, true); state->pages_allocated++; state->pages_written++; @@ -460,7 +460,7 @@ gist_indexsortbuild(GISTBuildState *state) /* Write out the root */ PageSetLSN(levelstate->pages[0], GistBuildLSN); PageSetChecksumInplace(levelstate->pages[0], GIST_ROOT_BLKNO); - smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, + smgrwrite(RelationGetSmgr(state->indexrel, MAIN_FORKNUM), GIST_ROOT_BLKNO, levelstate->pages[0], true); if (RelationNeedsWAL(state->indexrel)) log_newpage(&state->indexrel->rd_locator, MAIN_FORKNUM, GIST_ROOT_BLKNO, @@ -479,7 +479,7 @@ gist_indexsortbuild(GISTBuildState *state) * still not be on disk when the crash occurs. */ if (RelationNeedsWAL(state->indexrel)) - smgrimmedsync(RelationGetSmgr(state->indexrel), MAIN_FORKNUM); + smgrimmedsync(RelationGetSmgr(state->indexrel, MAIN_FORKNUM)); } /* @@ -657,7 +657,7 @@ gist_indexsortbuild_flush_ready_pages(GISTBuildState *state) PageSetLSN(page, GistBuildLSN); PageSetChecksumInplace(page, blkno); - smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, blkno, page, + smgrextend(RelationGetSmgr(state->indexrel, MAIN_FORKNUM), blkno, page, true); state->pages_written++; @@ -943,8 +943,7 @@ gistBuildCallback(Relation index, */ if ((buildstate->buildMode == GIST_BUFFERING_AUTO && buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 && - effective_cache_size < smgrnblocks(RelationGetSmgr(index), - MAIN_FORKNUM)) || + effective_cache_size < smgrnblocks(RelationGetSmgr(index, MAIN_FORKNUM))) || (buildstate->buildMode == GIST_BUFFERING_STATS && buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET)) { diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 55b2929ad51..4aa87d475d5 100644 --- a/src/backend/access/hash/hashpage.c +++ 
b/src/backend/access/hash/hashpage.c @@ -1030,7 +1030,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) true); PageSetChecksumInplace(page, lastblock); - smgrextend(RelationGetSmgr(rel), MAIN_FORKNUM, lastblock, zerobuf.data, + smgrextend(RelationGetSmgr(rel, MAIN_FORKNUM), lastblock, zerobuf.data, false); return true; diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index ab1bcf3522d..0601d4735fb 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -572,7 +572,7 @@ heapam_relation_set_new_filelocator(Relation rel, TransactionId *freezeXid, MultiXactId *minmulti) { - SMgrRelation srel; + SMgrFileHandle sfile; /* * Initialize to the minimum XID that could put tuples in the table. We @@ -591,7 +591,7 @@ heapam_relation_set_new_filelocator(Relation rel, */ *minmulti = GetOldestMultiXactId(); - srel = RelationCreateStorage(*newrlocator, persistence, true); + sfile = RelationCreateStorage(*newrlocator, persistence, true); /* * If required, set up an init fork for an unlogged table so that it can @@ -604,15 +604,18 @@ heapam_relation_set_new_filelocator(Relation rel, */ if (persistence == RELPERSISTENCE_UNLOGGED) { + SMgrFileHandle sfile_init; + Assert(rel->rd_rel->relkind == RELKIND_RELATION || rel->rd_rel->relkind == RELKIND_MATVIEW || rel->rd_rel->relkind == RELKIND_TOASTVALUE); - smgrcreate(srel, INIT_FORKNUM, false); + sfile_init = smgropen(*newrlocator, InvalidBackendId, INIT_FORKNUM); + smgrcreate(sfile_init, false); log_smgrcreate(newrlocator, INIT_FORKNUM); - smgrimmedsync(srel, INIT_FORKNUM); + smgrimmedsync(sfile); } - smgrclose(srel); + smgrclose(sfile); } static void @@ -624,9 +627,7 @@ heapam_relation_nontransactional_truncate(Relation rel) static void heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator) { - SMgrRelation dstrel; - - dstrel = smgropen(*newrlocator, rel->rd_backend); + SMgrFileHandle dstmain; /* 
* Since we copy the file directly without looking at the shared buffers, @@ -646,16 +647,21 @@ heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator) RelationCreateStorage(*newrlocator, rel->rd_rel->relpersistence, true); /* copy main fork */ - RelationCopyStorage(RelationGetSmgr(rel), dstrel, MAIN_FORKNUM, + dstmain = smgropen(*newrlocator, rel->rd_backend, MAIN_FORKNUM); + RelationCopyStorage(RelationGetSmgr(rel, MAIN_FORKNUM), dstmain, rel->rd_rel->relpersistence); /* copy those extra forks that exist */ for (ForkNumber forkNum = MAIN_FORKNUM + 1; forkNum <= MAX_FORKNUM; forkNum++) { - if (smgrexists(RelationGetSmgr(rel), forkNum)) + SMgrFileHandle src_fork = RelationGetSmgr(rel, forkNum); + + if (smgrexists(src_fork)) { - smgrcreate(dstrel, forkNum, false); + SMgrFileHandle dst_fork = smgropen(*newrlocator, rel->rd_backend, forkNum); + + smgrcreate(dst_fork, false); /* * WAL log creation if the relation is persistent, or this is the @@ -665,7 +671,7 @@ heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator) (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED && forkNum == INIT_FORKNUM)) log_smgrcreate(newrlocator, forkNum); - RelationCopyStorage(RelationGetSmgr(rel), dstrel, forkNum, + RelationCopyStorage(RelationGetSmgr(rel, forkNum), dst_fork, rel->rd_rel->relpersistence); } } @@ -673,7 +679,7 @@ heapam_relation_copy_data(Relation rel, const RelFileLocator *newrlocator) /* drop old relation, and close new one */ RelationDropStorage(rel); - smgrclose(dstrel); + smgrclose(dstmain); } static void diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 2fe9e48e500..0eeef205a04 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -325,7 +325,7 @@ end_heap_rewrite(RewriteState state) PageSetChecksumInplace(state->rs_buffer, state->rs_blockno); - smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM, + 
smgrextend(RelationGetSmgr(state->rs_new_rel, MAIN_FORKNUM), state->rs_blockno, (char *) state->rs_buffer, true); } @@ -337,7 +337,7 @@ end_heap_rewrite(RewriteState state) * wrote before the checkpoint. */ if (RelationNeedsWAL(state->rs_new_rel)) - smgrimmedsync(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM); + smgrimmedsync(RelationGetSmgr(state->rs_new_rel, MAIN_FORKNUM)); logical_end_heap_rewrite(state); @@ -691,7 +691,7 @@ raw_heap_insert(RewriteState state, HeapTuple tup) */ PageSetChecksumInplace(page, state->rs_blockno); - smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM, + smgrextend(RelationGetSmgr(state->rs_new_rel, MAIN_FORKNUM), state->rs_blockno, (char *) page, true); state->rs_blockno++; diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 4ed70275e22..f48416d1e06 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -465,7 +465,7 @@ visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks) * If no visibility map has been created yet for this relation, there's * nothing to truncate. */ - if (!smgrexists(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM)) + if (!smgrexists(RelationGetSmgr(rel, VISIBILITYMAP_FORKNUM))) return InvalidBlockNumber; /* @@ -532,7 +532,7 @@ visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks) else newnblocks = truncBlock; - if (smgrnblocks(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM) <= newnblocks) + if (smgrnblocks(RelationGetSmgr(rel, VISIBILITYMAP_FORKNUM)) <= newnblocks) { /* nothing to do, the file was already smaller than requested size */ return InvalidBlockNumber; @@ -551,29 +551,29 @@ static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend) { Buffer buf; - SMgrRelation reln; + SMgrFileHandle vm_sfile; /* * Caution: re-using this smgr pointer could fail if the relcache entry * gets closed. 
It's safe as long as we only do smgr-level operations * between here and the last use of the pointer. */ - reln = RelationGetSmgr(rel); + vm_sfile = RelationGetSmgr(rel, VISIBILITYMAP_FORKNUM); /* * If we haven't cached the size of the visibility map fork yet, check it * first. */ - if (reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] == InvalidBlockNumber) + if (vm_sfile->smgr_cached_nblocks == InvalidBlockNumber) { - if (smgrexists(reln, VISIBILITYMAP_FORKNUM)) - smgrnblocks(reln, VISIBILITYMAP_FORKNUM); + if (smgrexists(vm_sfile)) + smgrnblocks(vm_sfile); else - reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = 0; + vm_sfile->smgr_cached_nblocks = 0; } /* Handle requests beyond EOF */ - if (blkno >= reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM]) + if (blkno >= vm_sfile->smgr_cached_nblocks) { if (extend) vm_extend(rel, blkno + 1); @@ -600,8 +600,7 @@ vm_readbuf(Relation rel, BlockNumber blkno, bool extend) * long as it doesn't depend on the page header having correct contents. * Current usage is safe because PageGetContents() does not require that. */ - buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno, - RBM_ZERO_ON_ERROR, NULL); + buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL); if (PageIsNew(BufferGetPage(buf))) { LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); @@ -621,7 +620,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) { BlockNumber vm_nblocks_now; PGAlignedBlock pg; - SMgrRelation reln; + SMgrFileHandle vm_sfile; PageInit((Page) pg.data, BLCKSZ, 0); @@ -642,27 +641,27 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) * gets closed. It's safe as long as we only do smgr-level operations * between here and the last use of the pointer. */ - reln = RelationGetSmgr(rel); + vm_sfile = RelationGetSmgr(rel, VISIBILITYMAP_FORKNUM); /* * Create the file first if it doesn't exist. If smgr_vm_nblocks is * positive then it must exist, no need for an smgrexists call. 
*/ - if ((reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] == 0 || - reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] == InvalidBlockNumber) && - !smgrexists(reln, VISIBILITYMAP_FORKNUM)) - smgrcreate(reln, VISIBILITYMAP_FORKNUM, false); + if ((vm_sfile->smgr_cached_nblocks == 0 || + vm_sfile->smgr_cached_nblocks == InvalidBlockNumber) && + !smgrexists(vm_sfile)) + smgrcreate(vm_sfile, false); /* Invalidate cache so that smgrnblocks() asks the kernel. */ - reln->smgr_cached_nblocks[VISIBILITYMAP_FORKNUM] = InvalidBlockNumber; - vm_nblocks_now = smgrnblocks(reln, VISIBILITYMAP_FORKNUM); + vm_sfile->smgr_cached_nblocks = InvalidBlockNumber; + vm_nblocks_now = smgrnblocks(vm_sfile); /* Now extend the file */ while (vm_nblocks_now < vm_nblocks) { PageSetChecksumInplace((Page) pg.data, vm_nblocks_now); - smgrextend(reln, VISIBILITYMAP_FORKNUM, vm_nblocks_now, pg.data, false); + smgrextend(vm_sfile, vm_nblocks_now, pg.data, false); vm_nblocks_now++; } @@ -673,7 +672,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) * to keep checking for creation or extension of the file, which happens * infrequently. */ - CacheInvalidateSmgr(reln->smgr_rlocator); + CacheInvalidateSmgr(rel->rd_locator, rel->rd_backend); UnlockRelationForExtension(rel, ExclusiveLock); } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index b52eca8f38b..644431a7e3c 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -164,9 +164,9 @@ btbuildempty(Relation index) * this even when wal_level=minimal. 
*/ PageSetChecksumInplace(metapage, BTREE_METAPAGE); - smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, BTREE_METAPAGE, + smgrwrite(RelationGetSmgr(index, INIT_FORKNUM), BTREE_METAPAGE, (char *) metapage, true); - log_newpage(&RelationGetSmgr(index)->smgr_rlocator.locator, INIT_FORKNUM, + log_newpage(&index->rd_locator, INIT_FORKNUM, BTREE_METAPAGE, metapage, true); /* @@ -174,7 +174,7 @@ btbuildempty(Relation index) * write did not go through shared_buffers and therefore a concurrent * checkpoint may have moved the redo pointer past our xlog record. */ - smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM); + smgrimmedsync(RelationGetSmgr(index, INIT_FORKNUM)); } /* diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 501e011ce1e..87bb9f52b4d 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -662,7 +662,7 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) if (!wstate->btws_zeropage) wstate->btws_zeropage = (Page) palloc0(BLCKSZ); /* don't set checksum for all-zero page */ - smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, + smgrextend(RelationGetSmgr(wstate->index, MAIN_FORKNUM), wstate->btws_pages_written++, (char *) wstate->btws_zeropage, true); @@ -677,14 +677,14 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) if (blkno == wstate->btws_pages_written) { /* extending the file... */ - smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, + smgrextend(RelationGetSmgr(wstate->index, MAIN_FORKNUM), blkno, (char *) page, true); wstate->btws_pages_written++; } else { /* overwriting a block we zero-filled before */ - smgrwrite(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, + smgrwrite(RelationGetSmgr(wstate->index, MAIN_FORKNUM), blkno, (char *) page, true); } @@ -1431,7 +1431,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) * still not be on disk when the crash occurs. 
*/ if (wstate->btws_use_wal) - smgrimmedsync(RelationGetSmgr(wstate->index), MAIN_FORKNUM); + smgrimmedsync(RelationGetSmgr(wstate->index, MAIN_FORKNUM)); } /* diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index c6821b59524..75eb09543d9 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -155,8 +155,11 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) void spgbuildempty(Relation index) { + SMgrFileHandle sfile; Page page; + sfile = RelationGetSmgr(index, INIT_FORKNUM); + /* Construct metapage. */ page = (Page) palloc(BLCKSZ); SpGistInitMetapage(page); @@ -169,27 +172,27 @@ spgbuildempty(Relation index) * replayed. */ PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO); - smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_METAPAGE_BLKNO, + smgrwrite(sfile, SPGIST_METAPAGE_BLKNO, (char *) page, true); - log_newpage(&(RelationGetSmgr(index))->smgr_rlocator.locator, INIT_FORKNUM, + log_newpage(&index->rd_locator, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO, page, true); /* Likewise for the root page. */ SpGistInitPage(page, SPGIST_LEAF); PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO); - smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_ROOT_BLKNO, + smgrwrite(sfile, SPGIST_ROOT_BLKNO, (char *) page, true); - log_newpage(&(RelationGetSmgr(index))->smgr_rlocator.locator, INIT_FORKNUM, + log_newpage(&index->rd_locator, INIT_FORKNUM, SPGIST_ROOT_BLKNO, page, true); /* Likewise for the null-tuples root page. 
*/ SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS); PageSetChecksumInplace(page, SPGIST_NULL_BLKNO); - smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_NULL_BLKNO, + smgrwrite(sfile, SPGIST_NULL_BLKNO, (char *) page, true); - log_newpage(&(RelationGetSmgr(index))->smgr_rlocator.locator, INIT_FORKNUM, + log_newpage(&index->rd_locator, INIT_FORKNUM, SPGIST_NULL_BLKNO, page, true); /* @@ -197,7 +200,7 @@ spgbuildempty(Relation index) * writes did not go through shared buffers and therefore a concurrent * checkpoint may have moved the redo pointer past our xlog record. */ - smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM); + smgrimmedsync(sfile); } /* diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 094b24c7c9c..5bc2c3726f1 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -631,10 +631,10 @@ table_block_relation_size(Relation rel, ForkNumber forkNumber) if (forkNumber == InvalidForkNumber) { for (int i = 0; i < MAX_FORKNUM; i++) - nblocks += smgrnblocks(RelationGetSmgr(rel), i); + nblocks += smgrnblocks(RelationGetSmgr(rel, i)); } else - nblocks = smgrnblocks(RelationGetSmgr(rel), forkNumber); + nblocks = smgrnblocks(RelationGetSmgr(rel, forkNumber)); return nblocks * BLCKSZ; } diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 77d9894dab3..b6af01fcaa1 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -41,6 +41,8 @@ #include "miscadmin.h" #include "pg_trace.h" #include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/buf_internals.h" #include "storage/proc.h" #include "storage/sync.h" @@ -59,7 +61,7 @@ /* We need two bits per xact, so four xacts fit in a byte */ #define CLOG_BITS_PER_XACT 2 #define CLOG_XACTS_PER_BYTE 4 -#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE) +#define CLOG_XACTS_PER_PAGE ((BLCKSZ - SizeOfPageHeaderData) * CLOG_XACTS_PER_BYTE) #define CLOG_XACT_BITMASK 
((1 << CLOG_BITS_PER_XACT) - 1) #define TransactionIdToPage(xid) ((xid) / (TransactionId) CLOG_XACTS_PER_PAGE) @@ -81,17 +83,10 @@ */ #define THRESHOLD_SUBTRANS_CLOG_OPT 5 -/* - * Link to shared-memory data structures for CLOG control - */ -static SlruCtlData XactCtlData; - -#define XactCtl (&XactCtlData) - -static int ZeroCLOGPage(int pageno, bool writeXlog); +static Buffer ZeroCLOGPage(int pageno, bool writeXlog); static bool CLOGPagePrecedes(int page1, int page2); -static void WriteZeroPageXlogRec(int pageno); +static XLogRecPtr WriteZeroPageXlogRec(int pageno); static void WriteTruncateXlogRec(int pageno, TransactionId oldestXact, Oid oldestXactDb); static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids, @@ -99,7 +94,7 @@ static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids, XLogRecPtr lsn, int pageno, bool all_xact_same_page); static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status, - XLogRecPtr lsn, int slotno); + XLogRecPtr lsn, Buffer buffer); static void set_status_by_pages(int nsubxids, TransactionId *subxids, XidStatus status, XLogRecPtr lsn); static bool TransactionGroupUpdateXidStatus(TransactionId xid, @@ -339,13 +334,12 @@ TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, TransactionId *subxids, XidStatus status, XLogRecPtr lsn, int pageno) { - int slotno; + Buffer buffer; int i; Assert(status == TRANSACTION_STATUS_COMMITTED || status == TRANSACTION_STATUS_ABORTED || (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid))); - Assert(LWLockHeldByMeInMode(XactSLRULock, LW_EXCLUSIVE)); /* * If we're doing an async commit (ie, lsn is valid), then we must wait @@ -356,7 +350,8 @@ TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, * write-busy, since we don't care if the update reaches disk sooner than * we think. 
*/ - slotno = SimpleLruReadPage(XactCtl, pageno, XLogRecPtrIsInvalid(lsn), xid); + buffer = ReadSlruBuffer(SLRU_CLOG_ID, pageno, RBM_NORMAL); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); /* * Set the main transaction id, if any. @@ -374,25 +369,27 @@ TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, { for (i = 0; i < nsubxids; i++) { - Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); + Assert(pageno == TransactionIdToPage(subxids[i])); TransactionIdSetStatusBit(subxids[i], TRANSACTION_STATUS_SUB_COMMITTED, - lsn, slotno); + lsn, buffer); } } /* ... then the main transaction */ - TransactionIdSetStatusBit(xid, status, lsn, slotno); + TransactionIdSetStatusBit(xid, status, lsn, buffer); } /* Set the subtransactions */ for (i = 0; i < nsubxids; i++) { - Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); - TransactionIdSetStatusBit(subxids[i], status, lsn, slotno); + Assert(pageno == TransactionIdToPage(subxids[i])); + TransactionIdSetStatusBit(subxids[i], status, lsn, buffer); } - XactCtl->shared->page_dirty[slotno] = true; + + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); } /* @@ -566,7 +563,7 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, * Must be called with XactSLRULock held */ static void -TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno) +TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, Buffer buffer) { int byteno = TransactionIdToByte(xid); int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; @@ -574,7 +571,10 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i char byteval; char curval; - byteptr = XactCtl->shared->page_buffer[slotno] + byteno; + Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(GetBufferDescriptor(buffer - 1)), + LW_EXCLUSIVE)); + + byteptr = PageGetContents(BufferGetPage(buffer)) + byteno; curval = (*byteptr 
>> bshift) & CLOG_XACT_BITMASK; /* @@ -602,8 +602,9 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i byteval |= (status << bshift); *byteptr = byteval; + /* - * Update the group LSN if the transaction completion LSN is higher. + * Update the buffer LSN if the transaction completion LSN is higher. * * Note: lsn will be invalid when supplied during InRecovery processing, * so we don't need to do anything special to avoid LSN updates during @@ -612,10 +613,8 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i */ if (!XLogRecPtrIsInvalid(lsn)) { - int lsnindex = GetLSNIndex(slotno, xid); - - if (XactCtl->shared->group_lsn[lsnindex] < lsn) - XactCtl->shared->group_lsn[lsnindex] = lsn; + if (PageGetLSN(BufferGetPage(buffer)) < lsn) + PageSetLSN(BufferGetPage(buffer), lsn); } } @@ -640,67 +639,21 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn) int pageno = TransactionIdToPage(xid); int byteno = TransactionIdToByte(xid); int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; - int slotno; - int lsnindex; char *byteptr; XidStatus status; + Buffer buffer; - /* lock is acquired by SimpleLruReadPage_ReadOnly */ - - slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid); - byteptr = XactCtl->shared->page_buffer[slotno] + byteno; + buffer = ReadSlruBuffer(SLRU_CLOG_ID, pageno, RBM_NORMAL); + byteptr = PageGetContents(BufferGetPage(buffer)) + byteno; status = (*byteptr >> bshift) & CLOG_XACT_BITMASK; + *lsn = PageGetLSN(BufferGetPage(buffer)); - lsnindex = GetLSNIndex(slotno, xid); - *lsn = XactCtl->shared->group_lsn[lsnindex]; - - LWLockRelease(XactSLRULock); + ReleaseBuffer(buffer); return status; } -/* - * Number of shared CLOG buffers. - * - * On larger multi-processor systems, it is possible to have many CLOG page - * requests in flight at one time which could lead to disk access for CLOG - * page if the required page is not found in memory. 
Testing revealed that we - * can get the best performance by having 128 CLOG buffers, more than that it - * doesn't improve performance. - * - * Unconditionally keeping the number of CLOG buffers to 128 did not seem like - * a good idea, because it would increase the minimum amount of shared memory - * required to start, which could be a problem for people running very small - * configurations. The following formula seems to represent a reasonable - * compromise: people with very low values for shared_buffers will get fewer - * CLOG buffers as well, and everyone else will get 128. - */ -Size -CLOGShmemBuffers(void) -{ - return Min(128, Max(4, NBuffers / 512)); -} - -/* - * Initialization of shared memory for CLOG - */ -Size -CLOGShmemSize(void) -{ - return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE); -} - -void -CLOGShmemInit(void) -{ - XactCtl->PagePrecedes = CLOGPagePrecedes; - SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE, - XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER, - SYNC_HANDLER_CLOG); - SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE); -} - /* * This func must be called ONCE on system install. It creates * the initial CLOG segment. (The CLOG directory is assumed to @@ -710,18 +663,15 @@ CLOGShmemInit(void) void BootStrapCLOG(void) { - int slotno; - - LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); + Buffer buffer; /* Create and zero the first page of the commit log */ - slotno = ZeroCLOGPage(0, false); + buffer = ZeroCLOGPage(0, false); /* Make sure it's written out */ - SimpleLruWritePage(XactCtl, slotno); - Assert(!XactCtl->shared->page_dirty[slotno]); + FlushOneBuffer(buffer); - LWLockRelease(XactSLRULock); + UnlockReleaseBuffer(buffer); } /* @@ -733,17 +683,29 @@ BootStrapCLOG(void) * * Control lock must be held at entry, and will be held at exit. 
*/ -static int +static Buffer ZeroCLOGPage(int pageno, bool writeXlog) { - int slotno; + Buffer buffer; + Page page; + XLogRecPtr lsn; + + buffer = ZeroSlruBuffer(SLRU_CLOG_ID, pageno); + + page = BufferGetPage(buffer); - slotno = SimpleLruZeroPage(XactCtl, pageno); + PageInitSLRU(page, BLCKSZ, 0); + lsn = 0; if (writeXlog) - WriteZeroPageXlogRec(pageno); + { + lsn = WriteZeroPageXlogRec(pageno); + } + + PageSetHeaderDataNonRel(page, pageno, lsn, BLCKSZ, PG_METAPAGE_LAYOUT_VERSION); + MarkBufferDirty(buffer); - return slotno; + return buffer; } /* @@ -753,17 +715,6 @@ ZeroCLOGPage(int pageno, bool writeXlog) void StartupCLOG(void) { - TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); - int pageno = TransactionIdToPage(xid); - - LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); - - /* - * Initialize our idea of the latest page number. - */ - XactCtl->shared->latest_page_number = pageno; - - LWLockRelease(XactSLRULock); } /* @@ -775,8 +726,6 @@ TrimCLOG(void) TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); int pageno = TransactionIdToPage(xid); - LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); - /* * Zero out the remainder of the current clog page. 
Under normal * circumstances it should be zeroes already, but it seems at least @@ -793,40 +742,25 @@ TrimCLOG(void) { int byteno = TransactionIdToByte(xid); int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; - int slotno; char *byteptr; + Buffer buffer; - slotno = SimpleLruReadPage(XactCtl, pageno, false, xid); - byteptr = XactCtl->shared->page_buffer[slotno] + byteno; + buffer = ReadSlruBuffer(SLRU_CLOG_ID, pageno, RBM_TRIM); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + byteptr = PageGetContents(BufferGetPage(buffer)) + byteno; /* Zero so-far-unused positions in the current byte */ *byteptr &= (1 << bshift) - 1; - /* Zero the rest of the page */ - MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1); + /* Zero the rest of the page contents; byteno is relative to the area after the page header, so stop at the end of the block */ + MemSet(byteptr + 1, 0, BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - byteno - 1); - XactCtl->shared->page_dirty[slotno] = true; - } - LWLockRelease(XactSLRULock); -} + MarkBufferDirty(buffer); -/* - * Perform a checkpoint --- either during shutdown, or on-the-fly - */ -void -CheckPointCLOG(void) -{ - /* - * Write dirty CLOG pages to disk. This may result in sync requests - * queued for later handling by ProcessSyncRequests(), as part of the - * checkpoint. - */ - TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true); - SimpleLruWriteAll(XactCtl, true); - TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true); + UnlockReleaseBuffer(buffer); + } } - /* * Make sure that CLOG has room for a newly-allocated XID.
* @@ -850,12 +784,8 @@ ExtendCLOG(TransactionId newestXact) pageno = TransactionIdToPage(newestXact); - LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroCLOGPage(pageno, true); - - LWLockRelease(XactSLRULock); + UnlockReleaseBuffer(ZeroCLOGPage(pageno, true)); } @@ -886,7 +816,8 @@ TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid) cutoffPage = TransactionIdToPage(oldestXact); /* Check to see if there's any files that could be removed */ - if (!SlruScanDirectory(XactCtl, SlruScanDirCbReportPresence, &cutoffPage)) + if (!SlruScanDirectory(SLRU_CLOG_ID, CLOGPagePrecedes, + SlruScanDirCbReportPresence, &cutoffPage)) return; /* nothing to remove */ /* @@ -907,7 +838,7 @@ TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid) WriteTruncateXlogRec(cutoffPage, oldestXact, oldestxid_datoid); /* Now we can remove the old CLOG segment(s) */ - SimpleLruTruncate(XactCtl, cutoffPage); + SimpleLruTruncate(SLRU_CLOG_ID, CLOGPagePrecedes, cutoffPage); } @@ -948,12 +879,15 @@ CLOGPagePrecedes(int page1, int page2) /* * Write a ZEROPAGE xlog record */ -static void +static XLogRecPtr WriteZeroPageXlogRec(int pageno) { + XLogRecPtr lsn; XLogBeginInsert(); XLogRegisterData((char *) (&pageno), sizeof(int)); - (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE); + lsn = XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE); + + return lsn; } /* @@ -992,17 +926,13 @@ clog_redo(XLogReaderState *record) if (info == CLOG_ZEROPAGE) { int pageno; - int slotno; + Buffer buffer; memcpy(&pageno, XLogRecGetData(record), sizeof(int)); - LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); - - slotno = ZeroCLOGPage(pageno, false); - SimpleLruWritePage(XactCtl, slotno); - Assert(!XactCtl->shared->page_dirty[slotno]); - - LWLockRelease(XactSLRULock); + buffer = ZeroCLOGPage(pageno, false); + FlushOneBuffer(buffer); + UnlockReleaseBuffer(buffer); } else if (info == CLOG_TRUNCATE) { @@ -1012,17 +942,8 @@ clog_redo(XLogReaderState *record) 
AdvanceOldestClogXid(xlrec.oldestXact); - SimpleLruTruncate(XactCtl, xlrec.pageno); + SimpleLruTruncate(SLRU_CLOG_ID, CLOGPagePrecedes, xlrec.pageno); } else elog(PANIC, "clog_redo: unknown op code %u", info); } - -/* - * Entrypoint for sync.c to sync clog files. - */ -int -clogsyncfiletag(const FileTag *ftag, char *path) -{ - return SlruSyncFileTag(XactCtl, ftag, path); -} diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index 9aa4675cb79..af88abd5c84 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -63,19 +63,14 @@ typedef struct CommitTimestampEntry sizeof(RepOriginId)) #define COMMIT_TS_XACTS_PER_PAGE \ - (BLCKSZ / SizeOfCommitTimestampEntry) + ((BLCKSZ - SizeOfPageHeaderData) / SizeOfCommitTimestampEntry) #define TransactionIdToCTsPage(xid) \ ((xid) / (TransactionId) COMMIT_TS_XACTS_PER_PAGE) #define TransactionIdToCTsEntry(xid) \ ((xid) % (TransactionId) COMMIT_TS_XACTS_PER_PAGE) -/* - * Link to shared-memory data structures for CommitTs control - */ -static SlruCtlData CommitTsCtlData; -#define CommitTsCtl (&CommitTsCtlData) /* * We keep a cache of the last value set in shared memory. 
@@ -107,11 +102,12 @@ static void SetXidCommitTsInPage(TransactionId xid, int nsubxids, static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, RepOriginId nodeid, int slotno); static void error_commit_ts_disabled(void); -static int ZeroCommitTsPage(int pageno, bool writeXlog); +static Buffer ZeroCommitTsPage(int pageno, bool writeXlog); static bool CommitTsPagePrecedes(int page1, int page2); static void ActivateCommitTs(void); static void DeactivateCommitTs(void); -static void WriteZeroPageXlogRec(int pageno); +static XLogRecPtr WriteZeroPageXlogRec(int pageno); + static void WriteTruncateXlogRec(int pageno, TransactionId oldestXid); /* @@ -216,32 +212,31 @@ SetXidCommitTsInPage(TransactionId xid, int nsubxids, TransactionId *subxids, TimestampTz ts, RepOriginId nodeid, int pageno) { - int slotno; int i; + Buffer buffer; - LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); + buffer = ReadSlruBuffer(SLRU_COMMIT_TS_ID, pageno, RBM_NORMAL); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); - slotno = SimpleLruReadPage(CommitTsCtl, pageno, true, xid); - - TransactionIdSetCommitTs(xid, ts, nodeid, slotno); + TransactionIdSetCommitTs(xid, ts, nodeid, buffer); for (i = 0; i < nsubxids; i++) - TransactionIdSetCommitTs(subxids[i], ts, nodeid, slotno); + TransactionIdSetCommitTs(subxids[i], ts, nodeid, buffer); - CommitTsCtl->shared->page_dirty[slotno] = true; + MarkBufferDirty(buffer); - LWLockRelease(CommitTsSLRULock); + UnlockReleaseBuffer(buffer); } /* * Sets the commit timestamp of a single transaction. 
- * - * Must be called with CommitTsSLRULock held */ static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, - RepOriginId nodeid, int slotno) + RepOriginId nodeid, Buffer buffer) { int entryno = TransactionIdToCTsEntry(xid); + int pageno = TransactionIdToCTsPage(xid); + CommitTimestampEntry entry; Assert(TransactionIdIsNormal(xid)); @@ -249,9 +244,12 @@ TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts, entry.time = ts; entry.nodeid = nodeid; - memcpy(CommitTsCtl->shared->page_buffer[slotno] + + Assert(xid == pageno * COMMIT_TS_XACTS_PER_PAGE + entryno); + + memcpy(PageGetContents(BufferGetPage(buffer)) + \ SizeOfCommitTimestampEntry * entryno, &entry, SizeOfCommitTimestampEntry); + } /* @@ -268,10 +266,10 @@ TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts, { int pageno = TransactionIdToCTsPage(xid); int entryno = TransactionIdToCTsEntry(xid); - int slotno; CommitTimestampEntry entry; TransactionId oldestCommitTsXid; TransactionId newestCommitTsXid; + Buffer buffer; if (!TransactionIdIsValid(xid)) ereport(ERROR, @@ -325,18 +323,19 @@ TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts, return false; } - /* lock is acquired by SimpleLruReadPage_ReadOnly */ - slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid); + buffer = ReadSlruBuffer(SLRU_COMMIT_TS_ID, pageno, RBM_NORMAL); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + memcpy(&entry, - CommitTsCtl->shared->page_buffer[slotno] + - SizeOfCommitTimestampEntry * entryno, - SizeOfCommitTimestampEntry); + PageGetContents(BufferGetPage(buffer)) + \ + SizeOfCommitTimestampEntry * entryno, + SizeOfCommitTimestampEntry); *ts = entry.time; if (nodeid) *nodeid = entry.nodeid; - LWLockRelease(CommitTsSLRULock); + UnlockReleaseBuffer(buffer); return *ts != 0; } @@ -505,27 +504,13 @@ pg_xact_commit_timestamp_origin(PG_FUNCTION_ARGS) PG_RETURN_DATUM(HeapTupleGetDatum(htup)); } -/* - * Number of shared CommitTS buffers. 
- * - * We use a very similar logic as for the number of CLOG buffers (except we - * scale up twice as fast with shared buffers, and the maximum is twice as - * high); see comments in CLOGShmemBuffers. - */ -Size -CommitTsShmemBuffers(void) -{ - return Min(256, Max(4, NBuffers / 256)); -} - /* * Shared memory sizing for CommitTs */ Size CommitTsShmemSize(void) { - return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) + - sizeof(CommitTimestampShared); + return sizeof(CommitTimestampShared); } /* @@ -537,12 +522,7 @@ CommitTsShmemInit(void) { bool found; - CommitTsCtl->PagePrecedes = CommitTsPagePrecedes; - SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0, - CommitTsSLRULock, "pg_commit_ts", - LWTRANCHE_COMMITTS_BUFFER, - SYNC_HANDLER_COMMIT_TS); - SlruPagePrecedesUnitTests(CommitTsCtl, COMMIT_TS_XACTS_PER_PAGE); + SlruPagePrecedesUnitTests(CommitTsPagePrecedes, COMMIT_TS_XACTS_PER_PAGE); commitTsShared = ShmemInitStruct("CommitTs shared", sizeof(CommitTimestampShared), @@ -586,17 +566,26 @@ BootStrapCommitTs(void) * * Control lock must be held at entry, and will be held at exit. */ -static int +static Buffer ZeroCommitTsPage(int pageno, bool writeXlog) { - int slotno; - - slotno = SimpleLruZeroPage(CommitTsCtl, pageno); - + Buffer buffer; + Page page; + XLogRecPtr lsn = 0; + + buffer = ZeroSlruBuffer(SLRU_COMMIT_TS_ID, pageno); + + page = BufferGetPage(buffer); + PageInitSLRU(page, BLCKSZ, 0); + /* lsn stays 0 unless a WAL record is written; the header is stamped either way, as in ZeroCLOGPage */ if (writeXlog) - WriteZeroPageXlogRec(pageno); + { + lsn = WriteZeroPageXlogRec(pageno); + } + PageSetHeaderDataNonRel(page, pageno, lsn, BLCKSZ, PG_METAPAGE_LAYOUT_VERSION); + MarkBufferDirty(buffer); - return slotno; + return buffer; } /* @@ -694,13 +683,6 @@ ActivateCommitTs(void) xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); pageno = TransactionIdToCTsPage(xid); - /* - * Re-Initialize our idea of the latest page number.
- */ - LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); - CommitTsCtl->shared->latest_page_number = pageno; - LWLockRelease(CommitTsSLRULock); - /* * If CommitTs is enabled, but it wasn't in the previous server run, we * need to set the oldest and newest values to the next Xid; that way, we @@ -723,15 +705,19 @@ ActivateCommitTs(void) LWLockRelease(CommitTsLock); /* Create the current segment file, if necessary */ - if (!SimpleLruDoesPhysicalPageExist(CommitTsCtl, pageno)) + if (!SimpleLruDoesPhysicalPageExist(SLRU_COMMIT_TS_ID, pageno)) { - int slotno; + Buffer buffer; + Page page; - LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); - slotno = ZeroCommitTsPage(pageno, false); - SimpleLruWritePage(CommitTsCtl, slotno); - Assert(!CommitTsCtl->shared->page_dirty[slotno]); - LWLockRelease(CommitTsSLRULock); + buffer = ZeroSlruBuffer(SLRU_COMMIT_TS_ID, pageno); + + page = BufferGetPage(buffer); + PageInitSLRU(page, BLCKSZ, 0); + + MarkBufferDirty(buffer); + FlushOneBuffer(buffer); + UnlockReleaseBuffer(buffer); } /* Change the activation status in shared memory. */ @@ -780,23 +766,9 @@ DeactivateCommitTs(void) * be overwritten anyway when we wrap around, but it seems better to be * tidy.) */ - LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); - (void) SlruScanDirectory(CommitTsCtl, SlruScanDirCbDeleteAll, NULL); - LWLockRelease(CommitTsSLRULock); -} - -/* - * Perform a checkpoint --- either during shutdown, or on-the-fly - */ -void -CheckPointCommitTs(void) -{ - /* - * Write dirty CommitTs pages to disk. This may result in sync requests - * queued for later handling by ProcessSyncRequests(), as part of the - * checkpoint. 
- */ - SimpleLruWriteAll(CommitTsCtl, true); + (void) SlruScanDirectory(SLRU_COMMIT_TS_ID, + CommitTsPagePrecedes, + SlruScanDirCbDeleteAll, NULL); } /* @@ -834,12 +806,8 @@ ExtendCommitTs(TransactionId newestXact) pageno = TransactionIdToCTsPage(newestXact); - LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroCommitTsPage(pageno, !InRecovery); - - LWLockRelease(CommitTsSLRULock); + UnlockReleaseBuffer(ZeroCommitTsPage(pageno, !InRecovery)); } /* @@ -860,7 +828,9 @@ TruncateCommitTs(TransactionId oldestXact) cutoffPage = TransactionIdToCTsPage(oldestXact); /* Check to see if there's any files that could be removed */ - if (!SlruScanDirectory(CommitTsCtl, SlruScanDirCbReportPresence, + if (!SlruScanDirectory(SLRU_COMMIT_TS_ID, + CommitTsPagePrecedes, + SlruScanDirCbReportPresence, &cutoffPage)) return; /* nothing to remove */ @@ -868,7 +838,7 @@ TruncateCommitTs(TransactionId oldestXact) WriteTruncateXlogRec(cutoffPage, oldestXact); /* Now we can remove the old CommitTs segment(s) */ - SimpleLruTruncate(CommitTsCtl, cutoffPage); + SimpleLruTruncate(SLRU_COMMIT_TS_ID, CommitTsPagePrecedes, cutoffPage); } /* @@ -954,12 +924,16 @@ CommitTsPagePrecedes(int page1, int page2) /* * Write a ZEROPAGE xlog record */ -static void +static XLogRecPtr WriteZeroPageXlogRec(int pageno) { + XLogRecPtr lsn; + XLogBeginInsert(); XLogRegisterData((char *) (&pageno), sizeof(int)); - (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE); + lsn = XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE); + + return lsn; } /* @@ -992,17 +966,19 @@ commit_ts_redo(XLogReaderState *record) if (info == COMMIT_TS_ZEROPAGE) { int pageno; - int slotno; + Buffer buffer; + Page page; memcpy(&pageno, XLogRecGetData(record), sizeof(int)); - LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE); - - slotno = ZeroCommitTsPage(pageno, false); - SimpleLruWritePage(CommitTsCtl, slotno); - Assert(!CommitTsCtl->shared->page_dirty[slotno]); - - 
LWLockRelease(CommitTsSLRULock); + buffer = ZeroSlruBuffer(SLRU_COMMIT_TS_ID, pageno); + + page = BufferGetPage(buffer); + PageInitSLRU(page, BLCKSZ, 0); + + MarkBufferDirty(buffer); + FlushOneBuffer(buffer); + UnlockReleaseBuffer(buffer); } else if (info == COMMIT_TS_TRUNCATE) { @@ -1010,23 +986,8 @@ commit_ts_redo(XLogReaderState *record) AdvanceOldestCommitTsXid(trunc->oldestXid); - /* - * During XLOG replay, latest_page_number isn't set up yet; insert a - * suitable value to bypass the sanity test in SimpleLruTruncate. - */ - CommitTsCtl->shared->latest_page_number = trunc->pageno; - - SimpleLruTruncate(CommitTsCtl, trunc->pageno); + SimpleLruTruncate(SLRU_COMMIT_TS_ID, CommitTsPagePrecedes, trunc->pageno); } else elog(PANIC, "commit_ts_redo: unknown op code %u", info); } - -/* - * Entrypoint for sync.c to sync commit_ts files. - */ -int -committssyncfiletag(const FileTag *ftag, char *path) -{ - return SlruSyncFileTag(CommitTsCtl, ftag, path); -} diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index e1191a7564c..358c67fa65c 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -106,7 +106,7 @@ */ /* We need four bytes per offset */ -#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset)) +#define MULTIXACT_OFFSETS_PER_PAGE ((BLCKSZ - SizeOfPageHeaderData)/ sizeof(MultiXactOffset)) #define MultiXactIdToOffsetPage(xid) \ ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) @@ -138,7 +138,7 @@ /* size in bytes of a complete group */ #define MULTIXACT_MEMBERGROUP_SIZE \ (sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP) -#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +#define MULTIXACT_MEMBERGROUPS_PER_PAGE ((BLCKSZ - SizeOfPageHeaderData) / MULTIXACT_MEMBERGROUP_SIZE) #define MULTIXACT_MEMBERS_PER_PAGE \ (MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP) @@ -161,9 
+161,9 @@ /* Location (byte offset within page) of flag word for a given member */ #define MXOffsetToFlagsOffset(xid) \ - ((((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \ + (((((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) % \ (TransactionId) MULTIXACT_MEMBERGROUPS_PER_PAGE) * \ - (TransactionId) MULTIXACT_MEMBERGROUP_SIZE) + (TransactionId) MULTIXACT_MEMBERGROUP_SIZE)) #define MXOffsetToFlagsBitShift(xid) \ (((xid) % (TransactionId) MULTIXACT_MEMBERS_PER_MEMBERGROUP) * \ MXACT_MEMBER_BITS_PER_XACT) @@ -181,15 +181,6 @@ #define PreviousMultiXactId(xid) \ ((xid) == FirstMultiXactId ? MaxMultiXactId : (xid) - 1) -/* - * Links to shared-memory data structures for MultiXact control - */ -static SlruCtlData MultiXactOffsetCtlData; -static SlruCtlData MultiXactMemberCtlData; - -#define MultiXactOffsetCtl (&MultiXactOffsetCtlData) -#define MultiXactMemberCtl (&MultiXactMemberCtlData) - /* * MultiXact state shared across all backends. All this state is protected * by MultiXactGenLock. 
(We also use MultiXactOffsetSLRULock and @@ -339,8 +330,8 @@ static MemoryContext MXactContext = NULL; /* internal MultiXactId management */ static void MultiXactIdSetOldestVisible(void); static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, - int nmembers, MultiXactMember *members); -static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset); + int nmembers, MultiXactMember *members, Buffer * offset_buf_ptr, Buffer * member_bufs); +static MultiXactId GetNewMultiXactId(int nmembers, MultiXactOffset *offset, Buffer * offset_buf, Buffer ** member_bufs); /* MultiXact cache management */ static int mxactMemberComparator(const void *arg1, const void *arg2); @@ -352,19 +343,18 @@ static void mXactCachePut(MultiXactId multi, int nmembers, static char *mxstatus_to_string(MultiXactStatus status); /* management of SLRU infrastructure */ -static int ZeroMultiXactOffsetPage(int pageno, bool writeXlog); -static int ZeroMultiXactMemberPage(int pageno, bool writeXlog); +static Buffer ZeroMultiXactOffsetPage(int pageno, bool writeXlog); +static Buffer ZeroMultiXactMemberPage(int pageno, bool writeXlog); static bool MultiXactOffsetPagePrecedes(int page1, int page2); -static bool MultiXactMemberPagePrecedes(int page1, int page2); static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2); -static void ExtendMultiXactOffset(MultiXactId multi); -static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); +static void ExtendMultiXactOffset(MultiXactId multi, Buffer * buffer); +static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers, Buffer ** member_buffers); static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, uint32 distance); static bool SetOffsetVacuumLimit(bool is_startup); static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); -static void WriteMZeroPageXlogRec(int pageno, uint8 info); +static XLogRecPtr WriteMZeroPageXlogRec(int 
pageno, uint8 info); static void WriteMTruncateXlogRec(Oid oldestMultiDB, MultiXactId startTruncOff, MultiXactId endTruncOff, @@ -765,6 +755,9 @@ ReadMultiXactIdRange(MultiXactId *oldest, MultiXactId *next) MultiXactId MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members) { + Buffer * member_bufs; + Buffer offset_buff; + MultiXactId multi; MultiXactOffset offset; xl_multixact_create xlrec; @@ -818,7 +811,8 @@ MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members) * in vacuum. During vacuum, in particular, it would be unacceptable to * keep OldestMulti set, in case it runs for long. */ - multi = GetNewMultiXactId(nmembers, &offset); + + multi = GetNewMultiXactId(nmembers, &offset, &offset_buff, &member_bufs); /* Make an XLOG entry describing the new MXID. */ xlrec.mid = multi; @@ -838,8 +832,8 @@ MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members) (void) XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_CREATE_ID); /* Now enter the information into the OFFSETs and MEMBERs logs */ - RecordNewMultiXact(multi, offset, nmembers, members); - + RecordNewMultiXact(multi, offset, nmembers, members, &offset_buff, member_bufs); + /* Done with critical section */ END_CRIT_SECTION(); @@ -860,40 +854,38 @@ MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members) */ static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, - int nmembers, MultiXactMember *members) + int nmembers, MultiXactMember *members, Buffer * offset_buf_ptr, Buffer * member_bufs) { int pageno; int prev_pageno; + int min_pageno; int entryno; - int slotno; MultiXactOffset *offptr; int i; - - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + Buffer buffer; + Buffer offset_buf; pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); - /* - * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction" - * to complain about if there's any I/O error. 
This is kinda bogus, but - * since the errors will always give the full pathname, it should be clear - * enough that a MultiXactId is really involved. Perhaps someday we'll - * take the trouble to generalize the slru.c error reporting code. - */ - slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); - offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; - offptr += entryno; + if (offset_buf_ptr) + offset_buf = *offset_buf_ptr; + else + offset_buf = ReadSlruBuffer(SLRU_MULTIXACT_OFFSET_ID, pageno, RBM_NORMAL); - *offptr = offset; + LockBuffer(offset_buf, BUFFER_LOCK_EXCLUSIVE); + - MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + offptr = (MultiXactOffset *) PageGetContents(BufferGetPage(offset_buf)); + offptr += entryno; - /* Exchange our lock */ - LWLockRelease(MultiXactOffsetSLRULock); + *offptr = offset; - LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + MarkBufferDirty(offset_buf); + UnlockReleaseBuffer(offset_buf); + buffer = InvalidBuffer; + min_pageno = MXOffsetToMemberPage(offset); prev_pageno = -1; for (i = 0; i < nmembers; i++, offset++) @@ -914,27 +906,35 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, if (pageno != prev_pageno) { - slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + if (member_bufs) + buffer = member_bufs[pageno - min_pageno]; + else + buffer = ReadSlruBuffer(SLRU_MULTIXACT_MEMBER_ID, pageno, RBM_NORMAL); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); prev_pageno = pageno; } - memberptr = (TransactionId *) - (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); + memberptr = (TransactionId *) (PageGetContents(BufferGetPage(buffer)) + memberoff); *memberptr = members[i].xid; - flagsptr = (uint32 *) - (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); + flagsptr = (uint32 *) (PageGetContents(BufferGetPage(buffer)) + flagsoff); flagsval = *flagsptr; flagsval &= ~(((1 
<< MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); flagsval |= (members[i].status << bshift); *flagsptr = flagsval; - MultiXactMemberCtl->shared->page_dirty[slotno] = true; + MarkBufferDirty(buffer); } - LWLockRelease(MultiXactMemberSLRULock); + UnlockReleaseBuffer(buffer); + if (member_bufs != NULL) + pfree(member_bufs); } /* @@ -953,8 +953,11 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, * caller must end the critical section after writing SLRU data. */ static MultiXactId -GetNewMultiXactId(int nmembers, MultiXactOffset *offset) +GetNewMultiXactId(int nmembers, MultiXactOffset *offset, Buffer * offset_buf, Buffer ** member_bufs) { + int min_pageno; + int max_pageno; + MultiXactId result; MultiXactOffset nextOffset; @@ -1072,7 +1075,7 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) } /* Make sure there is room for the MXID in the file. */ - ExtendMultiXactOffset(result); + ExtendMultiXactOffset(result, offset_buf); /* * Reserve the members space, similarly to above. 
Also, be careful not to @@ -1160,7 +1163,12 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) MultiXactState->offsetStopLimit - nextOffset + nmembers), errhint("Execute a database-wide VACUUM in that database with reduced vacuum_multixact_freeze_min_age and vacuum_multixact_freeze_table_age settings."))); - ExtendMultiXactMember(nextOffset, nmembers); + min_pageno = MXOffsetToMemberPage(nextOffset); + max_pageno = MXOffsetToMemberPage(nextOffset + nmembers - 1); + + *member_bufs = (Buffer *) palloc(sizeof(Buffer) * (max_pageno - min_pageno + 1)); + + ExtendMultiXactMember(nextOffset, nmembers, member_bufs); /* * Critical section from here until caller has written the data into the @@ -1226,7 +1234,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, int pageno; int prev_pageno; int entryno; - int slotno; MultiXactOffset *offptr; MultiXactOffset offset; int length; @@ -1237,6 +1244,7 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, MultiXactId tmpMXact; MultiXactOffset nextOffset; MultiXactMember *ptr; + Buffer buffer; debug_elog3(DEBUG2, "GetMembers: asked for %u", multi); @@ -1340,13 +1348,12 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * time on every multixact creation. 
*/ retry: - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); - pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); - slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); - offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + buffer = ReadSlruBuffer(SLRU_MULTIXACT_OFFSET_ID, pageno, RBM_NORMAL); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + offptr = (MultiXactOffset *) (PageGetContents(BufferGetPage(buffer))); offptr += entryno; offset = *offptr; @@ -1377,16 +1384,20 @@ retry: entryno = MultiXactIdToOffsetEntry(tmpMXact); if (pageno != prev_pageno) - slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact); + { + UnlockReleaseBuffer(buffer); + buffer = ReadSlruBuffer(SLRU_MULTIXACT_OFFSET_ID, pageno, RBM_NORMAL); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + } - offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + offptr = (MultiXactOffset *) (PageGetContents(BufferGetPage(buffer))); offptr += entryno; nextMXOffset = *offptr; if (nextMXOffset == 0) { /* Corner case 2: next multixact is still being filled in */ - LWLockRelease(MultiXactOffsetSLRULock); + UnlockReleaseBuffer(buffer); CHECK_FOR_INTERRUPTS(); pg_usleep(1000L); goto retry; @@ -1394,14 +1405,11 @@ retry: length = nextMXOffset - offset; } - - LWLockRelease(MultiXactOffsetSLRULock); + UnlockReleaseBuffer(buffer); + buffer = InvalidBuffer; ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); - /* Now get the members themselves. 
*/ - LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); - truelength = 0; prev_pageno = -1; for (i = 0; i < length; i++, offset++) @@ -1417,12 +1425,14 @@ retry: if (pageno != prev_pageno) { - slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, multi); + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + buffer = ReadSlruBuffer(SLRU_MULTIXACT_MEMBER_ID, pageno, RBM_NORMAL); + LockBuffer(buffer, BUFFER_LOCK_SHARE); prev_pageno = pageno; } - xactptr = (TransactionId *) - (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); + xactptr = (TransactionId *) (PageGetContents(BufferGetPage(buffer)) + memberoff); if (!TransactionIdIsValid(*xactptr)) { @@ -1433,14 +1443,13 @@ retry: flagsoff = MXOffsetToFlagsOffset(offset); bshift = MXOffsetToFlagsBitShift(offset); - flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff); + flagsptr = (uint32 *) (PageGetContents(BufferGetPage(buffer)) + flagsoff); ptr[truelength].xid = *xactptr; ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK; truelength++; } - - LWLockRelease(MultiXactMemberSLRULock); + UnlockReleaseBuffer(buffer); /* A multixid with zero members should not happen */ Assert(truelength > 0); @@ -1832,8 +1841,6 @@ MultiXactShmemSize(void) mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot)) size = SHARED_MULTIXACT_STATE_SIZE; - size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS, 0)); - size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS, 0)); return size; } @@ -1845,22 +1852,6 @@ MultiXactShmemInit(void) debug_elog2(DEBUG2, "Shared Memory Init for MultiXact"); - MultiXactOffsetCtl->PagePrecedes = MultiXactOffsetPagePrecedes; - MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes; - - SimpleLruInit(MultiXactOffsetCtl, - "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0, - MultiXactOffsetSLRULock, "pg_multixact/offsets", - LWTRANCHE_MULTIXACTOFFSET_BUFFER, - SYNC_HANDLER_MULTIXACT_OFFSET); -
SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE); - SimpleLruInit(MultiXactMemberCtl, - "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0, - MultiXactMemberSLRULock, "pg_multixact/members", - LWTRANCHE_MULTIXACTMEMBER_BUFFER, - SYNC_HANDLER_MULTIXACT_MEMBER); - /* doesn't call SimpleLruTruncate() or meet criteria for unit tests */ - /* Initialize our shared state struct */ MultiXactState = ShmemInitStruct("Shared MultiXact State", SHARED_MULTIXACT_STATE_SIZE, @@ -1891,29 +1882,17 @@ MultiXactShmemInit(void) void BootStrapMultiXact(void) { - int slotno; - - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + Buffer buffer; /* Create and zero the first page of the offsets log */ - slotno = ZeroMultiXactOffsetPage(0, false); - - /* Make sure it's written out */ - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - - LWLockRelease(MultiXactOffsetSLRULock); - - LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); + buffer = ZeroMultiXactOffsetPage(0, false); + FlushOneBuffer(buffer); + UnlockReleaseBuffer(buffer); /* Create and zero the first page of the members log */ - slotno = ZeroMultiXactMemberPage(0, false); - - /* Make sure it's written out */ - SimpleLruWritePage(MultiXactMemberCtl, slotno); - Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); - - LWLockRelease(MultiXactMemberSLRULock); + buffer = ZeroMultiXactMemberPage(0, false); + FlushOneBuffer(buffer); + UnlockReleaseBuffer(buffer); } /* @@ -1925,33 +1904,54 @@ BootStrapMultiXact(void) * * Control lock must be held at entry, and will be held at exit. 
*/ -static int +static Buffer ZeroMultiXactOffsetPage(int pageno, bool writeXlog) { - int slotno; + Buffer buffer; + Page page; + XLogRecPtr recptr; - slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); + buffer = ZeroSlruBuffer(SLRU_MULTIXACT_OFFSET_ID, pageno); + page = BufferGetPage(buffer); + PageInitSLRU(page, BLCKSZ, 0); + recptr = 0; + if (writeXlog) - WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE); + { + recptr = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE); + } + + PageSetHeaderDataNonRel(page, pageno, recptr, BLCKSZ, PG_METAPAGE_LAYOUT_VERSION); + MarkBufferDirty(buffer); - return slotno; + return buffer; } /* * Ditto, for MultiXactMember */ -static int +static Buffer ZeroMultiXactMemberPage(int pageno, bool writeXlog) { - int slotno; - - slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno); - + Buffer buffer; + Page page; + XLogRecPtr recptr; + + buffer = ZeroSlruBuffer(SLRU_MULTIXACT_MEMBER_ID, pageno); + page = BufferGetPage(buffer); + PageInitSLRU(page, BLCKSZ, 0); + + recptr = 0; if (writeXlog) - WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE); - - return slotno; + { + recptr = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE); + } + + PageSetHeaderDataNonRel(page, pageno, recptr, BLCKSZ, PG_METAPAGE_LAYOUT_VERSION); + MarkBufferDirty(buffer); + + return buffer; } /* @@ -1976,22 +1976,14 @@ MaybeExtendOffsetSlru(void) pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact); - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); - - if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) + if (!SimpleLruDoesPhysicalPageExist(SLRU_MULTIXACT_OFFSET_ID, pageno)) { - int slotno; + Buffer buffer; - /* - * Fortunately for us, SimpleLruWritePage is already prepared to deal - * with creating a new segment file even if the page we're writing is - * not the first in it, so this is enough. 
- */ - slotno = ZeroMultiXactOffsetPage(pageno, false); - SimpleLruWritePage(MultiXactOffsetCtl, slotno); + buffer = ZeroMultiXactOffsetPage(pageno, false); + FlushOneBuffer(buffer); + UnlockReleaseBuffer(buffer); } - - LWLockRelease(MultiXactOffsetSLRULock); } /* @@ -2005,21 +1997,6 @@ MaybeExtendOffsetSlru(void) void StartupMultiXact(void) { - MultiXactId multi = MultiXactState->nextMXact; - MultiXactOffset offset = MultiXactState->nextOffset; - int pageno; - - /* - * Initialize offset's idea of the latest page number. - */ - pageno = MultiXactIdToOffsetPage(multi); - MultiXactOffsetCtl->shared->latest_page_number = pageno; - - /* - * Initialize member's idea of the latest page number. - */ - pageno = MXOffsetToMemberPage(offset); - MultiXactMemberCtl->shared->latest_page_number = pageno; } /* @@ -2043,14 +2020,8 @@ TrimMultiXact(void) oldestMXactDB = MultiXactState->oldestMultiXactDB; LWLockRelease(MultiXactGenLock); - /* Clean up offsets state */ - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); - - /* - * (Re-)Initialize our idea of the latest page number for offsets. - */ pageno = MultiXactIdToOffsetPage(nextMXact); - MultiXactOffsetCtl->shared->latest_page_number = pageno; + /* * Zero out the remainder of the current offsets page. 
See notes in @@ -2063,46 +2034,39 @@ TrimMultiXact(void) entryno = MultiXactIdToOffsetEntry(nextMXact); if (entryno != 0) { - int slotno; MultiXactOffset *offptr; + Buffer buffer; - slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); - offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + buffer = ReadSlruBuffer(SLRU_MULTIXACT_OFFSET_ID, pageno, RBM_TRIM); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + offptr = (MultiXactOffset *) (PageGetContents(BufferGetPage(buffer))); offptr += entryno; - MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); + MemSet(offptr, 0, BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - (entryno * sizeof(MultiXactOffset))); - MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); } - LWLockRelease(MultiXactOffsetSLRULock); - - /* And the same for members */ - LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); - - /* - * (Re-)Initialize our idea of the latest page number for members. - */ - pageno = MXOffsetToMemberPage(offset); - MultiXactMemberCtl->shared->latest_page_number = pageno; - /* * Zero out the remainder of the current members page. See notes in * TrimCLOG() for motivation. 
*/ + + pageno = MXOffsetToMemberPage(offset); flagsoff = MXOffsetToFlagsOffset(offset); if (flagsoff != 0) { - int slotno; TransactionId *xidptr; int memberoff; + Buffer buffer; memberoff = MXOffsetToMemberOffset(offset); - slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset); - xidptr = (TransactionId *) - (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff); + buffer = ReadSlruBuffer(SLRU_MULTIXACT_MEMBER_ID, pageno, RBM_TRIM); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + xidptr = (TransactionId *) (PageGetContents(BufferGetPage(buffer)) + memberoff); - MemSet(xidptr, 0, BLCKSZ - memberoff); + MemSet(xidptr, 0, BLCKSZ - memberoff - MAXALIGN(SizeOfPageHeaderData)); /* * Note: we don't need to zero out the flag bits in the remaining @@ -2110,11 +2074,10 @@ TrimMultiXact(void) * writing. */ - MultiXactMemberCtl->shared->page_dirty[slotno] = true; + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); } - LWLockRelease(MultiXactMemberSLRULock); - /* signal that we're officially up */ LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); MultiXactState->finishedStartup = true; @@ -2146,25 +2109,6 @@ MultiXactGetCheckptMulti(bool is_shutdown, *nextMulti, *nextMultiOffset, *oldestMulti, *oldestMultiDB); } -/* - * Perform a checkpoint --- either during shutdown, or on-the-fly - */ -void -CheckPointMultiXact(void) -{ - TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true); - - /* - * Write dirty MultiXact pages to disk. This may result in sync requests - * queued for later handling by ProcessSyncRequests(), as part of the - * checkpoint. - */ - SimpleLruWriteAll(MultiXactOffsetCtl, true); - SimpleLruWriteAll(MultiXactMemberCtl, true); - - TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true); -} - /* * Set the next-to-be-assigned MultiXactId and offset * @@ -2399,26 +2343,30 @@ MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB) * room in shared memory. 
*/ static void -ExtendMultiXactOffset(MultiXactId multi) +ExtendMultiXactOffset(MultiXactId multi, Buffer * buffer) { int pageno; + /* - * No work except at first MultiXactId of a page. But beware: just after - * wraparound, the first MultiXactId of page zero is FirstMultiXactId. + * Make a ReadBuffer call for the page we need beforehand so that we don't need + * to malloc later. + * If we're at the first MultiXactId of a page, make sure we also zero the page */ + + pageno = MultiXactIdToOffsetPage(multi); if (MultiXactIdToOffsetEntry(multi) != 0 && multi != FirstMultiXactId) + { + /* make a read buffer call to enlarge the resource owner */ + *buffer = ReadSlruBuffer(SLRU_MULTIXACT_OFFSET_ID, pageno, RBM_NORMAL); return; - - pageno = MultiXactIdToOffsetPage(multi); - - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); - - /* Zero the page and make an XLOG entry about it */ - ZeroMultiXactOffsetPage(pageno, true); - - LWLockRelease(MultiXactOffsetSLRULock); + } else + { + /* Zero the page and make an XLOG entry about it */ + *buffer = ZeroMultiXactOffsetPage(pageno, true); + LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); /* release lock but don't unpin */ + } } /* @@ -2429,7 +2377,7 @@ ExtendMultiXactOffset(MultiXactId multi) * same comments apply. */ static void -ExtendMultiXactMember(MultiXactOffset offset, int nmembers) +ExtendMultiXactMember(MultiXactOffset offset, int nmembers, Buffer ** buffers) { /* * It's possible that the members span more than one page of the members @@ -2437,10 +2385,17 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) * optimal if the members span several pages, but that seems unusual * enough to not worry much about. 
*/ + int min_pageno; + + min_pageno = MXOffsetToMemberPage(offset); while (nmembers > 0) { + Buffer buf; + int flagsoff; int flagsbit; + int pageno; + uint32 difference; /* @@ -2448,20 +2403,24 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) */ flagsoff = MXOffsetToFlagsOffset(offset); flagsbit = MXOffsetToFlagsBitShift(offset); + pageno = MXOffsetToMemberPage(offset); + + + if (flagsoff == 0 && flagsbit == 0) { - int pageno; - - pageno = MXOffsetToMemberPage(offset); - - LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); - /* Zero the page and make an XLOG entry about it */ - ZeroMultiXactMemberPage(pageno, true); - - LWLockRelease(MultiXactMemberSLRULock); + buf = ZeroMultiXactMemberPage(pageno, true); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } else + { + /* do a read buffer call to allocate space beforehand */ + buf = ReadSlruBuffer(SLRU_MULTIXACT_MEMBER_ID, pageno, RBM_NORMAL); } + if (buffers) + (*buffers)[pageno - min_pageno] = buf; + /* * Compute the number of items till end of current page. Careful: if * addition of unsigned ints wraps around, we're at the last page of @@ -2734,8 +2693,8 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result) MultiXactOffset offset; int pageno; int entryno; - int slotno; MultiXactOffset *offptr; + Buffer buffer; Assert(MultiXactState->finishedStartup); @@ -2743,20 +2702,19 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result) entryno = MultiXactIdToOffsetEntry(multi); /* - * Write out dirty data, so PhysicalPageExists can work correctly. + * Cope with missing/bogus oldest MultiXact in inconsistent states (see + * commit 068cfadf9). 
*/ - SimpleLruWriteAll(MultiXactOffsetCtl, true); - SimpleLruWriteAll(MultiXactMemberCtl, true); - - if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) + if (!ProbeSlruBuffer(SLRU_MULTIXACT_OFFSET_ID, pageno) && + !SimpleLruDoesPhysicalPageExist(SLRU_MULTIXACT_OFFSET_ID, pageno)) return false; - /* lock is acquired by SimpleLruReadPage_ReadOnly */ - slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi); - offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + buffer = ReadSlruBuffer(SLRU_MULTIXACT_OFFSET_ID, pageno, RBM_NORMAL); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + offptr = (MultiXactOffset *) (PageGetContents(BufferGetPage(buffer))); offptr += entryno; offset = *offptr; - LWLockRelease(MultiXactOffsetSLRULock); + UnlockReleaseBuffer(buffer); *result = offset; return true; @@ -2863,12 +2821,13 @@ typedef struct mxtruncinfo * This callback determines the earliest existing page number. */ static bool -SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data) +SlruScanDirCbFindEarliest(int slru_id, SlruPagePrecedesFunction PagePrecedes, + char *filename, int segpage, void *data) { mxtruncinfo *trunc = (mxtruncinfo *) data; if (trunc->earliestExistingPage == -1 || - ctl->PagePrecedes(segpage, trunc->earliestExistingPage)) + PagePrecedes(segpage, trunc->earliestExistingPage)) { trunc->earliestExistingPage = segpage; } @@ -2900,7 +2859,7 @@ PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldest while (segment != endsegment) { elog(DEBUG2, "truncating multixact members segment %x", segment); - SlruDeleteSegment(MultiXactMemberCtl, segment); + SlruDeleteSegment(SLRU_MULTIXACT_MEMBER_ID, segment); /* move to next segment, handling wraparound correctly */ if (segment == maxsegment) @@ -2923,7 +2882,8 @@ PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti) * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound * detection. 
*/ - SimpleLruTruncate(MultiXactOffsetCtl, + SimpleLruTruncate(SLRU_MULTIXACT_OFFSET_ID, + MultiXactOffsetPagePrecedes, MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti))); } @@ -2997,7 +2957,9 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB) * been truncated away, and we crashed before updating oldestMulti. */ trunc.earliestExistingPage = -1; - SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc); + SlruScanDirectory(SLRU_MULTIXACT_OFFSET_ID, + MultiXactOffsetPagePrecedes, + SlruScanDirCbFindEarliest, &trunc); earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE; if (earliest < FirstMultiXactId) earliest = FirstMultiXactId; @@ -3129,24 +3091,6 @@ MultiXactOffsetPagePrecedes(int page1, int page2) multi2 + MULTIXACT_OFFSETS_PER_PAGE - 1)); } -/* - * Decide whether a MultiXactMember page number is "older" for truncation - * purposes. There is no "invalid offset number" so use the numbers verbatim. - */ -static bool -MultiXactMemberPagePrecedes(int page1, int page2) -{ - MultiXactOffset offset1; - MultiXactOffset offset2; - - offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE; - offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE; - - return (MultiXactOffsetPrecedes(offset1, offset2) && - MultiXactOffsetPrecedes(offset1, - offset2 + MULTIXACT_MEMBERS_PER_PAGE - 1)); -} - /* * Decide which of two MultiXactIds is earlier. 
* @@ -3191,12 +3135,16 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2) * Write an xlog record reflecting the zeroing of either a MEMBERs or * OFFSETs page (info shows which) */ -static void +static XLogRecPtr WriteMZeroPageXlogRec(int pageno, uint8 info) { + XLogRecPtr recptr; + XLogBeginInsert(); XLogRegisterData((char *) (&pageno), sizeof(int)); - (void) XLogInsert(RM_MULTIXACT_ID, info); + recptr = XLogInsert(RM_MULTIXACT_ID, info); + + return recptr; } /* @@ -3241,32 +3189,18 @@ multixact_redo(XLogReaderState *record) if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE) { int pageno; - int slotno; memcpy(&pageno, XLogRecGetData(record), sizeof(int)); - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); - - slotno = ZeroMultiXactOffsetPage(pageno, false); - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - - LWLockRelease(MultiXactOffsetSLRULock); + UnlockReleaseBuffer(ZeroMultiXactOffsetPage(pageno, false)); } else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) { int pageno; - int slotno; memcpy(&pageno, XLogRecGetData(record), sizeof(int)); - LWLockAcquire(MultiXactMemberSLRULock, LW_EXCLUSIVE); - - slotno = ZeroMultiXactMemberPage(pageno, false); - SimpleLruWritePage(MultiXactMemberCtl, slotno); - Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]); - - LWLockRelease(MultiXactMemberSLRULock); + UnlockReleaseBuffer(ZeroMultiXactMemberPage(pageno, false)); } else if (info == XLOG_MULTIXACT_CREATE_ID) { @@ -3277,7 +3211,7 @@ multixact_redo(XLogReaderState *record) /* Store the data back into the SLRU files */ RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers, - xlrec->members); + xlrec->members, NULL, NULL); /* Make sure nextMXact/nextOffset are beyond what this record has */ MultiXactAdvanceNextMXact(xlrec->mid + 1, @@ -3300,7 +3234,6 @@ multixact_redo(XLogReaderState *record) else if (info == XLOG_MULTIXACT_TRUNCATE_ID) { xl_multixact_truncate xlrec; - int pageno; 
memcpy(&xlrec, XLogRecGetData(record), SizeOfMultiXactTruncate); @@ -3326,13 +3259,6 @@ multixact_redo(XLogReaderState *record) PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb); - /* - * During XLOG replay, latest_page_number isn't necessarily set up - * yet; insert a suitable value to bypass the sanity test in - * SimpleLruTruncate. - */ - pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff); - MultiXactOffsetCtl->shared->latest_page_number = pageno; PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff); LWLockRelease(MultiXactTruncationLock); @@ -3405,21 +3331,3 @@ pg_get_multixact_members(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funccxt); } - -/* - * Entrypoint for sync.c to sync offsets files. - */ -int -multixactoffsetssyncfiletag(const FileTag *ftag, char *path) -{ - return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path); -} - -/* - * Entrypoint for sync.c to sync members files. - */ -int -multixactmemberssyncfiletag(const FileTag *ftag, char *path) -{ - return SlruSyncFileTag(MultiXactMemberCtl, ftag, path); -} diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 6feda87f574..2d8445e1307 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -1,41 +1,9 @@ /*------------------------------------------------------------------------- * * slru.c - * Simple LRU buffering for transaction status logfiles + * Simple buffering for transaction status logfiles * - * We use a simple least-recently-used scheme to manage a pool of page - * buffers. Under ordinary circumstances we expect that write - * traffic will occur mostly to the latest page (and to the just-prior - * page, soon after a page transition). Read traffic will probably touch - * a larger span of pages, but in any case a fairly small number of page - * buffers should be sufficient. So, we just search the buffers using plain - * linear search; there's no need for a hashtable or anything fancy. 
- * The management algorithm is straight LRU except that we will never swap - * out the latest page (since we know it's going to be hit again eventually). - * - * We use a control LWLock to protect the shared data structures, plus - * per-buffer LWLocks that synchronize I/O for each buffer. The control lock - * must be held to examine or modify any shared state. A process that is - * reading in or writing out a page buffer does not hold the control lock, - * only the per-buffer lock for the buffer it is working on. - * - * "Holding the control lock" means exclusive lock in all cases except for - * SimpleLruReadPage_ReadOnly(); see comments for SlruRecentlyUsed() for - * the implications of that. - * - * When initiating I/O on a buffer, we acquire the per-buffer lock exclusively - * before releasing the control lock. The per-buffer lock is released after - * completing the I/O, re-acquiring the control lock, and updating the shared - * state. (Deadlock is not possible here, because we never try to initiate - * I/O when someone else is already doing I/O on the same buffer.) - * To wait for I/O to complete, release the control lock, acquire the - * per-buffer lock in shared mode, immediately release the per-buffer lock, - * reacquire the control lock, and then recheck state (since arbitrary things - * could have happened while we didn't have the lock). - * - * As with the regular buffer manager, it is possible for another process - * to re-dirty a page that is currently being written out. This is handled - * by re-setting the page's page_dirty flag. + * XXX write me * * * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group @@ -60,562 +28,34 @@ #include "storage/fd.h" #include "storage/shmem.h" -#define SlruFileName(ctl, path, seg) \ - snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg) - /* - * During SimpleLruWriteAll(), we will usually not need to write more than one - * or two physical files, but we may need to write several pages per file. 
We - * can consolidate the I/O requests by leaving files open until control returns - * to SimpleLruWriteAll(). This data structure remembers which files are open. + * SLRU ID to path mapping */ -#define MAX_WRITEALL_BUFFERS 16 +#define PG_SLRU(symname,name,path,synchronize) \ + path, -typedef struct SlruWriteAllData +static char *slru_dirs[] = { - int num_files; /* # files actually open */ - int fd[MAX_WRITEALL_BUFFERS]; /* their FD's */ - int segno[MAX_WRITEALL_BUFFERS]; /* their log seg#s */ -} SlruWriteAllData; - -typedef struct SlruWriteAllData *SlruWriteAll; +#include "access/slrulist.h" +}; /* - * Populate a file tag describing a segment file. We only use the segment - * number, since we can derive everything else we need by having separate - * sync handler functions for clog, multixact etc. + * We'll maintain a little cache of recently seen buffers, to try to avoid the + * buffer mapping table on repeat access (ie the busy end of the CLOG). One + * entry per SLRU. */ -#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \ -( \ - memset(&(a), 0, sizeof(FileTag)), \ - (a).handler = (xx_handler), \ - (a).segno = (xx_segno) \ -) - -/* - * Macro to mark a buffer slot "most recently used". Note multiple evaluation - * of arguments! - * - * The reason for the if-test is that there are often many consecutive - * accesses to the same page (particularly the latest page). By suppressing - * useless increments of cur_lru_count, we reduce the probability that old - * pages' counts will "wrap around" and make them appear recently used. - * - * We allow this code to be executed concurrently by multiple processes within - * SimpleLruReadPage_ReadOnly(). As long as int reads and writes are atomic, - * this should not cause any completely-bogus values to enter the computation. 
- * However, it is possible for either cur_lru_count or individual - * page_lru_count entries to be "reset" to lower values than they should have, - * in case a process is delayed while it executes this macro. With care in - * SlruSelectLRUPage(), this does little harm, and in any case the absolute - * worst possible consequence is a nonoptimal choice of page to evict. The - * gain from allowing concurrent reads of SLRU pages seems worth it. - */ -#define SlruRecentlyUsed(shared, slotno) \ - do { \ - int new_lru_count = (shared)->cur_lru_count; \ - if (new_lru_count != (shared)->page_lru_count[slotno]) { \ - (shared)->cur_lru_count = ++new_lru_count; \ - (shared)->page_lru_count[slotno] = new_lru_count; \ - } \ - } while (0) - -/* Saved info for SlruReportIOError */ -typedef enum -{ - SLRU_OPEN_FAILED, - SLRU_SEEK_FAILED, - SLRU_READ_FAILED, - SLRU_WRITE_FAILED, - SLRU_FSYNC_FAILED, - SLRU_CLOSE_FAILED -} SlruErrorCause; +struct SlruRecentBuffer { + int pageno; + Buffer recent_buffer; +}; -static SlruErrorCause slru_errcause; -static int slru_errno; +static struct SlruRecentBuffer slru_recent_buffers[SLRU_NEXT_ID]; - -static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno); -static void SimpleLruWaitIO(SlruCtl ctl, int slotno); -static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata); -static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno); -static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, - SlruWriteAll fdata); -static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid); -static int SlruSelectLRUPage(SlruCtl ctl, int pageno); - -static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, +static bool SlruScanDirCbDeleteCutoff(int slru_id, + SlruPagePrecedesFunction PagePrecedes, + char *filename, int segpage, void *data); -static void SlruInternalDeleteSegment(SlruCtl ctl, int segno); - -/* - * Initialization of shared memory - */ - -Size -SimpleLruShmemSize(int nslots, int nlsns) -{ 
- Size sz; - - /* we assume nslots isn't so large as to risk overflow */ - sz = MAXALIGN(sizeof(SlruSharedData)); - sz += MAXALIGN(nslots * sizeof(char *)); /* page_buffer[] */ - sz += MAXALIGN(nslots * sizeof(SlruPageStatus)); /* page_status[] */ - sz += MAXALIGN(nslots * sizeof(bool)); /* page_dirty[] */ - sz += MAXALIGN(nslots * sizeof(int)); /* page_number[] */ - sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */ - sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */ - - if (nlsns > 0) - sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */ - - return BUFFERALIGN(sz) + BLCKSZ * nslots; -} - -/* - * Initialize, or attach to, a simple LRU cache in shared memory. - * - * ctl: address of local (unshared) control structure. - * name: name of SLRU. (This is user-visible, pick with care!) - * nslots: number of page slots to use. - * nlsns: number of LSN groups per page (set to zero if not relevant). - * ctllock: LWLock to use to control access to the shared control structure. - * subdir: PGDATA-relative subdirectory that will contain the files. - * tranche_id: LWLock tranche ID to use for the SLRU's per-buffer LWLocks. 
- * sync_handler: which set of functions to use to handle sync requests - */ -void -SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, - LWLock *ctllock, const char *subdir, int tranche_id, - SyncRequestHandler sync_handler) -{ - SlruShared shared; - bool found; - - shared = (SlruShared) ShmemInitStruct(name, - SimpleLruShmemSize(nslots, nlsns), - &found); - - if (!IsUnderPostmaster) - { - /* Initialize locks and shared memory area */ - char *ptr; - Size offset; - int slotno; - - Assert(!found); - - memset(shared, 0, sizeof(SlruSharedData)); - - shared->ControlLock = ctllock; - - shared->num_slots = nslots; - shared->lsn_groups_per_page = nlsns; - - shared->cur_lru_count = 0; - - /* shared->latest_page_number will be set later */ - - shared->slru_stats_idx = pgstat_get_slru_index(name); - - ptr = (char *) shared; - offset = MAXALIGN(sizeof(SlruSharedData)); - shared->page_buffer = (char **) (ptr + offset); - offset += MAXALIGN(nslots * sizeof(char *)); - shared->page_status = (SlruPageStatus *) (ptr + offset); - offset += MAXALIGN(nslots * sizeof(SlruPageStatus)); - shared->page_dirty = (bool *) (ptr + offset); - offset += MAXALIGN(nslots * sizeof(bool)); - shared->page_number = (int *) (ptr + offset); - offset += MAXALIGN(nslots * sizeof(int)); - shared->page_lru_count = (int *) (ptr + offset); - offset += MAXALIGN(nslots * sizeof(int)); - - /* Initialize LWLocks */ - shared->buffer_locks = (LWLockPadded *) (ptr + offset); - offset += MAXALIGN(nslots * sizeof(LWLockPadded)); - - if (nlsns > 0) - { - shared->group_lsn = (XLogRecPtr *) (ptr + offset); - offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); - } - - ptr += BUFFERALIGN(offset); - for (slotno = 0; slotno < nslots; slotno++) - { - LWLockInitialize(&shared->buffer_locks[slotno].lock, - tranche_id); - - shared->page_buffer[slotno] = ptr; - shared->page_status[slotno] = SLRU_PAGE_EMPTY; - shared->page_dirty[slotno] = false; - shared->page_lru_count[slotno] = 0; - ptr += BLCKSZ; - } - - 
/* Should fit to estimated shmem size */ - Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns)); - } - else - Assert(found); - - /* - * Initialize the unshared control struct, including directory path. We - * assume caller set PagePrecedes. - */ - ctl->shared = shared; - ctl->sync_handler = sync_handler; - strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir)); -} - -/* - * Initialize (or reinitialize) a page to zeroes. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. - */ -int -SimpleLruZeroPage(SlruCtl ctl, int pageno) -{ - SlruShared shared = ctl->shared; - int slotno; - - /* Find a suitable buffer slot for the page */ - slotno = SlruSelectLRUPage(ctl, pageno); - Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY || - (shared->page_status[slotno] == SLRU_PAGE_VALID && - !shared->page_dirty[slotno]) || - shared->page_number[slotno] == pageno); - - /* Mark the slot as containing this page */ - shared->page_number[slotno] = pageno; - shared->page_status[slotno] = SLRU_PAGE_VALID; - shared->page_dirty[slotno] = true; - SlruRecentlyUsed(shared, slotno); - - /* Set the buffer to zeroes */ - MemSet(shared->page_buffer[slotno], 0, BLCKSZ); - - /* Set the LSNs for this new page to zero */ - SimpleLruZeroLSNs(ctl, slotno); - - /* Assume this page is now the latest active page */ - shared->latest_page_number = pageno; - - /* update the stats counter of zeroed pages */ - pgstat_count_slru_page_zeroed(shared->slru_stats_idx); - - return slotno; -} - -/* - * Zero all the LSNs we store for this slru page. - * - * This should be called each time we create a new page, and each time we read - * in a page from disk into an existing buffer. (Such an old page cannot - * have any interesting LSNs, since we'd have flushed them before writing - * the page in the first place.) - * - * This assumes that InvalidXLogRecPtr is bitwise-all-0. 
- */ -static void -SimpleLruZeroLSNs(SlruCtl ctl, int slotno) -{ - SlruShared shared = ctl->shared; - - if (shared->lsn_groups_per_page > 0) - MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0, - shared->lsn_groups_per_page * sizeof(XLogRecPtr)); -} - -/* - * Wait for any active I/O on a page slot to finish. (This does not - * guarantee that new I/O hasn't been started before we return, though. - * In fact the slot might not even contain the same page anymore.) - * - * Control lock must be held at entry, and will be held at exit. - */ -static void -SimpleLruWaitIO(SlruCtl ctl, int slotno) -{ - SlruShared shared = ctl->shared; - - /* See notes at top of file */ - LWLockRelease(shared->ControlLock); - LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED); - LWLockRelease(&shared->buffer_locks[slotno].lock); - LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); - - /* - * If the slot is still in an io-in-progress state, then either someone - * already started a new I/O on the slot, or a previous I/O failed and - * neglected to reset the page state. That shouldn't happen, really, but - * it seems worth a few extra cycles to check and recover from it. We can - * cheaply test for failure by seeing if the buffer lock is still held (we - * assume that transaction abort would release the lock). - */ - if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS || - shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS) - { - if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED)) - { - /* indeed, the I/O must have failed */ - if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS) - shared->page_status[slotno] = SLRU_PAGE_EMPTY; - else /* write_in_progress */ - { - shared->page_status[slotno] = SLRU_PAGE_VALID; - shared->page_dirty[slotno] = true; - } - LWLockRelease(&shared->buffer_locks[slotno].lock); - } - } -} - -/* - * Find a page in a shared buffer, reading it in if necessary. 
- * The page number must correspond to an already-initialized page. - * - * If write_ok is true then it is OK to return a page that is in - * WRITE_IN_PROGRESS state; it is the caller's responsibility to be sure - * that modification of the page is safe. If write_ok is false then we - * will not return the page until it is not undergoing active I/O. - * - * The passed-in xid is used only for error reporting, and may be - * InvalidTransactionId if no specific xid is associated with the action. - * - * Return value is the shared-buffer slot number now holding the page. - * The buffer's LRU access info is updated. - * - * Control lock must be held at entry, and will be held at exit. - */ -int -SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, - TransactionId xid) -{ - SlruShared shared = ctl->shared; - - /* Outer loop handles restart if we must wait for someone else's I/O */ - for (;;) - { - int slotno; - bool ok; - - /* See if page already is in memory; if not, pick victim slot */ - slotno = SlruSelectLRUPage(ctl, pageno); - - /* Did we find the page in memory? */ - if (shared->page_number[slotno] == pageno && - shared->page_status[slotno] != SLRU_PAGE_EMPTY) - { - /* - * If page is still being read in, we must wait for I/O. Likewise - * if the page is being written and the caller said that's not OK. 
- */ - if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS || - (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS && - !write_ok)) - { - SimpleLruWaitIO(ctl, slotno); - /* Now we must recheck state from the top */ - continue; - } - /* Otherwise, it's ready to use */ - SlruRecentlyUsed(shared, slotno); - - /* update the stats counter of pages found in the SLRU */ - pgstat_count_slru_page_hit(shared->slru_stats_idx); - - return slotno; - } - - /* We found no match; assert we selected a freeable slot */ - Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY || - (shared->page_status[slotno] == SLRU_PAGE_VALID && - !shared->page_dirty[slotno])); - - /* Mark the slot read-busy */ - shared->page_number[slotno] = pageno; - shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS; - shared->page_dirty[slotno] = false; - - /* Acquire per-buffer lock (cannot deadlock, see notes at top) */ - LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE); - - /* Release control lock while doing I/O */ - LWLockRelease(shared->ControlLock); - - /* Do the read */ - ok = SlruPhysicalReadPage(ctl, pageno, slotno); - - /* Set the LSNs for this newly read-in page to zero */ - SimpleLruZeroLSNs(ctl, slotno); - - /* Re-acquire control lock and update page state */ - LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); - - Assert(shared->page_number[slotno] == pageno && - shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS && - !shared->page_dirty[slotno]); - - shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY; - - LWLockRelease(&shared->buffer_locks[slotno].lock); - - /* Now it's okay to ereport if we failed */ - if (!ok) - SlruReportIOError(ctl, pageno, xid); - - SlruRecentlyUsed(shared, slotno); - - /* update the stats counter of pages not found in SLRU */ - pgstat_count_slru_page_read(shared->slru_stats_idx); - - return slotno; - } -} - -/* - * Find a page in a shared buffer, reading it in if necessary. 
- * The page number must correspond to an already-initialized page. - * The caller must intend only read-only access to the page. - * - * The passed-in xid is used only for error reporting, and may be - * InvalidTransactionId if no specific xid is associated with the action. - * - * Return value is the shared-buffer slot number now holding the page. - * The buffer's LRU access info is updated. - * - * Control lock must NOT be held at entry, but will be held at exit. - * It is unspecified whether the lock will be shared or exclusive. - */ -int -SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid) -{ - SlruShared shared = ctl->shared; - int slotno; - - /* Try to find the page while holding only shared lock */ - LWLockAcquire(shared->ControlLock, LW_SHARED); - - /* See if page is already in a buffer */ - for (slotno = 0; slotno < shared->num_slots; slotno++) - { - if (shared->page_number[slotno] == pageno && - shared->page_status[slotno] != SLRU_PAGE_EMPTY && - shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS) - { - /* See comments for SlruRecentlyUsed macro */ - SlruRecentlyUsed(shared, slotno); - - /* update the stats counter of pages found in the SLRU */ - pgstat_count_slru_page_hit(shared->slru_stats_idx); - - return slotno; - } - } - - /* No luck, so switch to normal exclusive lock and do regular read */ - LWLockRelease(shared->ControlLock); - LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); - - return SimpleLruReadPage(ctl, pageno, true, xid); -} - -/* - * Write a page from a shared buffer, if necessary. - * Does nothing if the specified slot is not dirty. - * - * NOTE: only one write attempt is made here. Hence, it is possible that - * the page is still dirty at exit (if someone else re-dirtied it during - * the write). However, we *do* attempt a fresh write even if the page - * is already being written; this is for checkpoints. - * - * Control lock must be held at entry, and will be held at exit. 
- */ -static void -SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata) -{ - SlruShared shared = ctl->shared; - int pageno = shared->page_number[slotno]; - bool ok; - - /* If a write is in progress, wait for it to finish */ - while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS && - shared->page_number[slotno] == pageno) - { - SimpleLruWaitIO(ctl, slotno); - } - - /* - * Do nothing if page is not dirty, or if buffer no longer contains the - * same page we were called for. - */ - if (!shared->page_dirty[slotno] || - shared->page_status[slotno] != SLRU_PAGE_VALID || - shared->page_number[slotno] != pageno) - return; - - /* - * Mark the slot write-busy, and clear the dirtybit. After this point, a - * transaction status update on this page will mark it dirty again. - */ - shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS; - shared->page_dirty[slotno] = false; - - /* Acquire per-buffer lock (cannot deadlock, see notes at top) */ - LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE); - - /* Release control lock while doing I/O */ - LWLockRelease(shared->ControlLock); - - /* Do the write */ - ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata); - - /* If we failed, and we're in a flush, better close the files */ - if (!ok && fdata) - { - int i; - - for (i = 0; i < fdata->num_files; i++) - CloseTransientFile(fdata->fd[i]); - } - - /* Re-acquire control lock and update page state */ - LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); - - Assert(shared->page_number[slotno] == pageno && - shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS); - - /* If we failed to write, mark the page dirty again */ - if (!ok) - shared->page_dirty[slotno] = true; - - shared->page_status[slotno] = SLRU_PAGE_VALID; - - LWLockRelease(&shared->buffer_locks[slotno].lock); - - /* Now it's okay to ereport if we failed */ - if (!ok) - SlruReportIOError(ctl, pageno, InvalidTransactionId); - - /* If part of a checkpoint, count this as a buffer 
written. */ - if (fdata) - CheckpointStats.ckpt_bufs_written++; -} - -/* - * Wrapper of SlruInternalWritePage, for external callers. - * fdata is always passed a NULL here. - */ -void -SimpleLruWritePage(SlruCtl ctl, int slotno) -{ - SlruInternalWritePage(ctl, slotno, NULL); -} +static void SlruInternalDeleteSegment(int slru_id, int segno); /* * Return whether the given page exists on disk. @@ -624,592 +64,24 @@ SimpleLruWritePage(SlruCtl ctl, int slotno) * large enough to contain the given page. */ bool -SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno) +SimpleLruDoesPhysicalPageExist(int slru_id, int pageno) { - int segno = pageno / SLRU_PAGES_PER_SEGMENT; - int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; - int offset = rpageno * BLCKSZ; - char path[MAXPGPATH]; - int fd; - bool result; - off_t endpos; - - /* update the stats counter of checked pages */ - pgstat_count_slru_page_exists(ctl->shared->slru_stats_idx); - - SlruFileName(ctl, path, segno); - - fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); - if (fd < 0) - { - /* expected: file doesn't exist */ - if (errno == ENOENT) - return false; - - /* report error normally */ - slru_errcause = SLRU_OPEN_FAILED; - slru_errno = errno; - SlruReportIOError(ctl, pageno, 0); - } - - if ((endpos = lseek(fd, 0, SEEK_END)) < 0) - { - slru_errcause = SLRU_SEEK_FAILED; - slru_errno = errno; - SlruReportIOError(ctl, pageno, 0); - } - - result = endpos >= (off_t) (offset + BLCKSZ); - - if (CloseTransientFile(fd) != 0) - { - slru_errcause = SLRU_CLOSE_FAILED; - slru_errno = errno; - return false; - } - - return result; -} - -/* - * Physical read of a (previously existing) page into a buffer slot - * - * On failure, we cannot just ereport(ERROR) since caller has put state in - * shared memory that must be undone. So, we return false and save enough - * info in static variables to let SlruReportIOError make the report. - * - * For now, assume it's not worth keeping a file pointer open across - * read/write operations. 
We could cache one virtual file pointer ... - */ -static bool -SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) -{ - SlruShared shared = ctl->shared; - int segno = pageno / SLRU_PAGES_PER_SEGMENT; - int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; - off_t offset = rpageno * BLCKSZ; - char path[MAXPGPATH]; - int fd; - - SlruFileName(ctl, path, segno); - - /* - * In a crash-and-restart situation, it's possible for us to receive - * commands to set the commit status of transactions whose bits are in - * already-truncated segments of the commit log (see notes in - * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case - * where the file doesn't exist, and return zeroes instead. - */ - fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); - if (fd < 0) - { - if (errno != ENOENT || !InRecovery) - { - slru_errcause = SLRU_OPEN_FAILED; - slru_errno = errno; - return false; - } - - ereport(LOG, - (errmsg("file \"%s\" doesn't exist, reading as zeroes", - path))); - MemSet(shared->page_buffer[slotno], 0, BLCKSZ); - return true; - } - - errno = 0; - pgstat_report_wait_start(WAIT_EVENT_SLRU_READ); - if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ) - { - pgstat_report_wait_end(); - slru_errcause = SLRU_READ_FAILED; - slru_errno = errno; - CloseTransientFile(fd); - return false; - } - pgstat_report_wait_end(); - - if (CloseTransientFile(fd) != 0) - { - slru_errcause = SLRU_CLOSE_FAILED; - slru_errno = errno; - return false; - } - - return true; -} - -/* - * Physical write of a page from a buffer slot - * - * On failure, we cannot just ereport(ERROR) since caller has put state in - * shared memory that must be undone. So, we return false and save enough - * info in static variables to let SlruReportIOError make the report. - * - * For now, assume it's not worth keeping a file pointer open across - * independent read/write operations. We do batch operations during - * SimpleLruWriteAll, though. 
- * - * fdata is NULL for a standalone write, pointer to open-file info during - * SimpleLruWriteAll. - */ -static bool -SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata) -{ - SlruShared shared = ctl->shared; int segno = pageno / SLRU_PAGES_PER_SEGMENT; int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; off_t offset = rpageno * BLCKSZ; - char path[MAXPGPATH]; - int fd = -1; - - /* update the stats counter of written pages */ - pgstat_count_slru_page_written(shared->slru_stats_idx); - - /* - * Honor the write-WAL-before-data rule, if appropriate, so that we do not - * write out data before associated WAL records. This is the same action - * performed during FlushBuffer() in the main buffer manager. - */ - if (shared->group_lsn != NULL) - { - /* - * We must determine the largest async-commit LSN for the page. This - * is a bit tedious, but since this entire function is a slow path - * anyway, it seems better to do this here than to maintain a per-page - * LSN variable (which'd need an extra comparison in the - * transaction-commit path). - */ - XLogRecPtr max_lsn; - int lsnindex, - lsnoff; - - lsnindex = slotno * shared->lsn_groups_per_page; - max_lsn = shared->group_lsn[lsnindex++]; - for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++) - { - XLogRecPtr this_lsn = shared->group_lsn[lsnindex++]; - - if (max_lsn < this_lsn) - max_lsn = this_lsn; - } - - if (!XLogRecPtrIsInvalid(max_lsn)) - { - /* - * As noted above, elog(ERROR) is not acceptable here, so if - * XLogFlush were to fail, we must PANIC. This isn't much of a - * restriction because XLogFlush is just about all critical - * section anyway, but let's make sure. - */ - START_CRIT_SECTION(); - XLogFlush(max_lsn); - END_CRIT_SECTION(); - } - } - - /* - * During a WriteAll, we may already have the desired file open. 
- */ - if (fdata) - { - int i; - - for (i = 0; i < fdata->num_files; i++) - { - if (fdata->segno[i] == segno) - { - fd = fdata->fd[i]; - break; - } - } - } - - if (fd < 0) - { - /* - * If the file doesn't already exist, we should create it. It is - * possible for this to need to happen when writing a page that's not - * first in its segment; we assume the OS can cope with that. (Note: - * it might seem that it'd be okay to create files only when - * SimpleLruZeroPage is called for the first page of a segment. - * However, if after a crash and restart the REDO logic elects to - * replay the log from a checkpoint before the latest one, then it's - * possible that we will get commands to set transaction status of - * transactions that have already been truncated from the commit log. - * Easiest way to deal with that is to accept references to - * nonexistent files here and in SlruPhysicalReadPage.) - * - * Note: it is possible for more than one backend to be executing this - * code simultaneously for different pages of the same file. Hence, - * don't use O_EXCL or O_TRUNC or anything like that. - */ - SlruFileName(ctl, path, segno); - fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); - if (fd < 0) - { - slru_errcause = SLRU_OPEN_FAILED; - slru_errno = errno; - return false; - } - - if (fdata) - { - if (fdata->num_files < MAX_WRITEALL_BUFFERS) - { - fdata->fd[fdata->num_files] = fd; - fdata->segno[fdata->num_files] = segno; - fdata->num_files++; - } - else - { - /* - * In the unlikely event that we exceed MAX_FLUSH_BUFFERS, - * fall back to treating it as a standalone write. 
- */ - fdata = NULL; - } - } - } - - errno = 0; - pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); - if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ) - { - pgstat_report_wait_end(); - /* if write didn't set errno, assume problem is no disk space */ - if (errno == 0) - errno = ENOSPC; - slru_errcause = SLRU_WRITE_FAILED; - slru_errno = errno; - if (!fdata) - CloseTransientFile(fd); - return false; - } - pgstat_report_wait_end(); - - /* Queue up a sync request for the checkpointer. */ - if (ctl->sync_handler != SYNC_HANDLER_NONE) - { - FileTag tag; - - INIT_SLRUFILETAG(tag, ctl->sync_handler, segno); - if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false)) - { - /* No space to enqueue sync request. Do it synchronously. */ - pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC); - if (pg_fsync(fd) != 0) - { - pgstat_report_wait_end(); - slru_errcause = SLRU_FSYNC_FAILED; - slru_errno = errno; - CloseTransientFile(fd); - return false; - } - pgstat_report_wait_end(); - } - } - - /* Close file, unless part of flush request. */ - if (!fdata) - { - if (CloseTransientFile(fd) != 0) - { - slru_errcause = SLRU_CLOSE_FAILED; - slru_errno = errno; - return false; - } - } - - return true; -} - -/* - * Issue the error message after failure of SlruPhysicalReadPage or - * SlruPhysicalWritePage. Call this after cleaning up shared-memory state. 
- */ -static void -SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid) -{ - int segno = pageno / SLRU_PAGES_PER_SEGMENT; - int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; - int offset = rpageno * BLCKSZ; - char path[MAXPGPATH]; - - SlruFileName(ctl, path, segno); - errno = slru_errno; - switch (slru_errcause) - { - case SLRU_OPEN_FAILED: - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not access status of transaction %u", xid), - errdetail("Could not open file \"%s\": %m.", path))); - break; - case SLRU_SEEK_FAILED: - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not access status of transaction %u", xid), - errdetail("Could not seek in file \"%s\" to offset %d: %m.", - path, offset))); - break; - case SLRU_READ_FAILED: - if (errno) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not access status of transaction %u", xid), - errdetail("Could not read from file \"%s\" at offset %d: %m.", - path, offset))); - else - ereport(ERROR, - (errmsg("could not access status of transaction %u", xid), - errdetail("Could not read from file \"%s\" at offset %d: read too few bytes.", path, offset))); - break; - case SLRU_WRITE_FAILED: - if (errno) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not access status of transaction %u", xid), - errdetail("Could not write to file \"%s\" at offset %d: %m.", - path, offset))); - else - ereport(ERROR, - (errmsg("could not access status of transaction %u", xid), - errdetail("Could not write to file \"%s\" at offset %d: wrote too few bytes.", - path, offset))); - break; - case SLRU_FSYNC_FAILED: - ereport(data_sync_elevel(ERROR), - (errcode_for_file_access(), - errmsg("could not access status of transaction %u", xid), - errdetail("Could not fsync file \"%s\": %m.", - path))); - break; - case SLRU_CLOSE_FAILED: - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not access status of transaction %u", xid), - errdetail("Could not close file \"%s\": %m.", - path))); 
- break; - default: - /* can't get here, we trust */ - elog(ERROR, "unrecognized SimpleLru error cause: %d", - (int) slru_errcause); - break; - } -} - -/* - * Select the slot to re-use when we need a free slot. - * - * The target page number is passed because we need to consider the - * possibility that some other process reads in the target page while - * we are doing I/O to free a slot. Hence, check or recheck to see if - * any slot already holds the target page, and return that slot if so. - * Thus, the returned slot is *either* a slot already holding the pageno - * (could be any state except EMPTY), *or* a freeable slot (state EMPTY - * or CLEAN). - * - * Control lock must be held at entry, and will be held at exit. - */ -static int -SlruSelectLRUPage(SlruCtl ctl, int pageno) -{ - SlruShared shared = ctl->shared; - - /* Outer loop handles restart after I/O */ - for (;;) - { - int slotno; - int cur_count; - int bestvalidslot = 0; /* keep compiler quiet */ - int best_valid_delta = -1; - int best_valid_page_number = 0; /* keep compiler quiet */ - int bestinvalidslot = 0; /* keep compiler quiet */ - int best_invalid_delta = -1; - int best_invalid_page_number = 0; /* keep compiler quiet */ - - /* See if page already has a buffer assigned */ - for (slotno = 0; slotno < shared->num_slots; slotno++) - { - if (shared->page_number[slotno] == pageno && - shared->page_status[slotno] != SLRU_PAGE_EMPTY) - return slotno; - } - - /* - * If we find any EMPTY slot, just select that one. Else choose a - * victim page to replace. We normally take the least recently used - * valid page, but we will never take the slot containing - * latest_page_number, even if it appears least recently used. We - * will select a slot that is already I/O busy only if there is no - * other choice: a read-busy slot will not be least recently used once - * the read finishes, and waiting for an I/O on a write-busy slot is - * inferior to just picking some other slot. 
Testing shows the slot - * we pick instead will often be clean, allowing us to begin a read at - * once. - * - * Normally the page_lru_count values will all be different and so - * there will be a well-defined LRU page. But since we allow - * concurrent execution of SlruRecentlyUsed() within - * SimpleLruReadPage_ReadOnly(), it is possible that multiple pages - * acquire the same lru_count values. In that case we break ties by - * choosing the furthest-back page. - * - * Notice that this next line forcibly advances cur_lru_count to a - * value that is certainly beyond any value that will be in the - * page_lru_count array after the loop finishes. This ensures that - * the next execution of SlruRecentlyUsed will mark the page newly - * used, even if it's for a page that has the current counter value. - * That gets us back on the path to having good data when there are - * multiple pages with the same lru_count. - */ - cur_count = (shared->cur_lru_count)++; - for (slotno = 0; slotno < shared->num_slots; slotno++) - { - int this_delta; - int this_page_number; - - if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) - return slotno; - this_delta = cur_count - shared->page_lru_count[slotno]; - if (this_delta < 0) - { - /* - * Clean up in case shared updates have caused cur_count - * increments to get "lost". We back off the page counts, - * rather than trying to increase cur_count, to avoid any - * question of infinite loops or failure in the presence of - * wrapped-around counts. 
- */ - shared->page_lru_count[slotno] = cur_count; - this_delta = 0; - } - this_page_number = shared->page_number[slotno]; - if (this_page_number == shared->latest_page_number) - continue; - if (shared->page_status[slotno] == SLRU_PAGE_VALID) - { - if (this_delta > best_valid_delta || - (this_delta == best_valid_delta && - ctl->PagePrecedes(this_page_number, - best_valid_page_number))) - { - bestvalidslot = slotno; - best_valid_delta = this_delta; - best_valid_page_number = this_page_number; - } - } - else - { - if (this_delta > best_invalid_delta || - (this_delta == best_invalid_delta && - ctl->PagePrecedes(this_page_number, - best_invalid_page_number))) - { - bestinvalidslot = slotno; - best_invalid_delta = this_delta; - best_invalid_page_number = this_page_number; - } - } - } - - /* - * If all pages (except possibly the latest one) are I/O busy, we'll - * have to wait for an I/O to complete and then retry. In that - * unhappy case, we choose to wait for the I/O on the least recently - * used slot, on the assumption that it was likely initiated first of - * all the I/Os in progress and may therefore finish first. - */ - if (best_valid_delta < 0) - { - SimpleLruWaitIO(ctl, bestinvalidslot); - continue; - } - - /* - * If the selected page is clean, we're set. - */ - if (!shared->page_dirty[bestvalidslot]) - return bestvalidslot; - - /* - * Write the page. - */ - SlruInternalWritePage(ctl, bestvalidslot, NULL); - - /* - * Now loop back and try again. This is the easiest way of dealing - * with corner cases such as the victim page being re-dirtied while we - * wrote it. - */ - } -} - -/* - * Write dirty pages to disk during checkpoint or database shutdown. Flushing - * is deferred until the next call to ProcessSyncRequests(), though we do fsync - * the containing directory here to make sure that newly created directory - * entries are on disk. 
- */ -void -SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied) -{ - SlruShared shared = ctl->shared; - SlruWriteAllData fdata; - int slotno; - int pageno = 0; - int i; - bool ok; - - /* update the stats counter of flushes */ - pgstat_count_slru_flush(shared->slru_stats_idx); - - /* - * Find and write dirty pages - */ - fdata.num_files = 0; + off_t size; + RelFileLocator rlocator = SlruRelFileLocator(slru_id, segno); + SMgrFileHandle sfile = smgropen(rlocator, InvalidBackendId, MAIN_FORKNUM); - LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); - - for (slotno = 0; slotno < shared->num_slots; slotno++) - { - SlruInternalWritePage(ctl, slotno, &fdata); - - /* - * In some places (e.g. checkpoints), we cannot assert that the slot - * is clean now, since another process might have re-dirtied it - * already. That's okay. - */ - Assert(allow_redirtied || - shared->page_status[slotno] == SLRU_PAGE_EMPTY || - (shared->page_status[slotno] == SLRU_PAGE_VALID && - !shared->page_dirty[slotno])); - } - - LWLockRelease(shared->ControlLock); + /* update the stats counter of checked pages */ + pgstat_count_slru_page_exists(slru_id); - /* - * Now close any files that were open - */ - ok = true; - for (i = 0; i < fdata.num_files; i++) - { - if (CloseTransientFile(fdata.fd[i]) != 0) - { - slru_errcause = SLRU_CLOSE_FAILED; - slru_errno = errno; - pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT; - ok = false; - } - } - if (!ok) - SlruReportIOError(ctl, pageno, InvalidTransactionId); + if (smgrexists(sfile)) + size = smgrnblocks(sfile); + else + size = 0; - /* Ensure that directory entries for new files are on disk. */ - if (ctl->sync_handler != SYNC_HANDLER_NONE) - fsync_fname(ctl->Dir, true); + return size * BLCKSZ >= offset + BLCKSZ; /* size is in blocks */ } /* @@ -1224,75 +96,14 @@ SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied) * after it has accrued freshly-written data.
*/ void -SimpleLruTruncate(SlruCtl ctl, int cutoffPage) +SimpleLruTruncate(int slru_id, SlruPagePrecedesFunction PagePrecedes, int cutoffPage) { - SlruShared shared = ctl->shared; - int slotno; - /* update the stats counter of truncates */ - pgstat_count_slru_truncate(shared->slru_stats_idx); - - /* - * Scan shared memory and remove any pages preceding the cutoff page, to - * ensure we won't rewrite them later. (Since this is normally called in - * or just after a checkpoint, any dirty pages should have been flushed - * already ... we're just being extra careful here.) - */ - LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); - -restart: - - /* - * While we are holding the lock, make an important safety check: the - * current endpoint page must not be eligible for removal. - */ - if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage)) - { - LWLockRelease(shared->ControlLock); - ereport(LOG, - (errmsg("could not truncate directory \"%s\": apparent wraparound", - ctl->Dir))); - return; - } - - for (slotno = 0; slotno < shared->num_slots; slotno++) - { - if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) - continue; - if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage)) - continue; - - /* - * If page is clean, just change state to EMPTY (expected case). - */ - if (shared->page_status[slotno] == SLRU_PAGE_VALID && - !shared->page_dirty[slotno]) - { - shared->page_status[slotno] = SLRU_PAGE_EMPTY; - continue; - } - - /* - * Hmm, we have (or may have) I/O operations acting on the page, so - * we've got to wait for them to finish and then start again. This is - * the same logic as in SlruSelectLRUPage. (XXX if page is dirty, - * wouldn't it be OK to just discard it without writing it? - * SlruMayDeleteSegment() uses a stricter qualification, so we might - * not delete this page in the end; even if we don't delete it, we - * won't have cause to read its data again. For now, keep the logic - * the same as it was.) 
- */ - if (shared->page_status[slotno] == SLRU_PAGE_VALID) - SlruInternalWritePage(ctl, slotno, NULL); - else - SimpleLruWaitIO(ctl, slotno); - goto restart; - } - - LWLockRelease(shared->ControlLock); + pgstat_count_slru_truncate(slru_id); /* Now we can remove the old segment(s) */ - (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage); + (void) SlruScanDirectory(slru_id, PagePrecedes, SlruScanDirCbDeleteCutoff, + &cutoffPage); } /* @@ -1302,77 +113,22 @@ restart: * they either can't yet contain anything, or have already been cleaned out. */ static void -SlruInternalDeleteSegment(SlruCtl ctl, int segno) +SlruInternalDeleteSegment(int slru_id, int segno) { - char path[MAXPGPATH]; - - /* Forget any fsync requests queued for this segment. */ - if (ctl->sync_handler != SYNC_HANDLER_NONE) - { - FileTag tag; - - INIT_SLRUFILETAG(tag, ctl->sync_handler, segno); - RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true); - } + RelFileLocator rlocator = SlruRelFileLocator(slru_id, segno); + SMgrFileHandle sfile = smgropen(rlocator, InvalidBackendId, MAIN_FORKNUM); /* Unlink the file. */ - SlruFileName(ctl, path, segno); - ereport(DEBUG2, (errmsg_internal("removing file \"%s\"", path))); - unlink(path); + smgrunlink(sfile, false); } /* * Delete an individual SLRU segment, identified by the segment number. */ void -SlruDeleteSegment(SlruCtl ctl, int segno) +SlruDeleteSegment(int slru_id, int segno) { - SlruShared shared = ctl->shared; - int slotno; - bool did_write; - - /* Clean out any possibly existing references to the segment. */ - LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); -restart: - did_write = false; - for (slotno = 0; slotno < shared->num_slots; slotno++) - { - int pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT; - - if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) - continue; - - /* not the segment we're looking for */ - if (pagesegno != segno) - continue; - - /* If page is clean, just change state to EMPTY (expected case). 
*/ - if (shared->page_status[slotno] == SLRU_PAGE_VALID && - !shared->page_dirty[slotno]) - { - shared->page_status[slotno] = SLRU_PAGE_EMPTY; - continue; - } - - /* Same logic as SimpleLruTruncate() */ - if (shared->page_status[slotno] == SLRU_PAGE_VALID) - SlruInternalWritePage(ctl, slotno, NULL); - else - SimpleLruWaitIO(ctl, slotno); - - did_write = true; - } - - /* - * Be extra careful and re-check. The IO functions release the control - * lock, so new pages could have been read in. - */ - if (did_write) - goto restart; - - SlruInternalDeleteSegment(ctl, segno); - - LWLockRelease(shared->ControlLock); + SlruInternalDeleteSegment(slru_id, segno); } /* @@ -1389,19 +145,21 @@ restart: * first>=cutoff && last>=cutoff: no; every page of this segment is too young */ static bool -SlruMayDeleteSegment(SlruCtl ctl, int segpage, int cutoffPage) +SlruMayDeleteSegment(SlruPagePrecedesFunction PagePrecedes, + int segpage, int cutoffPage) { int seg_last_page = segpage + SLRU_PAGES_PER_SEGMENT - 1; Assert(segpage % SLRU_PAGES_PER_SEGMENT == 0); - return (ctl->PagePrecedes(segpage, cutoffPage) && - ctl->PagePrecedes(seg_last_page, cutoffPage)); + return (PagePrecedes(segpage, cutoffPage) && + PagePrecedes(seg_last_page, cutoffPage)); } #ifdef USE_ASSERT_CHECKING static void -SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset) +SlruPagePrecedesTestOffset(SlruPagePrecedesFunction PagePrecedes, + int per_page, uint32 offset) { TransactionId lhs, rhs; @@ -1426,19 +184,19 @@ SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset) Assert(!TransactionIdPrecedes(rhs, lhs + 1)); Assert(!TransactionIdFollowsOrEquals(lhs, rhs)); Assert(!TransactionIdFollowsOrEquals(rhs, lhs)); - Assert(!ctl->PagePrecedes(lhs / per_page, lhs / per_page)); - Assert(!ctl->PagePrecedes(lhs / per_page, rhs / per_page)); - Assert(!ctl->PagePrecedes(rhs / per_page, lhs / per_page)); - Assert(!ctl->PagePrecedes((lhs - per_page) / per_page, rhs / per_page)); - 
Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page)); - Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page)); - Assert(ctl->PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page) + Assert(!PagePrecedes(lhs / per_page, lhs / per_page)); + Assert(!PagePrecedes(lhs / per_page, rhs / per_page)); + Assert(!PagePrecedes(rhs / per_page, lhs / per_page)); + Assert(!PagePrecedes((lhs - per_page) / per_page, rhs / per_page)); + Assert(PagePrecedes(rhs / per_page, (lhs - 3 * per_page) / per_page)); + Assert(PagePrecedes(rhs / per_page, (lhs - 2 * per_page) / per_page)); + Assert(PagePrecedes(rhs / per_page, (lhs - 1 * per_page) / per_page) || (1U << 31) % per_page != 0); /* See CommitTsPagePrecedes() */ - Assert(ctl->PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page) + Assert(PagePrecedes((lhs + 1 * per_page) / per_page, rhs / per_page) || (1U << 31) % per_page != 0); - Assert(ctl->PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page)); - Assert(ctl->PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page)); - Assert(!ctl->PagePrecedes(rhs / per_page, (lhs + per_page) / per_page)); + Assert(PagePrecedes((lhs + 2 * per_page) / per_page, rhs / per_page)); + Assert(PagePrecedes((lhs + 3 * per_page) / per_page, rhs / per_page)); + Assert(!PagePrecedes(rhs / per_page, (lhs + per_page) / per_page)); /* * GetNewTransactionId() has assigned the last XID it can safely use, and @@ -1451,7 +209,7 @@ SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset) oldestXact = newestXact + 1; oldestXact -= 1U << 31; oldestPage = oldestXact / per_page; - Assert(!SlruMayDeleteSegment(ctl, + Assert(!SlruMayDeleteSegment(PagePrecedes, (newestPage - newestPage % SLRU_PAGES_PER_SEGMENT), oldestPage)); @@ -1467,7 +225,7 @@ SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset) oldestXact = newestXact + 1; oldestXact -= 1U << 31; oldestPage = oldestXact / per_page; - Assert(!SlruMayDeleteSegment(ctl, + 
Assert(!SlruMayDeleteSegment(PagePrecedes, (newestPage - newestPage % SLRU_PAGES_PER_SEGMENT), oldestPage)); @@ -1483,12 +241,12 @@ SlruPagePrecedesTestOffset(SlruCtl ctl, int per_page, uint32 offset) * do not apply to them.) */ void -SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page) +SlruPagePrecedesUnitTests(SlruPagePrecedesFunction PagePrecedes, int per_page) { /* Test first, middle and last entries of a page. */ - SlruPagePrecedesTestOffset(ctl, per_page, 0); - SlruPagePrecedesTestOffset(ctl, per_page, per_page / 2); - SlruPagePrecedesTestOffset(ctl, per_page, per_page - 1); + SlruPagePrecedesTestOffset(PagePrecedes, per_page, 0); + SlruPagePrecedesTestOffset(PagePrecedes, per_page, per_page / 2); + SlruPagePrecedesTestOffset(PagePrecedes, per_page, per_page - 1); } #endif @@ -1498,11 +256,12 @@ SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page) * one containing the page passed as "data". */ bool -SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int segpage, void *data) +SlruScanDirCbReportPresence(int slru_id, SlruPagePrecedesFunction PagePrecedes, + char *filename, int segpage, void *data) { int cutoffPage = *(int *) data; - if (SlruMayDeleteSegment(ctl, segpage, cutoffPage)) + if (SlruMayDeleteSegment(PagePrecedes, segpage, cutoffPage)) return true; /* found one; don't iterate any more */ return false; /* keep going */ @@ -1513,12 +272,15 @@ SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, int segpage, void *data * This callback deletes segments prior to the one passed in as "data". 
*/ static bool -SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data) +SlruScanDirCbDeleteCutoff(int slru_id, SlruPagePrecedesFunction PagePrecedes, + char *filename, int segpage, void *data) { int cutoffPage = *(int *) data; - if (SlruMayDeleteSegment(ctl, segpage, cutoffPage)) - SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT); + if (SlruMayDeleteSegment(PagePrecedes, segpage, cutoffPage)) + { + SlruDeleteSegment(slru_id, segpage / SLRU_PAGES_PER_SEGMENT); + } return false; /* keep going */ } @@ -1528,9 +290,10 @@ SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data) * This callback deletes all segments. */ bool -SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data) +SlruScanDirCbDeleteAll(int slru_id, SlruPagePrecedesFunction PagePrecedes, + char *filename, int segpage, void *data) { - SlruInternalDeleteSegment(ctl, segpage / SLRU_PAGES_PER_SEGMENT); + SlruInternalDeleteSegment(slru_id, segpage / SLRU_PAGES_PER_SEGMENT); return false; /* keep going */ } @@ -1551,16 +314,20 @@ SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data) * Note that no locking is applied. 
*/ bool -SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) +SlruScanDirectory(int slru_id, SlruPagePrecedesFunction PagePrecedes, + SlruScanCallback callback, void *data) { bool retval = false; DIR *cldir; struct dirent *clde; int segno; int segpage; + const char *path; + + path = slru_dirs[slru_id]; - cldir = AllocateDir(ctl->Dir); - while ((clde = ReadDir(cldir, ctl->Dir)) != NULL) + cldir = AllocateDir(path); + while ((clde = ReadDir(cldir, path)) != NULL) { size_t len; @@ -1573,8 +340,8 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) segpage = segno * SLRU_PAGES_PER_SEGMENT; elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s", - ctl->Dir, clde->d_name); - retval = callback(ctl, clde->d_name, segpage, data); + path, clde->d_name); + retval = callback(slru_id, PagePrecedes, clde->d_name, segpage, data); if (retval) break; } @@ -1585,29 +352,74 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) } /* - * Individual SLRUs (clog, ...) have to provide a sync.c handler function so - * that they can provide the correct "SlruCtl" (otherwise we don't know how to - * build the path), but they just forward to this common implementation that - * performs the fsync. + * Read SLRU page 'pageno' into a shared buffer. Buffer is pinned on return. + */ +Buffer +ReadSlruBuffer(int slru_id, int pageno, ReadBufferMode mode) +{ + int segno = pageno / SLRU_PAGES_PER_SEGMENT; + int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; + RelFileLocator rlocator = SlruRelFileLocator(slru_id, segno); + Buffer buffer; + bool hit; + + /* Try to avoid doing a buffer mapping table lookup for repeated access. */ + buffer = slru_recent_buffers[slru_id].recent_buffer; + if (slru_recent_buffers[slru_id].pageno == pageno && + BufferIsValid(buffer) && + ReadRecentBuffer(rlocator, MAIN_FORKNUM, rpageno, buffer)) + { + pgstat_count_slru_page_hit(slru_id); + return buffer; + } + + /* Regular lookup.
*/ + buffer = ReadBufferWithoutRelcacheWithHit(rlocator, MAIN_FORKNUM, rpageno, + mode, NULL, true, &hit); + + /* Remember where this page is for next time. */ + slru_recent_buffers[slru_id].pageno = pageno; + slru_recent_buffers[slru_id].recent_buffer = buffer; + + if (hit) + pgstat_count_slru_page_hit(slru_id); + + return buffer; +} + +/* + * Zero-initialize a buffer. Buffer is pinned and exclusively locked on return. */ -int -SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path) +Buffer +ZeroSlruBuffer(int slru_id, int pageno) { - int fd; - int save_errno; - int result; + int segno = pageno / SLRU_PAGES_PER_SEGMENT; + int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; + RelFileLocator rlocator = SlruRelFileLocator(slru_id, segno); + Buffer buffer; + SMgrFileHandle sfile; - SlruFileName(ctl, path, ftag->segno); + sfile = smgropen(rlocator, InvalidBackendId, MAIN_FORKNUM); + if (!smgrexists(sfile)) + smgrcreate(sfile, false); + + buffer = ReadBufferWithoutRelcache(rlocator, MAIN_FORKNUM, rpageno, RBM_ZERO_AND_LOCK, NULL, true); - fd = OpenTransientFile(path, O_RDWR | PG_BINARY); - if (fd < 0) - return -1; + /* Remember where this page is for next time. 
*/ + slru_recent_buffers[slru_id].pageno = pageno; + slru_recent_buffers[slru_id].recent_buffer = buffer; - result = pg_fsync(fd); - save_errno = errno; + pgstat_count_slru_page_zeroed(slru_id); - CloseTransientFile(fd); + return buffer; +} + +bool +ProbeSlruBuffer(int slru_id, int pageno) +{ + int segno = pageno / SLRU_PAGES_PER_SEGMENT; + int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; + RelFileLocator rlocator = SlruRelFileLocator(slru_id, segno); - errno = save_errno; - return result; + return BufferProbe(rlocator, MAIN_FORKNUM, rpageno); } diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index 66d35481552..47e00c18766 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -32,6 +32,7 @@ #include "access/subtrans.h" #include "access/transam.h" #include "pg_trace.h" +#include "storage/bufmgr.h" #include "utils/snapmgr.h" @@ -49,21 +50,13 @@ */ /* We need four bytes per xact */ -#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId)) +#define SUBTRANS_XACTS_PER_PAGE ((BLCKSZ - SizeOfPageHeaderData) / sizeof(TransactionId)) #define TransactionIdToPage(xid) ((xid) / (TransactionId) SUBTRANS_XACTS_PER_PAGE) #define TransactionIdToEntry(xid) ((xid) % (TransactionId) SUBTRANS_XACTS_PER_PAGE) -/* - * Link to shared-memory data structures for SUBTRANS control - */ -static SlruCtlData SubTransCtlData; - -#define SubTransCtl (&SubTransCtlData) - - -static int ZeroSUBTRANSPage(int pageno); +static Buffer ZeroSUBTRANSPage(int pageno); static bool SubTransPagePrecedes(int page1, int page2); @@ -75,16 +68,15 @@ SubTransSetParent(TransactionId xid, TransactionId parent) { int pageno = TransactionIdToPage(xid); int entryno = TransactionIdToEntry(xid); - int slotno; TransactionId *ptr; + Buffer buffer; Assert(TransactionIdIsValid(parent)); Assert(TransactionIdFollows(xid, parent)); - LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); - - slotno = SimpleLruReadPage(SubTransCtl, pageno, true, 
xid); - ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno]; + buffer = ReadSlruBuffer(SLRU_SUBTRANS_ID, pageno, RBM_NORMAL); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + ptr = (TransactionId *) PageGetContents(BufferGetPage(buffer)); ptr += entryno; /* @@ -96,10 +88,10 @@ SubTransSetParent(TransactionId xid, TransactionId parent) { Assert(*ptr == InvalidTransactionId); *ptr = parent; - SubTransCtl->shared->page_dirty[slotno] = true; + MarkBufferDirty(buffer); } - LWLockRelease(SubtransSLRULock); + UnlockReleaseBuffer(buffer); } /* @@ -110,9 +102,9 @@ SubTransGetParent(TransactionId xid) { int pageno = TransactionIdToPage(xid); int entryno = TransactionIdToEntry(xid); - int slotno; TransactionId *ptr; TransactionId parent; + Buffer buffer; /* Can't ask about stuff that might not be around anymore */ Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); @@ -121,15 +113,14 @@ SubTransGetParent(TransactionId xid) if (!TransactionIdIsNormal(xid)) return InvalidTransactionId; - /* lock is acquired by SimpleLruReadPage_ReadOnly */ + buffer = ReadSlruBuffer(SLRU_SUBTRANS_ID, pageno, RBM_NORMAL); - slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid); - ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno]; + ptr = (TransactionId *) PageGetContents(BufferGetPage(buffer)); ptr += entryno; parent = *ptr; - LWLockRelease(SubtransSLRULock); + ReleaseBuffer(buffer); return parent; } @@ -177,26 +168,6 @@ SubTransGetTopmostTransaction(TransactionId xid) return previousXid; } - -/* - * Initialization of shared memory for SUBTRANS - */ -Size -SUBTRANSShmemSize(void) -{ - return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0); -} - -void -SUBTRANSShmemInit(void) -{ - SubTransCtl->PagePrecedes = SubTransPagePrecedes; - SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0, - SubtransSLRULock, "pg_subtrans", - LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE); - SlruPagePrecedesUnitTests(SubTransCtl, SUBTRANS_XACTS_PER_PAGE); -} - /* * This 
func must be called ONCE on system install. It creates * the initial SUBTRANS segment. (The SUBTRANS directory is assumed to @@ -210,18 +181,16 @@ SUBTRANSShmemInit(void) void BootStrapSUBTRANS(void) { - int slotno; + Buffer buffer; - LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); + SlruPagePrecedesUnitTests(SubTransPagePrecedes, SUBTRANS_XACTS_PER_PAGE); /* Create and zero the first page of the subtrans log */ - slotno = ZeroSUBTRANSPage(0); + buffer = ZeroSUBTRANSPage(0); /* Make sure it's written out */ - SimpleLruWritePage(SubTransCtl, slotno); - Assert(!SubTransCtl->shared->page_dirty[slotno]); - - LWLockRelease(SubtransSLRULock); + FlushOneBuffer(buffer); + UnlockReleaseBuffer(buffer); } /* @@ -232,10 +201,19 @@ BootStrapSUBTRANS(void) * * Control lock must be held at entry, and will be held at exit. */ -static int +static Buffer ZeroSUBTRANSPage(int pageno) { - return SimpleLruZeroPage(SubTransCtl, pageno); + Buffer buffer; + Page page; + + buffer = ZeroSlruBuffer(SLRU_SUBTRANS_ID, pageno); + page = BufferGetPage(buffer); + PageInitSLRU(page, BLCKSZ, 0); + + MarkBufferDirty(buffer); + + return buffer; } /* @@ -258,7 +236,6 @@ StartupSUBTRANS(TransactionId oldestActiveXID) * Whenever we advance into a new page, ExtendSUBTRANS will likewise zero * the new page without regard to whatever was previously on disk. 
*/ - LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); startPage = TransactionIdToPage(oldestActiveXID); nextXid = ShmemVariableCache->nextXid; @@ -266,36 +243,15 @@ StartupSUBTRANS(TransactionId oldestActiveXID) while (startPage != endPage) { - (void) ZeroSUBTRANSPage(startPage); + UnlockReleaseBuffer(ZeroSUBTRANSPage(startPage)); startPage++; /* must account for wraparound */ if (startPage > TransactionIdToPage(MaxTransactionId)) startPage = 0; } - (void) ZeroSUBTRANSPage(startPage); - - LWLockRelease(SubtransSLRULock); -} - -/* - * Perform a checkpoint --- either during shutdown, or on-the-fly - */ -void -CheckPointSUBTRANS(void) -{ - /* - * Write dirty SUBTRANS pages to disk - * - * This is not actually necessary from a correctness point of view. We do - * it merely to improve the odds that writing of dirty pages is done by - * the checkpoint process and not by backends. - */ - TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(true); - SimpleLruWriteAll(SubTransCtl, true); - TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true); + UnlockReleaseBuffer(ZeroSUBTRANSPage(startPage)); } - /* * Make sure that SUBTRANS has room for a newly-allocated XID. 
* @@ -319,12 +275,8 @@ ExtendSUBTRANS(TransactionId newestXact) pageno = TransactionIdToPage(newestXact); - LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); - /* Zero the page */ - ZeroSUBTRANSPage(pageno); - - LWLockRelease(SubtransSLRULock); + UnlockReleaseBuffer(ZeroSUBTRANSPage(pageno)); } @@ -350,7 +302,7 @@ TruncateSUBTRANS(TransactionId oldestXact) TransactionIdRetreat(oldestXact); cutoffPage = TransactionIdToPage(oldestXact); - SimpleLruTruncate(SubTransCtl, cutoffPage); + SimpleLruTruncate(SLRU_SUBTRANS_ID, SubTransPagePrecedes, cutoffPage); } diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 8086b857b96..d792186cf97 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -67,6 +67,7 @@ #include "utils/inval.h" #include "utils/memutils.h" #include "utils/relmapper.h" +#include "utils/resowner_private.h" #include "utils/snapmgr.h" #include "utils/timeout.h" #include "utils/timestamp.h" @@ -1396,6 +1397,7 @@ RecordTransactionCommit(void) * are delaying the checkpoint a bit fuzzy, but it doesn't matter. 
*/ Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + START_CRIT_SECTION(); MyProc->delayChkptFlags |= DELAY_CHKPT_START; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index a31fbbff78d..9e2ac0d5392 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4645,6 +4645,7 @@ BootStrapXLOG(void) uint64 sysidentifier; struct timeval tv; pg_crc32c crc; + ResourceOwner resowner; /* allow ordinary WAL segment creation, like StartupXLOG() would */ SetInstallXLogFileSegmentActive(); @@ -4784,10 +4785,14 @@ BootStrapXLOG(void) WriteControlFile(); /* Bootstrap the commit log, too */ + resowner = ResourceOwnerCreate(NULL, "bootstrap resowner"); + CurrentResourceOwner = resowner; BootStrapCLOG(); BootStrapCommitTs(); BootStrapSUBTRANS(); BootStrapMultiXact(); + CurrentResourceOwner = NULL; + ResourceOwnerDelete(resowner); pfree(buffer); @@ -4796,6 +4801,8 @@ BootStrapXLOG(void) * otherwise never run the checks and GUC related initializations therein. 
*/ ReadControlFile(); + + smgrcloseall(); } static char * @@ -7004,15 +7011,11 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointSnapBuild(); CheckPointLogicalRewriteHeap(); CheckPointReplicationOrigin(); + CheckPointPredicate(); - /* Write out all dirty data in SLRUs and the main buffer pool */ + /* Write out all dirty data in the buffer pool */ TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags); CheckpointStats.ckpt_write_t = GetCurrentTimestamp(); - CheckPointCLOG(); - CheckPointCommitTs(); - CheckPointSUBTRANS(); - CheckPointMultiXact(); - CheckPointPredicate(); CheckPointBuffers(flags); /* Perform all queued up fsyncs */ diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index 0cf03945eec..e7b2a2f9272 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -652,7 +652,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) { int block_id = prefetcher->next_block_id++; DecodedBkpBlock *block = &record->blocks[block_id]; - SMgrRelation reln; + SMgrFileHandle sfile; PrefetchBufferResult result; if (!block->in_use) @@ -722,7 +722,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) * same relation (with some scheme to handle invalidations * safely), but for now we'll call smgropen() every time. */ - reln = smgropen(block->rlocator, InvalidBackendId); + sfile = smgropen(block->rlocator, InvalidBackendId, block->forknum); /* * If the relation file doesn't exist on disk, for example because @@ -731,14 +731,14 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) * further prefetching in the relation until this record is * replayed. 
*/ - if (!smgrexists(reln, MAIN_FORKNUM)) + if (!smgrexists(sfile)) { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, "suppressing all prefetch in relation %u/%u/%u until %X/%X is replayed, because the relation does not exist on disk", - reln->smgr_rlocator.locator.spcOid, - reln->smgr_rlocator.locator.dbOid, - reln->smgr_rlocator.locator.relNumber, + sfile->smgr_locator.locator.spcOid, + sfile->smgr_locator.locator.dbOid, + sfile->smgr_locator.locator.relNumber, LSN_FORMAT_ARGS(record->lsn)); #endif XLogPrefetcherAddFilter(prefetcher, block->rlocator, 0, @@ -752,14 +752,14 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) * block yet, suppress prefetching of this block and higher until * this record is replayed. */ - if (block->blkno >= smgrnblocks(reln, block->forknum)) + if (block->blkno >= smgrnblocks(sfile)) { #ifdef XLOGPREFETCHER_DEBUG_LEVEL elog(XLOGPREFETCHER_DEBUG_LEVEL, "suppressing prefetch in relation %u/%u/%u from block %u until %X/%X is replayed, because the relation is too small", - reln->smgr_rlocator.locator.spcOid, - reln->smgr_rlocator.locator.dbOid, - reln->smgr_rlocator.locator.relNumber, + sfile->smgr_locator.locator.spcOid, + sfile->smgr_locator.locator.dbOid, + sfile->smgr_locator.locator.relNumber, block->blkno, LSN_FORMAT_ARGS(record->lsn)); #endif @@ -770,7 +770,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) } /* Try to initiate prefetching. */ - result = PrefetchSharedBuffer(reln, block->forknum, block->blkno); + result = PrefetchSharedBuffer(sfile, block->blkno); if (BufferIsValid(result.recent_buffer)) { /* Cache hit, nothing to do.
*/ @@ -796,9 +796,9 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) */ elog(ERROR, "could not prefetch relation %u/%u/%u block %u", - reln->smgr_rlocator.locator.spcOid, - reln->smgr_rlocator.locator.dbOid, - reln->smgr_rlocator.locator.relNumber, + sfile->smgr_locator.locator.spcOid, + sfile->smgr_locator.locator.dbOid, + sfile->smgr_locator.locator.relNumber, block->blkno); } } diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 563cba258dd..ab59bfe66c9 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -477,7 +477,7 @@ XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, { BlockNumber lastblock; Buffer buffer; - SMgrRelation smgr; + SMgrFileHandle sfile; Assert(blkno != P_NEW); @@ -491,7 +491,7 @@ XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, } /* Open the relation at smgr level */ - smgr = smgropen(rlocator, InvalidBackendId); + sfile = smgropen(rlocator, InvalidBackendId, forknum); /* * Create the target file if it doesn't already exist. This lets us cope @@ -501,9 +501,9 @@ XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, * filesystem loses an inode during a crash. Better to write the data * until we are actually told to delete the file.) 
*/ - smgrcreate(smgr, forknum, true); + smgrcreate(sfile, true); - lastblock = smgrnblocks(smgr, forknum); + lastblock = smgrnblocks(sfile); if (blkno < lastblock) { @@ -631,7 +631,7 @@ CreateFakeRelcacheEntry(RelFileLocator rlocator) rel->rd_lockInfo.lockRelId.dbId = rlocator.dbOid; rel->rd_lockInfo.lockRelId.relId = rlocator.relNumber; - rel->rd_smgr = NULL; + MemSet(rel->rd_smgr, 0, sizeof(rel->rd_smgr)); return rel; } @@ -643,8 +643,11 @@ void FreeFakeRelcacheEntry(Relation fakerel) { /* make sure the fakerel is not referenced by the SmgrRelation anymore */ - if (fakerel->rd_smgr != NULL) - smgrclearowner(&fakerel->rd_smgr, fakerel->rd_smgr); + for (int i = 0; i <= MAX_FORKNUM; i++) + { + if (fakerel->rd_smgr[i] != NULL) + smgrclearowner(&fakerel->rd_smgr[i], fakerel->rd_smgr[i]); + } pfree(fakerel); } diff --git a/src/backend/catalog/catalog.c b/src/backend/catalog/catalog.c index 2abd6b007a2..f536cfc2a9b 100644 --- a/src/backend/catalog/catalog.c +++ b/src/backend/catalog/catalog.c @@ -501,10 +501,10 @@ GetNewOidWithIndex(Relation relation, Oid indexId, AttrNumber oidcolumn) RelFileNumber GetNewRelFileNumber(Oid reltablespace, Relation pg_class, char relpersistence) { - RelFileLocatorBackend rlocator; - char *rpath; - bool collides; + RelFileLocator rlocator; BackendId backend; + SMgrFileHandle sfile; + bool collides; /* * If we ever get here during pg_upgrade, there's something wrong; all @@ -513,6 +513,11 @@ GetNewRelFileNumber(Oid reltablespace, Relation pg_class, char relpersistence) */ Assert(!IsBinaryUpgrade); + /* + * The relpath will vary based on the backend ID, so we must initialize + * that properly here to make sure that any collisions based on filename + * are properly detected. + */ switch (relpersistence) { case RELPERSISTENCE_TEMP: @@ -528,53 +533,29 @@ GetNewRelFileNumber(Oid reltablespace, Relation pg_class, char relpersistence) } /* This logic should match RelationInitPhysicalAddr */ - rlocator.locator.spcOid = reltablespace ? 
reltablespace : MyDatabaseTableSpace; - rlocator.locator.dbOid = - (rlocator.locator.spcOid == GLOBALTABLESPACE_OID) ? + rlocator.spcOid = reltablespace ? reltablespace : MyDatabaseTableSpace; + rlocator.dbOid = + (rlocator.spcOid == GLOBALTABLESPACE_OID) ? InvalidOid : MyDatabaseId; - /* - * The relpath will vary based on the backend ID, so we must initialize - * that properly here to make sure that any collisions based on filename - * are properly detected. - */ - rlocator.backend = backend; - do { CHECK_FOR_INTERRUPTS(); /* Generate the OID */ if (pg_class) - rlocator.locator.relNumber = GetNewOidWithIndex(pg_class, ClassOidIndexId, + rlocator.relNumber = GetNewOidWithIndex(pg_class, ClassOidIndexId, Anum_pg_class_oid); else - rlocator.locator.relNumber = GetNewObjectId(); + rlocator.relNumber = GetNewObjectId(); /* Check for existing file of same name */ - rpath = relpath(rlocator, MAIN_FORKNUM); - - if (access(rpath, F_OK) == 0) - { - /* definite collision */ - collides = true; - } - else - { - /* - * Here we have a little bit of a dilemma: if errno is something - * other than ENOENT, should we declare a collision and loop? In - * practice it seems best to go ahead regardless of the errno. If - * there is a colliding file we will get an smgr failure when we - * attempt to create the new relation file. - */ - collides = false; - } - - pfree(rpath); + sfile = smgropen(rlocator, backend, MAIN_FORKNUM); + collides = smgrexists(sfile); + smgrclose(sfile); } while (collides); - return rlocator.locator.relNumber; + return rlocator.relNumber; } /* diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 61f1d3926a9..8ae943c4914 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -3029,9 +3029,9 @@ index_build(Relation heapRelation, * relfilenumber won't change, and nothing needs to be done here. 
*/ if (indexRelation->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED && - !smgrexists(RelationGetSmgr(indexRelation), INIT_FORKNUM)) + !smgrexists(RelationGetSmgr(indexRelation, INIT_FORKNUM))) { - smgrcreate(RelationGetSmgr(indexRelation), INIT_FORKNUM, false); + smgrcreate(RelationGetSmgr(indexRelation, INIT_FORKNUM), false); indexRelation->rd_indam->ambuildempty(indexRelation); } diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index d708af19ed2..5d47864e3a9 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -116,11 +116,11 @@ AddPendingSync(const RelFileLocator *rlocator) * that does not want the storage to be destroyed in case of an abort may * pass register_delete = false. */ -SMgrRelation +SMgrFileHandle RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete) { - SMgrRelation srel; + SMgrFileHandle sfile; BackendId backend; bool needs_wal; @@ -145,11 +145,11 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence, return NULL; /* placate compiler */ } - srel = smgropen(rlocator, backend); - smgrcreate(srel, MAIN_FORKNUM, false); + sfile = smgropen(rlocator, backend, MAIN_FORKNUM); + smgrcreate(sfile, false); if (needs_wal) - log_smgrcreate(&srel->smgr_rlocator.locator, MAIN_FORKNUM); + log_smgrcreate(&rlocator, MAIN_FORKNUM); /* * Add the relation to the list of stuff to delete at abort, if we are @@ -175,7 +175,7 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence, AddPendingSync(&rlocator); } - return srel; + return sfile; } /* @@ -292,16 +292,18 @@ RelationTruncate(Relation rel, BlockNumber nblocks) ForkNumber forks[MAX_FORKNUM]; BlockNumber blocks[MAX_FORKNUM]; int nforks = 0; - SMgrRelation reln; /* * Make sure smgr_targblock etc aren't pointing somewhere past new end. * (Note: don't rely on this reln pointer below this loop.) 
*/ - reln = RelationGetSmgr(rel); - reln->smgr_targblock = InvalidBlockNumber; - for (int i = 0; i <= MAX_FORKNUM; ++i) - reln->smgr_cached_nblocks[i] = InvalidBlockNumber; + for (int i = 0; i <= MAX_FORKNUM; i++) + { + SMgrFileHandle sfile = RelationGetSmgr(rel, i); + + sfile->smgr_targblock = InvalidBlockNumber; + sfile->smgr_cached_nblocks = InvalidBlockNumber; + } /* Prepare for truncation of MAIN fork of the relation */ forks[nforks] = MAIN_FORKNUM; @@ -309,7 +311,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks) nforks++; /* Prepare for truncation of the FSM if it exists */ - fsm = smgrexists(RelationGetSmgr(rel), FSM_FORKNUM); + fsm = smgrexists(RelationGetSmgr(rel, FSM_FORKNUM)); if (fsm) { blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, nblocks); @@ -322,7 +324,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks) } /* Prepare for truncation of the visibility map too if it exists */ - vm = smgrexists(RelationGetSmgr(rel), VISIBILITYMAP_FORKNUM); + vm = smgrexists(RelationGetSmgr(rel, VISIBILITYMAP_FORKNUM)); if (vm) { blocks[nforks] = visibilitymap_prepare_truncate(rel, nblocks); @@ -390,11 +392,12 @@ RelationTruncate(Relation rel, BlockNumber nblocks) } /* - * This will first remove any buffers from the buffer pool that should no + * First remove any buffers from the buffer pool that should no * longer exist after truncation is complete, and then truncate the * corresponding files on disk. */ - smgrtruncate(RelationGetSmgr(rel), forks, nforks, blocks); + DropRelationBuffers(rel->rd_locator, rel->rd_backend, forks, nforks, blocks); + smgrtruncate_multi(rel->rd_locator, rel->rd_backend, forks, nforks, blocks); /* We've done all the critical work, so checkpoints are OK now. 
*/ MyProc->delayChkptFlags &= ~DELAY_CHKPT_COMPLETE; @@ -428,7 +431,7 @@ RelationPreTruncate(Relation rel) return; pending = hash_search(pendingSyncHash, - &(RelationGetSmgr(rel)->smgr_rlocator.locator), + &rel->rd_locator, HASH_FIND, NULL); if (pending) pending->is_truncated = true; @@ -444,12 +447,12 @@ RelationPreTruncate(Relation rel) * Also note that this is frequently called via locutions such as * RelationCopyStorage(RelationGetSmgr(rel), ...); * That's safe only because we perform only smgr and WAL operations here. - * If we invoked anything else, a relcache flush could cause our SMgrRelation + * If we invoked anything else, a relcache flush could cause our SMgrFileHandle * argument to become a dangling pointer. */ void -RelationCopyStorage(SMgrRelation src, SMgrRelation dst, - ForkNumber forkNum, char relpersistence) +RelationCopyStorage(SMgrFileHandle src, SMgrFileHandle dst, + char relpersistence) { PGAlignedBlock buf; Page page; @@ -466,7 +469,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst, * it needs to be synced to disk. */ copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED && - forkNum == INIT_FORKNUM; + src->smgr_locator.forknum == INIT_FORKNUM; /* * We need to log the copied data in WAL iff WAL archiving/streaming is @@ -477,14 +480,14 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst, use_wal = XLogIsNeeded() && (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork); - nblocks = smgrnblocks(src, forkNum); + nblocks = smgrnblocks(src); for (blkno = 0; blkno < nblocks; blkno++) { /* If we got a cancel signal during the copy of the data, quit */ CHECK_FOR_INTERRUPTS(); - smgrread(src, forkNum, blkno, buf.data); + smgrread(src, blkno, buf.data); if (!PageIsVerifiedExtended(page, blkno, PIV_LOG_WARNING | PIV_REPORT_STAT)) @@ -496,9 +499,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst, * (errcontext callbacks shouldn't be risking any such thing, but * people have been known to forget that rule.) 
*/ - char *relpath = relpathbackend(src->smgr_rlocator.locator, - src->smgr_rlocator.backend, - forkNum); + char *relpath = smgrfilepath(src->smgr_locator); ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), @@ -512,7 +513,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst, * space. */ if (use_wal) - log_newpage(&dst->smgr_rlocator.locator, forkNum, blkno, page, false); + log_newpage(&dst->smgr_locator.locator, dst->smgr_locator.forknum, blkno, page, false); PageSetChecksumInplace(page, blkno); @@ -521,7 +522,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst, * need for smgr to schedule an fsync for this write; we'll do it * ourselves below. */ - smgrextend(dst, forkNum, blkno, buf.data, true); + smgrextend(dst, blkno, buf.data, true); } /* @@ -534,7 +535,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst, * they might still not be on disk when the crash occurs. */ if (use_wal || copying_initfork) - smgrimmedsync(dst, forkNum); + smgrimmedsync(dst); } /* @@ -653,9 +654,9 @@ smgrDoPendingDeletes(bool isCommit) PendingRelDelete *pending; PendingRelDelete *prev; PendingRelDelete *next; - int nrels = 0, - maxrels = 0; - SMgrRelation *srels = NULL; + int nlocators = 0, + maxlocators = 0; + RelFileLocatorBackend *locators = NULL; prev = NULL; for (pending = pendingDeletes; pending != NULL; pending = next) @@ -676,23 +677,21 @@ smgrDoPendingDeletes(bool isCommit) /* do deletion if called for */ if (pending->atCommit == isCommit) { - SMgrRelation srel; - - srel = smgropen(pending->rlocator, pending->backend); + RelFileLocatorBackend rlocator = { pending->rlocator, pending->backend }; /* allocate the initial array, or extend it, if needed */ - if (maxrels == 0) + if (maxlocators == 0) { - maxrels = 8; - srels = palloc(sizeof(SMgrRelation) * maxrels); + maxlocators = 8; + locators = palloc(sizeof(RelFileLocatorBackend) * maxlocators); } - else if (maxrels <= nrels) + else if (maxlocators <= nlocators) { - maxrels *= 2; - srels = repalloc(srels, 
sizeof(SMgrRelation) * maxrels); + maxlocators *= 2; + locators = repalloc(locators, sizeof(RelFileLocatorBackend) * maxlocators); } - srels[nrels++] = srel; + locators[nlocators++] = rlocator; } /* must explicitly free the list entry */ pfree(pending); @@ -700,15 +699,58 @@ smgrDoPendingDeletes(bool isCommit) } } - if (nrels > 0) + if (nlocators > 0) { - smgrdounlinkall(srels, nrels, false); + ForkNumber forks[MAX_FORKNUM + 1]; + + for (int i = 0; i <= MAX_FORKNUM; i++) + forks[i] = i; + + /* + * Get rid of any remaining buffers for the relations. bufmgr will just + * drop them without bothering to write the contents. + */ + DropRelationsAllBuffers(locators, nlocators); - for (int i = 0; i < nrels; i++) - smgrclose(srels[i]); + for (int i = 0; i < nlocators; i++) + smgrunlink_multi(locators[i].locator, locators[i].backend, forks, MAX_FORKNUM + 1, false); + pfree(locators); + } +} + +/* + * DropRelationFiles -- drop files of all given relations + */ +void +DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo) +{ + RelFileLocatorBackend *locators; + int i; + ForkNumber all_forks[MAX_FORKNUM + 1]; - pfree(srels); + locators = palloc(sizeof(RelFileLocatorBackend) * ndelrels); + for (i = 0; i < ndelrels; i++) + { + if (isRedo) + { + for (int fork = 0; fork <= MAX_FORKNUM; fork++) + XLogDropRelation(delrels[i], fork); + } + locators[i].locator = delrels[i]; + locators[i].backend = InvalidBackendId; } + + /* + * Get rid of any remaining buffers for the relations. bufmgr will just + * drop them without bothering to write the contents. 
+ */ + DropRelationsAllBuffers(locators, ndelrels); + + for (int fork = 0; fork <= MAX_FORKNUM; fork++) + all_forks[fork] = fork; + + for (i = 0; i < ndelrels; i++) + smgrunlink_multi(locators[i].locator, locators[i].backend, all_forks, MAX_FORKNUM + 1, true); } /* @@ -718,9 +760,9 @@ void smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) { PendingRelDelete *pending; - int nrels = 0, - maxrels = 0; - SMgrRelation *srels = NULL; + int nlocators = 0, + maxlocators = 0; + RelFileLocator *locators = NULL; HASH_SEQ_STATUS scan; PendingRelSync *pendingsync; @@ -757,9 +799,6 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) ForkNumber fork; BlockNumber nblocks[MAX_FORKNUM + 1]; BlockNumber total_blocks = 0; - SMgrRelation srel; - - srel = smgropen(pendingsync->rlocator, InvalidBackendId); /* * We emit newpage WAL records for smaller relations. @@ -773,9 +812,12 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) { for (fork = 0; fork <= MAX_FORKNUM; fork++) { - if (smgrexists(srel, fork)) + SMgrFileHandle sfile; + + sfile = smgropen(pendingsync->rlocator, InvalidBackendId, fork); + if (smgrexists(sfile)) { - BlockNumber n = smgrnblocks(srel, fork); + BlockNumber n = smgrnblocks(sfile); /* we shouldn't come here for unlogged relations */ Assert(fork != INIT_FORKNUM); @@ -803,18 +845,19 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) total_blocks * BLCKSZ / 1024 >= wal_skip_threshold) { /* allocate the initial array, or extend it, if needed */ - if (maxrels == 0) + if (maxlocators == 0) { - maxrels = 8; - srels = palloc(sizeof(SMgrRelation) * maxrels); + maxlocators = 8; + locators = palloc(sizeof(RelFileLocatorBackend) * maxlocators); } - else if (maxrels <= nrels) + else if (maxlocators <= nlocators) { - maxrels *= 2; - srels = repalloc(srels, sizeof(SMgrRelation) * maxrels); + maxlocators *= 2; + locators = repalloc(locators, sizeof(RelFileLocatorBackend) * maxlocators); } - srels[nrels++] = srel; + locators[nlocators] = 
pendingsync->rlocator; + nlocators++; } else { @@ -833,7 +876,7 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) * page including any unused space. ReadBufferExtended() * counts some pgstat events; unfortunately, we discard them. */ - rel = CreateFakeRelcacheEntry(srel->smgr_rlocator.locator); + rel = CreateFakeRelcacheEntry(pendingsync->rlocator); log_newpage_range(rel, fork, 0, n, false); FreeFakeRelcacheEntry(rel); } @@ -842,11 +885,20 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) pendingSyncHash = NULL; - if (nrels > 0) + FlushRelationsAllBuffers(locators, nlocators); + + for (int i = 0; i < nlocators; i++) { - smgrdosyncall(srels, nrels); - pfree(srels); + for (int fork = 0; fork <= MAX_FORKNUM; fork++) + { + SMgrFileHandle sfile = smgropen(locators[i], InvalidBackendId, fork); + + if (smgrexists(sfile)) + smgrimmedsync(sfile); + } } + if (locators != NULL) + pfree(locators); } /* @@ -966,22 +1018,22 @@ smgr_redo(XLogReaderState *record) if (info == XLOG_SMGR_CREATE) { xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record); - SMgrRelation reln; + SMgrFileHandle sfile; - reln = smgropen(xlrec->rlocator, InvalidBackendId); - smgrcreate(reln, xlrec->forkNum, true); + sfile = smgropen(xlrec->rlocator, InvalidBackendId, xlrec->forkNum); + smgrcreate(sfile, true); } else if (info == XLOG_SMGR_TRUNCATE) { xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record); - SMgrRelation reln; + SMgrFileHandle sfile; Relation rel; ForkNumber forks[MAX_FORKNUM]; BlockNumber blocks[MAX_FORKNUM]; int nforks = 0; bool need_fsm_vacuum = false; - reln = smgropen(xlrec->rlocator, InvalidBackendId); + sfile = smgropen(xlrec->rlocator, InvalidBackendId, MAIN_FORKNUM); /* * Forcibly create relation if it doesn't exist (which suggests that @@ -989,7 +1041,7 @@ smgr_redo(XLogReaderState *record) * XLogReadBufferForRedo, we prefer to recreate the rel and replay the * log as best we can until the drop is seen. 
*/ - smgrcreate(reln, MAIN_FORKNUM, true); + smgrcreate(sfile, true); /* * Before we perform the truncation, update minimum recovery point to @@ -1022,8 +1074,10 @@ smgr_redo(XLogReaderState *record) /* Prepare for truncation of FSM and VM too */ rel = CreateFakeRelcacheEntry(xlrec->rlocator); + DropRelationBuffers(xlrec->rlocator, InvalidBackendId, forks, nforks, blocks); + if ((xlrec->flags & SMGR_TRUNCATE_FSM) != 0 && - smgrexists(reln, FSM_FORKNUM)) + smgrexists(smgropen(xlrec->rlocator, InvalidBackendId, FSM_FORKNUM))) { blocks[nforks] = FreeSpaceMapPrepareTruncateRel(rel, xlrec->blkno); if (BlockNumberIsValid(blocks[nforks])) @@ -1034,7 +1088,7 @@ smgr_redo(XLogReaderState *record) } } if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0 && - smgrexists(reln, VISIBILITYMAP_FORKNUM)) + smgrexists(smgropen(xlrec->rlocator, InvalidBackendId, VISIBILITYMAP_FORKNUM))) { blocks[nforks] = visibilitymap_prepare_truncate(rel, xlrec->blkno); if (BlockNumberIsValid(blocks[nforks])) @@ -1046,7 +1100,10 @@ smgr_redo(XLogReaderState *record) /* Do the real work to truncate relation forks */ if (nforks > 0) - smgrtruncate(reln, forks, nforks, blocks); + { + DropRelationBuffers(xlrec->rlocator, InvalidBackendId, forks, nforks, blocks); + smgrtruncate_multi(xlrec->rlocator, InvalidBackendId, forks, nforks, blocks); + } /* * Update upper-level FSM pages to account for the truncation. This is diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 3e1b92df030..b9e362ccab0 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -141,6 +141,7 @@ #include "libpq/libpq.h" #include "libpq/pqformat.h" #include "miscadmin.h" +#include "storage/bufmgr.h" #include "storage/ipc.h" #include "storage/lmgr.h" #include "storage/proc.h" @@ -163,7 +164,7 @@ * than that, so changes in that data structure won't affect user-visible * restrictions. 
*/ -#define NOTIFY_PAYLOAD_MAX_LENGTH (BLCKSZ - NAMEDATALEN - 128) +#define NOTIFY_PAYLOAD_MAX_LENGTH (BLCKSZ - NAMEDATALEN - SizeOfPageHeaderData - 128) /* * Struct representing an entry in the global notify queue @@ -213,7 +214,7 @@ typedef struct QueuePosition ((x).page == (y).page && (x).offset == (y).offset) #define QUEUE_POS_IS_ZERO(x) \ - ((x).page == 0 && (x).offset == 0) + ((x).page == 0 && (x).offset == MAXALIGN(SizeOfPageHeaderData)) /* choose logically smaller QueuePosition */ #define QUEUE_POS_MIN(x,y) \ @@ -305,12 +306,6 @@ static AsyncQueueControl *asyncQueueControl; #define QUEUE_NEXT_LISTENER(i) (asyncQueueControl->backend[i].nextListener) #define QUEUE_BACKEND_POS(i) (asyncQueueControl->backend[i].pos) -/* - * The SLRU buffer area through which we access the notification queue - */ -static SlruCtlData NotifyCtlData; - -#define NotifyCtl (&NotifyCtlData) #define QUEUE_PAGESIZE BLCKSZ #define QUEUE_FULL_WARN_INTERVAL 5000 /* warn at most once every 5s */ @@ -521,8 +516,6 @@ AsyncShmemSize(void) size = mul_size(MaxBackends + 1, sizeof(QueueBackendStatus)); size = add_size(size, offsetof(AsyncQueueControl, backend)); - size = add_size(size, SimpleLruShmemSize(NUM_NOTIFY_BUFFERS, 0)); - return size; } @@ -550,8 +543,8 @@ AsyncShmemInit(void) if (!found) { /* First time through, so initialize it */ - SET_QUEUE_POS(QUEUE_HEAD, 0, 0); - SET_QUEUE_POS(QUEUE_TAIL, 0, 0); + SET_QUEUE_POS(QUEUE_HEAD, 0, MAXALIGN(SizeOfPageHeaderData)); + SET_QUEUE_POS(QUEUE_TAIL, 0, MAXALIGN(SizeOfPageHeaderData)); QUEUE_STOP_PAGE = 0; QUEUE_FIRST_LISTENER = InvalidBackendId; asyncQueueControl->lastQueueFillWarn = 0; @@ -561,24 +554,17 @@ AsyncShmemInit(void) QUEUE_BACKEND_PID(i) = InvalidPid; QUEUE_BACKEND_DBOID(i) = InvalidOid; QUEUE_NEXT_LISTENER(i) = InvalidBackendId; - SET_QUEUE_POS(QUEUE_BACKEND_POS(i), 0, 0); + SET_QUEUE_POS(QUEUE_BACKEND_POS(i), 0, MAXALIGN(SizeOfPageHeaderData)); } } - /* - * Set up SLRU management of the pg_notify data. 
- */ - NotifyCtl->PagePrecedes = asyncQueuePagePrecedes; - SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS, 0, - NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER, - SYNC_HANDLER_NONE); - if (!found) { /* * During start or reboot, clean out the pg_notify directory. */ - (void) SlruScanDirectory(NotifyCtl, SlruScanDirCbDeleteAll, NULL); + (void) SlruScanDirectory(SLRU_NOTIFY_ID, asyncQueuePagePrecedes, + SlruScanDirCbDeleteAll, NULL); } } @@ -1345,19 +1331,19 @@ asyncQueueAdvance(volatile QueuePosition *position, int entryLength) * written or read. */ offset += entryLength; - Assert(offset <= QUEUE_PAGESIZE); + Assert(offset <= QUEUE_PAGESIZE - MAXALIGN(SizeOfPageHeaderData)); /* * In a second step check if another entry can possibly be written to the * page. If so, stay here, we have reached the next position. If not, then * we need to move on to the next page. */ - if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGESIZE) + if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGESIZE - MAXALIGN(SizeOfPageHeaderData)) { pageno++; if (pageno > QUEUE_MAX_PAGE) pageno = 0; /* wrap around */ - offset = 0; + offset = MAXALIGN(SizeOfPageHeaderData); /* start at SizeOfPageHeaderData */ pageJump = true; } @@ -1411,10 +1397,7 @@ asyncQueueAddEntries(ListCell *nextNotify) QueuePosition queue_head; int pageno; int offset; - int slotno; - - /* We hold both NotifyQueueLock and NotifySLRULock during this operation */ - LWLockAcquire(NotifySLRULock, LW_EXCLUSIVE); + Buffer buffer; /* * We work with a local copy of QUEUE_HEAD, which we write back to shared @@ -1439,13 +1422,20 @@ asyncQueueAddEntries(ListCell *nextNotify) */ pageno = QUEUE_POS_PAGE(queue_head); if (QUEUE_POS_IS_ZERO(queue_head)) - slotno = SimpleLruZeroPage(NotifyCtl, pageno); + { + buffer = ZeroSlruBuffer(SLRU_NOTIFY_ID, pageno); + PageSetHeaderDataNonRel(BufferGetPage(buffer), pageno, InvalidXLogRecPtr, BLCKSZ, PG_METAPAGE_LAYOUT_VERSION); + + } else - slotno = 
SimpleLruReadPage(NotifyCtl, pageno, true, - InvalidTransactionId); + { + buffer = ReadSlruBuffer(SLRU_NOTIFY_ID, pageno, RBM_NORMAL); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + } + /* Note we mark the page dirty before writing in it */ - NotifyCtl->shared->page_dirty[slotno] = true; + MarkBufferDirty(buffer); while (nextNotify != NULL) { @@ -1457,7 +1447,7 @@ asyncQueueAddEntries(ListCell *nextNotify) offset = QUEUE_POS_OFFSET(queue_head); /* Check whether the entry really fits on the current page */ - if (offset + qe.length <= QUEUE_PAGESIZE) + if (offset + qe.length <= QUEUE_PAGESIZE - MAXALIGN(SizeOfPageHeaderData)) { /* OK, so advance nextNotify past this item */ nextNotify = lnext(pendingNotifies->events, nextNotify); @@ -1469,17 +1459,18 @@ asyncQueueAddEntries(ListCell *nextNotify) * only check dboid and since it won't match any reader's database * OID, they will ignore this entry and move on. */ - qe.length = QUEUE_PAGESIZE - offset; + qe.length = QUEUE_PAGESIZE - MAXALIGN(SizeOfPageHeaderData) - offset; qe.dboid = InvalidOid; qe.data[0] = '\0'; /* empty channel */ qe.data[1] = '\0'; /* empty payload */ } /* Now copy qe into the shared buffer page */ - memcpy(NotifyCtl->shared->page_buffer[slotno] + offset, + memcpy(PageGetContents(BufferGetPage(buffer)) + offset, &qe, qe.length); + /* Advance queue_head appropriately, and detect if page is full */ if (asyncQueueAdvance(&(queue_head), qe.length)) { @@ -1491,7 +1482,10 @@ asyncQueueAddEntries(ListCell *nextNotify) * asyncQueueIsFull() ensured that there is room to create this * page without overrunning the queue. 
*/ - slotno = SimpleLruZeroPage(NotifyCtl, QUEUE_POS_PAGE(queue_head)); + UnlockReleaseBuffer(buffer); + buffer = ZeroSlruBuffer(SLRU_NOTIFY_ID, + QUEUE_POS_PAGE(queue_head)); + MarkBufferDirty(buffer); /* * If the new page address is a multiple of QUEUE_CLEANUP_DELAY, @@ -1505,12 +1499,11 @@ asyncQueueAddEntries(ListCell *nextNotify) break; } } + UnlockReleaseBuffer(buffer); /* Success, so update the global QUEUE_HEAD */ QUEUE_HEAD = queue_head; - LWLockRelease(NotifySLRULock); - return nextNotify; } @@ -1983,17 +1976,16 @@ asyncQueueReadAllNotifications(void) { int curpage = QUEUE_POS_PAGE(pos); int curoffset = QUEUE_POS_OFFSET(pos); - int slotno; int copysize; + Buffer buffer; /* - * We copy the data from SLRU into a local buffer, so as to avoid - * holding the NotifySLRULock while we are examining the entries - * and possibly transmitting them to our frontend. Copy only the - * part of the page we will actually inspect. + * We copy the data into a local buffer, so as to avoid holding a + * buffer pin while we are examining the entries and possibly + * transmitting them to our frontend. Copy only the part of the + * page we will actually inspect. 
*/ - slotno = SimpleLruReadPage_ReadOnly(NotifyCtl, curpage, - InvalidTransactionId); + buffer = ReadSlruBuffer(SLRU_NOTIFY_ID, curpage, RBM_NORMAL); if (curpage == QUEUE_POS_PAGE(head)) { /* we only want to read as far as head */ @@ -2004,13 +1996,12 @@ asyncQueueReadAllNotifications(void) else { /* fetch all the rest of the page */ - copysize = QUEUE_PAGESIZE - curoffset; + copysize = QUEUE_PAGESIZE - MAXALIGN(SizeOfPageHeaderData) - curoffset; } - memcpy(page_buffer.buf + curoffset, - NotifyCtl->shared->page_buffer[slotno] + curoffset, + memcpy(PageGetContents(page_buffer.buf) + curoffset, + PageGetContents(BufferGetPage(buffer)) + curoffset, copysize); - /* Release lock that we got from SimpleLruReadPage_ReadOnly() */ - LWLockRelease(NotifySLRULock); + ReleaseBuffer(buffer); /* * Process messages up to the stop position, end of page, or an @@ -2078,7 +2069,7 @@ asyncQueueProcessPageEntries(volatile QueuePosition *current, if (QUEUE_POS_EQUAL(thisentry, stop)) break; - qe = (AsyncQueueEntry *) (page_buffer + QUEUE_POS_OFFSET(thisentry)); + qe = (AsyncQueueEntry *) (PageGetContents(page_buffer) + QUEUE_POS_OFFSET(thisentry)); /* * Advance *current over this message, possibly to the next page. As @@ -2207,7 +2198,7 @@ asyncQueueAdvanceTail(void) * SimpleLruTruncate() will ask for NotifySLRULock but will also * release the lock again. */ - SimpleLruTruncate(NotifyCtl, newtailpage); + SimpleLruTruncate(SLRU_NOTIFY_ID, asyncQueuePagePrecedes, newtailpage); /* * Update QUEUE_STOP_PAGE. 
This changes asyncQueueIsFull()'s verdict diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 6eb87427181..70241e6f4c7 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -259,7 +259,7 @@ ScanSourceDatabasePgClass(Oid tbid, Oid dbid, char *srcpath) List *rlocatorlist = NIL; LockRelId relid; Snapshot snapshot; - SMgrRelation smgr; + SMgrFileHandle sfile; BufferAccessStrategy bstrategy; /* Get pg_class relfilenumber. */ @@ -276,9 +276,9 @@ ScanSourceDatabasePgClass(Oid tbid, Oid dbid, char *srcpath) rlocator.dbOid = dbid; rlocator.relNumber = relfilenumber; - smgr = smgropen(rlocator, InvalidBackendId); - nblocks = smgrnblocks(smgr, MAIN_FORKNUM); - smgrclose(smgr); + sfile = smgropen(rlocator, InvalidBackendId, MAIN_FORKNUM); + nblocks = smgrnblocks(sfile); + smgrclose(sfile); /* Use a buffer access strategy since this is a bulk read operation. */ bstrategy = GetAccessStrategy(BAS_BULKREAD); diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index 99c9f91cba5..ee11ef63c0e 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -353,14 +353,14 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) { - SMgrRelation srel; + SMgrFileHandle sfile; - srel = smgropen(rel->rd_locator, InvalidBackendId); - smgrcreate(srel, INIT_FORKNUM, false); + sfile = smgropen(rel->rd_locator, InvalidBackendId, INIT_FORKNUM); + smgrcreate(sfile, false); log_smgrcreate(&rel->rd_locator, INIT_FORKNUM); fill_seq_fork_with_data(rel, tuple, INIT_FORKNUM); FlushRelationBuffers(rel); - smgrclose(srel); + smgrclose(sfile); } } diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index ee88e87d76d..06862dfa346 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -14675,9 +14675,7 @@ AlterTableMoveAll(AlterTableMoveAllStmt *stmt) static void 
index_copy_data(Relation rel, RelFileLocator newrlocator) { - SMgrRelation dstrel; - - dstrel = smgropen(newrlocator, rel->rd_backend); + SMgrFileHandle dstmain; /* * Since we copy the file directly without looking at the shared buffers, @@ -14697,16 +14695,20 @@ index_copy_data(Relation rel, RelFileLocator newrlocator) RelationCreateStorage(newrlocator, rel->rd_rel->relpersistence, true); /* copy main fork */ - RelationCopyStorage(RelationGetSmgr(rel), dstrel, MAIN_FORKNUM, + dstmain = smgropen(newrlocator, rel->rd_backend, MAIN_FORKNUM); + RelationCopyStorage(RelationGetSmgr(rel, MAIN_FORKNUM), dstmain, rel->rd_rel->relpersistence); /* copy those extra forks that exist */ for (ForkNumber forkNum = MAIN_FORKNUM + 1; forkNum <= MAX_FORKNUM; forkNum++) { - if (smgrexists(RelationGetSmgr(rel), forkNum)) + if (smgrexists(RelationGetSmgr(rel, forkNum))) { - smgrcreate(dstrel, forkNum, false); + SMgrFileHandle src_fork = RelationGetSmgr(rel, forkNum); + SMgrFileHandle dst_fork = smgropen(newrlocator, rel->rd_backend, forkNum); + + smgrcreate(dst_fork, false); /* * WAL log creation if the relation is persistent, or this is the @@ -14716,14 +14718,15 @@ index_copy_data(Relation rel, RelFileLocator newrlocator) (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED && forkNum == INIT_FORKNUM)) log_smgrcreate(&newrlocator, forkNum); - RelationCopyStorage(RelationGetSmgr(rel), dstrel, forkNum, + RelationCopyStorage(src_fork, dst_fork, rel->rd_rel->relpersistence); + smgrclose(dst_fork); } } /* drop old relation, and close new one */ RelationDropStorage(rel); - smgrclose(dstrel); + smgrclose(dstmain); } /* diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 6b6264854e6..41819c590e0 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -20,6 +20,7 @@ BufferDescPadded *BufferDescriptors; char *BufferBlocks; +XLogRecPtr *BufferExternalLSNs; ConditionVariableMinimallyPadded *BufferIOCVArray; 
WritebackContext BackendWritebackContext; CkptSortItem *CkptBufferIds; @@ -69,9 +70,11 @@ InitBufferPool(void) { bool foundBufs, foundDescs, + foundLSNs, foundIOCV, foundBufCkpt; + /* Align descriptors to a cacheline boundary. */ BufferDescriptors = (BufferDescPadded *) ShmemInitStruct("Buffer Descriptors", @@ -88,6 +91,11 @@ InitBufferPool(void) NBuffers * sizeof(ConditionVariableMinimallyPadded), &foundIOCV); + BufferExternalLSNs = (XLogRecPtr *) + ShmemInitStruct("Buffer External LSNs", + NBuffers * sizeof(XLogRecPtr), + &foundLSNs); + /* * The array used to sort to-be-checkpointed buffer ids is located in * shared memory, to avoid having to allocate significant amounts of @@ -99,10 +107,10 @@ InitBufferPool(void) ShmemInitStruct("Checkpoint BufferIds", NBuffers * sizeof(CkptSortItem), &foundBufCkpt); - if (foundDescs || foundBufs || foundIOCV || foundBufCkpt) + if (foundDescs || foundBufs || foundIOCV || foundBufCkpt || foundLSNs) { /* should find all of these, or none of them */ - Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt); + Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt && foundLSNs); /* note: this path is only taken in EXEC_BACKEND case */ } else @@ -133,6 +141,8 @@ InitBufferPool(void) LWTRANCHE_BUFFER_CONTENT); ConditionVariableInit(BufferDescriptorGetIOCV(buf)); + + BufferExternalLSNs[i] = InvalidXLogRecPtr; } /* Correct last entry of linked list */ @@ -166,6 +176,9 @@ BufferShmemSize(void) /* size of data pages */ size = add_size(size, mul_size(NBuffers, BLCKSZ)); + /* size of external LSNs */ + size = add_size(size, mul_size(NBuffers, sizeof(XLogRecPtr))); + /* size of stuff controlled by freelist.c */ size = add_size(size, StrategyShmemSize()); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 73d30bf6191..8154e4b013f 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -57,10 +57,17 @@ #include "utils/resowner_private.h" #include 
"utils/timestamp.h" +/* + * XXX Ideally we'd switch to standard pages for SLRU data, but in the + * meantime we need some way to identify buffers that hold raw data (no + * invasive LSN, no checksums). + */ /* Note: these two macros only work on shared buffers, not local ones! */ #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) -#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr))) + +#define BufferGetLSN(bufHdr) \ + PageGetLSN(BufHdrGetBlock(bufHdr)) /* Note: this macro only works on local buffers, not shared ones! */ #define LocalBufHdrGetBlock(bufHdr) \ @@ -117,19 +124,6 @@ typedef struct CkptTsStatus int index; } CkptTsStatus; -/* - * Type for array used to sort SMgrRelations - * - * FlushRelationsAllBuffers shares the same comparator function with - * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be - * compatible. - */ -typedef struct SMgrSortArray -{ - RelFileLocator rlocator; /* This must be the first member */ - SMgrRelation srel; -} SMgrSortArray; - /* GUC variables */ bool zero_damaged_pages = false; int bgwriter_lru_maxpages = 100; @@ -459,8 +453,8 @@ ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref) ) -static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence, - ForkNumber forkNum, BlockNumber blockNum, +static Buffer ReadBuffer_common(SMgrFileHandle sfile, char relpersistence, + BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit); static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy); @@ -476,20 +470,19 @@ static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits); static void shared_buffer_write_error_callback(void *arg); static void local_buffer_write_error_callback(void *arg); -static BufferDesc *BufferAlloc(SMgrRelation smgr, +static BufferDesc *BufferAlloc(SMgrFileHandle smgr, char relpersistence, - ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, 
bool *foundPtr); -static void FlushBuffer(BufferDesc *buf, SMgrRelation reln); +static void FlushBuffer(BufferDesc *buf, SMgrFileHandle sfile); static void FindAndDropRelationBuffers(RelFileLocator rlocator, - ForkNumber forkNum, + ForkNumber forknum, BlockNumber nForkBlock, BlockNumber firstDelBlock); static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, - ForkNumber forkNum, bool permanent); + ForkNumber forknum, bool permanent); static void AtProcExit_Buffers(int code, Datum arg); static void CheckForBufferLeaks(void); static int rlocator_comparator(const void *p1, const void *p2); @@ -502,9 +495,7 @@ static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg); * Implementation of PrefetchBuffer() for shared buffers. */ PrefetchBufferResult -PrefetchSharedBuffer(SMgrRelation smgr_reln, - ForkNumber forkNum, - BlockNumber blockNum) +PrefetchSharedBuffer(SMgrFileHandle sfile, BlockNumber blockNum) { PrefetchBufferResult result = {InvalidBuffer, false}; BufferTag newTag; /* identity of requested block */ @@ -515,8 +506,8 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln, Assert(BlockNumberIsValid(blockNum)); /* create a tag so we can lookup the buffer */ - InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator, - forkNum, blockNum); + InitBufferTag(&newTag, &sfile->smgr_locator.locator, + sfile->smgr_locator.forknum, blockNum); /* determine its hash code and partition lock ID */ newHash = BufTableHashCode(&newTag); @@ -535,7 +526,7 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln, * Try to initiate an asynchronous read. This returns false in * recovery if the relation file doesn't exist. */ - if (smgrprefetch(smgr_reln, forkNum, blockNum)) + if (smgrprefetch(sfile, blockNum)) result.initiated_io = true; #endif /* USE_PREFETCH */ } @@ -589,7 +580,7 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln, * recovery, an error is raised). 
*/ PrefetchBufferResult -PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum) +PrefetchBuffer(Relation reln, ForkNumber forknum, BlockNumber blockNum) { Assert(RelationIsValid(reln)); Assert(BlockNumberIsValid(blockNum)); @@ -603,12 +594,12 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum) errmsg("cannot access temporary tables of other sessions"))); /* pass it off to localbuf.c */ - return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum); + return PrefetchLocalBuffer(RelationGetSmgr(reln, forknum), blockNum); } else { /* pass it to the shared buffer version */ - return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum); + return PrefetchSharedBuffer(RelationGetSmgr(reln, forknum), blockNum); } } @@ -620,7 +611,7 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum) * tag. In that case, the buffer is pinned and the usage count is bumped. */ bool -ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, +ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forknum, BlockNumber blockNum, Buffer recent_buffer) { BufferDesc *bufHdr; @@ -632,7 +623,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN ResourceOwnerEnlargeBuffers(CurrentResourceOwner); ReservePrivateRefCountEntry(); - InitBufferTag(&tag, &rlocator, forkNum, blockNum); + InitBufferTag(&tag, &rlocator, forknum, blockNum); if (BufferIsLocal(recent_buffer)) { @@ -756,7 +747,7 @@ ReadBuffer(Relation reln, BlockNumber blockNum) * See buffer/README for details. */ Buffer -ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, +ReadBufferExtended(Relation reln, ForkNumber forknum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy) { bool hit; @@ -777,8 +768,9 @@ ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, * miss. 
*/ pgstat_count_buffer_read(reln); - buf = ReadBuffer_common(RelationGetSmgr(reln), reln->rd_rel->relpersistence, - forkNum, blockNum, mode, strategy, &hit); + buf = ReadBuffer_common(RelationGetSmgr(reln, forknum), + reln->rd_rel->relpersistence, + blockNum, mode, strategy, &hit); if (hit) pgstat_count_buffer_hit(reln); return buf; @@ -796,19 +788,31 @@ ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, * BackendId). */ Buffer -ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, +ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forknum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent) { bool hit; - SMgrRelation smgr = smgropen(rlocator, InvalidBackendId); + SMgrFileHandle sfile = smgropen(rlocator, InvalidBackendId, forknum); - return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT : - RELPERSISTENCE_UNLOGGED, forkNum, blockNum, + return ReadBuffer_common(sfile, permanent ? RELPERSISTENCE_PERMANENT : + RELPERSISTENCE_UNLOGGED, blockNum, mode, strategy, &hit); } +Buffer +ReadBufferWithoutRelcacheWithHit(RelFileLocator rlocator, ForkNumber forknum, + BlockNumber blockNum, ReadBufferMode mode, + BufferAccessStrategy strategy, bool permanent, bool *hit) +{ + SMgrFileHandle sfile = smgropen(rlocator, InvalidBackendId, forknum); + + return ReadBuffer_common(sfile, permanent ? RELPERSISTENCE_PERMANENT : + RELPERSISTENCE_UNLOGGED, blockNum, + mode, strategy, hit); +} + /* * ReadBuffer_common -- common logic for all ReadBuffer variants @@ -816,7 +820,7 @@ ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, * *hit is set to true if the request was satisfied from shared buffer cache. 
*/ static Buffer -ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, +ReadBuffer_common(SMgrFileHandle sfile, char relpersistence, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool *hit) { @@ -824,7 +828,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, Block bufBlock; bool found; bool isExtend; - bool isLocalBuf = SmgrIsTemp(smgr); + bool isLocalBuf = SmgrIsTemp(sfile); *hit = false; @@ -833,29 +837,30 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, isExtend = (blockNum == P_NEW); - TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum, - smgr->smgr_rlocator.locator.spcOid, - smgr->smgr_rlocator.locator.dbOid, - smgr->smgr_rlocator.locator.relNumber, - smgr->smgr_rlocator.backend, + TRACE_POSTGRESQL_BUFFER_READ_START(sfile->smgr_locator.forknum, + blockNum, + sfile->smgr_locator.locator.spcOid, + sfile->smgr_locator.locator.dbOid, + sfile->smgr_locator.locator.relNumber, + sfile->smgr_locator.backend, isExtend); /* Substitute proper block number if caller asked for P_NEW */ if (isExtend) { - blockNum = smgrnblocks(smgr, forkNum); + blockNum = smgrnblocks(sfile); /* Fail if relation is already at maximum possible length */ if (blockNum == P_NEW) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("cannot extend relation %s beyond %u blocks", - relpath(smgr->smgr_rlocator, forkNum), + smgrfilepath(sfile->smgr_locator), P_NEW))); } if (isLocalBuf) { - bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found); + bufHdr = LocalBufferAlloc(sfile, blockNum, &found); if (found) pgBufferUsage.local_blks_hit++; else if (isExtend) @@ -870,7 +875,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * lookup the buffer. IO_IN_PROGRESS is set if the requested block is * not currently in memory. 
*/ - bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum, + bufHdr = BufferAlloc(sfile, relpersistence, blockNum, strategy, &found); if (found) pgBufferUsage.shared_blks_hit++; @@ -895,11 +900,12 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (VacuumCostActive) VacuumCostBalance += VacuumCostPageHit; - TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum, - smgr->smgr_rlocator.locator.spcOid, - smgr->smgr_rlocator.locator.dbOid, - smgr->smgr_rlocator.locator.relNumber, - smgr->smgr_rlocator.backend, + TRACE_POSTGRESQL_BUFFER_READ_DONE(sfile->smgr_locator.forknum, + blockNum, + sfile->smgr_locator.locator.spcOid, + sfile->smgr_locator.locator.dbOid, + sfile->smgr_locator.locator.relNumber, + sfile->smgr_locator.backend, isExtend, found); @@ -936,7 +942,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (!PageIsNew((Page) bufBlock)) ereport(ERROR, (errmsg("unexpected data beyond EOF in block %u of relation %s", - blockNum, relpath(smgr->smgr_rlocator, forkNum)), + blockNum, smgrfilepath(sfile->smgr_locator)), errhint("This has been seen to occur with buggy kernels; consider updating your system."))); /* @@ -993,7 +999,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, /* new buffers are zero-filled */ MemSet((char *) bufBlock, 0, BLCKSZ); /* don't set checksum for all-zero page */ - smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false); + smgrextend(sfile, blockNum, (char *) bufBlock, false); /* * NB: we're *not* doing a ScheduleBufferTagForWriteback here; @@ -1018,7 +1024,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (track_io_timing) INSTR_TIME_SET_CURRENT(io_start); - smgrread(smgr, forkNum, blockNum, (char *) bufBlock); + smgrread(sfile, blockNum, (char *) bufBlock); if (track_io_timing) { @@ -1029,8 +1035,8 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } /* check for garbage 
data */ - if (!PageIsVerifiedExtended((Page) bufBlock, blockNum, - PIV_LOG_WARNING | PIV_REPORT_STAT)) + if ((!(mode == RBM_TRIM)) && (!PageIsVerifiedExtended((Page) bufBlock, blockNum, + PIV_LOG_WARNING | PIV_REPORT_STAT))) { if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages) { @@ -1038,7 +1044,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid page in block %u of relation %s; zeroing out page", blockNum, - relpath(smgr->smgr_rlocator, forkNum)))); + smgrfilepath(sfile->smgr_locator)))); MemSet((char *) bufBlock, 0, BLCKSZ); } else @@ -1046,7 +1052,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid page in block %u of relation %s", blockNum, - relpath(smgr->smgr_rlocator, forkNum)))); + smgrfilepath(sfile->smgr_locator)))); } } } @@ -1085,11 +1091,12 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, if (VacuumCostActive) VacuumCostBalance += VacuumCostPageMiss; - TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum, - smgr->smgr_rlocator.locator.spcOid, - smgr->smgr_rlocator.locator.dbOid, - smgr->smgr_rlocator.locator.relNumber, - smgr->smgr_rlocator.backend, + TRACE_POSTGRESQL_BUFFER_READ_DONE(sfile->smgr_locator.forknum, + blockNum, + sfile->smgr_locator.locator.spcOid, + sfile->smgr_locator.locator.dbOid, + sfile->smgr_locator.locator.relNumber, + sfile->smgr_locator.backend, isExtend, found); @@ -1116,7 +1123,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * No locks are held either at entry or exit. 
*/ static BufferDesc * -BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, +BufferAlloc(SMgrFileHandle sfile, char relpersistence, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr) @@ -1134,7 +1141,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, uint32 buf_state; /* create a tag so we can lookup the buffer */ - InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum); + InitBufferTag(&newTag, &sfile->smgr_locator.locator, sfile->smgr_locator.forknum, blockNum); /* determine its hash code and partition lock ID */ newHash = BufTableHashCode(&newTag); @@ -1264,10 +1271,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } /* OK, do the I/O */ - TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum, - smgr->smgr_rlocator.locator.spcOid, - smgr->smgr_rlocator.locator.dbOid, - smgr->smgr_rlocator.locator.relNumber); + TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(sfile->smgr_locator.forknum, + blockNum, + sfile->smgr_locator.locator.spcOid, + sfile->smgr_locator.locator.dbOid, + sfile->smgr_locator.locator.relNumber); FlushBuffer(buf, NULL); LWLockRelease(BufferDescriptorGetContentLock(buf)); @@ -1275,10 +1283,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, ScheduleBufferTagForWriteback(&BackendWritebackContext, &buf->tag); - TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum, - smgr->smgr_rlocator.locator.spcOid, - smgr->smgr_rlocator.locator.dbOid, - smgr->smgr_rlocator.locator.relNumber); + TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(sfile->smgr_locator.forknum, + blockNum, + sfile->smgr_locator.locator.spcOid, + sfile->smgr_locator.locator.dbOid, + sfile->smgr_locator.locator.relNumber); } else { @@ -1434,7 +1443,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT | BUF_USAGECOUNT_MASK); - if 
(relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM) + if (relpersistence == RELPERSISTENCE_PERMANENT || sfile->smgr_locator.forknum == INIT_FORKNUM) buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE; else buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE; @@ -1647,7 +1656,7 @@ ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum) { - ForkNumber forkNum = MAIN_FORKNUM; + ForkNumber forknum = MAIN_FORKNUM; BufferDesc *bufHdr; if (BufferIsValid(buffer)) @@ -1658,7 +1667,7 @@ ReleaseAndReadBuffer(Buffer buffer, bufHdr = GetLocalBufferDescriptor(-buffer - 1); if (bufHdr->tag.blockNum == blockNum && BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) && - BufTagGetForkNum(&bufHdr->tag) == forkNum) + BufTagGetForkNum(&bufHdr->tag) == forknum) return buffer; ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer); LocalRefCount[-buffer - 1]--; @@ -1669,7 +1678,7 @@ ReleaseAndReadBuffer(Buffer buffer, /* we have pin, so it's ok to examine tag without spinlock */ if (bufHdr->tag.blockNum == blockNum && BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) && - BufTagGetForkNum(&bufHdr->tag) == forkNum) + BufTagGetForkNum(&bufHdr->tag) == forknum) return buffer; UnpinBuffer(bufHdr); } @@ -2820,7 +2829,7 @@ BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, * as the second parameter. If not, pass NULL. 
*/ static void -FlushBuffer(BufferDesc *buf, SMgrRelation reln) +FlushBuffer(BufferDesc *buf, SMgrFileHandle sfile) { XLogRecPtr recptr; ErrorContextCallback errcallback; @@ -2845,14 +2854,14 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) error_context_stack = &errcallback; /* Find smgr relation for buffer */ - if (reln == NULL) - reln = smgropen(BufTagGetRelFileLocator(&buf->tag), InvalidBackendId); + if (sfile == NULL) + sfile = smgropen(BufTagGetRelFileLocator(&buf->tag), InvalidBackendId, BufTagGetForkNum(&buf->tag)); - TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag), + TRACE_POSTGRESQL_BUFFER_FLUSH_START(sfile->smgr_locator.forknum, buf->tag.blockNum, - reln->smgr_rlocator.locator.spcOid, - reln->smgr_rlocator.locator.dbOid, - reln->smgr_rlocator.locator.relNumber); + sfile->smgr_locator.locator.spcOid, + sfile->smgr_locator.locator.dbOid, + sfile->smgr_locator.locator.relNumber); buf_state = LockBufHdr(buf); @@ -2906,8 +2915,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) /* * bufToWrite is either the shared buffer or a copy, as appropriate. */ - smgrwrite(reln, - BufTagGetForkNum(&buf->tag), + smgrwrite(sfile, buf->tag.blockNum, bufToWrite, false); @@ -2928,11 +2936,11 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) */ TerminateBufferIO(buf, true, 0); - TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag), + TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(sfile->smgr_locator.forknum, buf->tag.blockNum, - reln->smgr_rlocator.locator.spcOid, - reln->smgr_rlocator.locator.dbOid, - reln->smgr_rlocator.locator.relNumber); + sfile->smgr_locator.locator.spcOid, + sfile->smgr_locator.locator.dbOid, + sfile->smgr_locator.locator.relNumber); /* Pop the error context stack */ error_context_stack = errcallback.previous; @@ -2947,7 +2955,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) * it might not be. 
*/ BlockNumber -RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum) +RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forknum) { if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind)) { @@ -2959,13 +2967,13 @@ RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum) */ uint64 szbytes; - szbytes = table_relation_size(relation, forkNum); + szbytes = table_relation_size(relation, forknum); return (szbytes + (BLCKSZ - 1)) / BLCKSZ; } else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind)) { - return smgrnblocks(RelationGetSmgr(relation), forkNum); + return smgrnblocks(RelationGetSmgr(relation, forknum)); } else Assert(false); @@ -3028,6 +3036,7 @@ BufferGetLSNAtomic(Buffer buffer) buf_state = LockBufHdr(bufHdr); lsn = PageGetLSN(page); + UnlockBufHdr(bufHdr, buf_state); return lsn; @@ -3055,26 +3064,20 @@ BufferGetLSNAtomic(Buffer buffer) * -------------------------------------------------------------------- */ void -DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, +DropRelationBuffers(RelFileLocator rlocator, BackendId backend, ForkNumber *forknum, int nforks, BlockNumber *firstDelBlock) { int i; int j; - RelFileLocatorBackend rlocator; BlockNumber nForkBlock[MAX_FORKNUM]; uint64 nBlocksToInvalidate = 0; - rlocator = smgr_reln->smgr_rlocator; - /* If it's a local relation, it's localbuf.c's problem. 
*/ - if (RelFileLocatorBackendIsTemp(rlocator)) + if (backend == MyBackendId) { - if (rlocator.backend == MyBackendId) - { - for (j = 0; j < nforks; j++) - DropRelationLocalBuffers(rlocator.locator, forkNum[j], - firstDelBlock[j]); - } + for (j = 0; j < nforks; j++) + DropRelationLocalBuffers(rlocator, forknum[j], + firstDelBlock[j]); return; } @@ -3103,7 +3106,10 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, for (i = 0; i < nforks; i++) { /* Get the number of blocks for a relation's fork */ - nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]); + SMgrFileHandle sfile; + + sfile = smgropen(rlocator, backend, forknum[i]); + nForkBlock[i] = smgrnblocks_cached(sfile); if (nForkBlock[i] == InvalidBlockNumber) { @@ -3123,7 +3129,7 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD) { for (j = 0; j < nforks; j++) - FindAndDropRelationBuffers(rlocator.locator, forkNum[j], + FindAndDropRelationBuffers(rlocator, forknum[j], nForkBlock[j], firstDelBlock[j]); return; } @@ -3146,18 +3152,18 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, * false positives are safe because we'll recheck after getting the * buffer lock. * - * We could check forkNum and blockNum as well as the rlocator, but + * We could check forknum and blockNum as well as the rlocator, but * the incremental win from doing so seems small. 
*/ - if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator)) + if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator)) continue; buf_state = LockBufHdr(bufHdr); for (j = 0; j < nforks; j++) { - if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) && - BufTagGetForkNum(&bufHdr->tag) == forkNum[j] && + if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) && + BufTagGetForkNum(&bufHdr->tag) == forknum[j] && bufHdr->tag.blockNum >= firstDelBlock[j]) { InvalidateBuffer(bufHdr); /* releases spinlock */ @@ -3178,11 +3184,10 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, * -------------------------------------------------------------------- */ void -DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators) +DropRelationsAllBuffers(RelFileLocatorBackend *rlocators, int nlocators) { int i; int n = 0; - SMgrRelation *rels; BlockNumber (*block)[MAX_FORKNUM + 1]; uint64 nBlocksToInvalidate = 0; RelFileLocator *locators; @@ -3192,18 +3197,18 @@ DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators) if (nlocators == 0) return; - rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */ + locators = palloc(sizeof(RelFileLocator) * nlocators); /* non-local relations */ /* If it's a local relation, it's localbuf.c's problem. 
*/ for (i = 0; i < nlocators; i++) { - if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator)) + if (rlocators[i].backend != InvalidBackendId) { - if (smgr_reln[i]->smgr_rlocator.backend == MyBackendId) - DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator); + if (rlocators[i].backend == MyBackendId) + DropRelationAllLocalBuffers(rlocators[i].locator); } else - rels[n++] = smgr_reln[i]; + locators[n++] = rlocators[i].locator; } /* @@ -3212,7 +3217,7 @@ DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators) */ if (n == 0) { - pfree(rels); + pfree(locators); return; } @@ -3232,12 +3237,13 @@ DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators) for (int j = 0; j <= MAX_FORKNUM; j++) { /* Get the number of blocks for a relation's fork. */ - block[i][j] = smgrnblocks_cached(rels[i], j); + SMgrFileHandle sfile = smgropen(locators[i], InvalidBackendId, j); + block[i][j] = smgrnblocks_cached(sfile); /* We need to only consider the relation forks that exists. 
*/ if (block[i][j] == InvalidBlockNumber) { - if (!smgrexists(rels[i], j)) + if (!smgrexists(sfile)) continue; cached = false; break; @@ -3263,20 +3269,17 @@ DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators) continue; /* drop all the buffers for a particular relation fork */ - FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator, + FindAndDropRelationBuffers(locators[i], j, block[i][j], 0); } } pfree(block); - pfree(rels); + pfree(locators); return; } pfree(block); - locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */ - for (i = 0; i < n; i++) - locators[i] = rels[i]->smgr_rlocator.locator; /* * For low number of relations to drop just use a simple walk through, to @@ -3336,7 +3339,6 @@ DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators) } pfree(locators); - pfree(rels); } /* --------------------------------------------------------------------- @@ -3349,7 +3351,7 @@ DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators) * -------------------------------------------------------------------- */ static void -FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, +FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forknum, BlockNumber nForkBlock, BlockNumber firstDelBlock) { @@ -3365,7 +3367,7 @@ FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, uint32 buf_state; /* create a tag so we can lookup the buffer */ - InitBufferTag(&bufTag, &rlocator, forkNum, curBlock); + InitBufferTag(&bufTag, &rlocator, forknum, curBlock); /* determine its hash code and partition lock ID */ bufHash = BufTableHashCode(&bufTag); @@ -3390,7 +3392,7 @@ FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, buf_state = LockBufHdr(bufHdr); if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) && - BufTagGetForkNum(&bufHdr->tag) == forkNum && + BufTagGetForkNum(&bufHdr->tag) == forknum && bufHdr->tag.blockNum >= firstDelBlock) InvalidateBuffer(bufHdr); /* releases 
spinlock */ else @@ -3545,8 +3547,7 @@ FlushRelationBuffers(Relation rel) PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); - smgrwrite(RelationGetSmgr(rel), - BufTagGetForkNum(&bufHdr->tag), + smgrwrite(RelationGetSmgr(rel, BufTagGetForkNum(&bufHdr->tag)), bufHdr->tag.blockNum, localpage, false); @@ -3586,7 +3587,7 @@ FlushRelationBuffers(Relation rel) { PinBuffer_Locked(bufHdr); LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); - FlushBuffer(bufHdr, RelationGetSmgr(rel)); + FlushBuffer(bufHdr, RelationGetSmgr(rel, bufHdr->tag.forkNum)); LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); UnpinBuffer(bufHdr); } @@ -3605,25 +3606,20 @@ FlushRelationBuffers(Relation rel) * -------------------------------------------------------------------- */ void -FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels) +FlushRelationsAllBuffers(RelFileLocator *rels, int nrels) { int i; - SMgrSortArray *srels; + RelFileLocator *locators = NULL; bool use_bsearch; if (nrels == 0) return; /* fill-in array for qsort */ - srels = palloc(sizeof(SMgrSortArray) * nrels); + locators = palloc(sizeof(RelFileLocator) * nrels); for (i = 0; i < nrels; i++) - { - Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator)); - - srels[i].rlocator = smgrs[i]->smgr_rlocator.locator; - srels[i].srel = smgrs[i]; - } + locators[i] = rels[i]; /* * Save the bsearch overhead for low number of relations to sync. 
See @@ -3631,16 +3627,16 @@ FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels) */ use_bsearch = nrels > RELS_BSEARCH_THRESHOLD; - /* sort the list of SMgrRelations if necessary */ + /* sort the list of locators if necessary */ if (use_bsearch) - pg_qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator); + pg_qsort(locators, nrels, sizeof(RelFileLocator), rlocator_comparator); /* Make sure we can handle the pin inside the loop */ ResourceOwnerEnlargeBuffers(CurrentResourceOwner); for (i = 0; i < NBuffers; i++) { - SMgrSortArray *srelent = NULL; + RelFileLocator *found = NULL; BufferDesc *bufHdr = GetBufferDescriptor(i); uint32 buf_state; @@ -3655,9 +3651,9 @@ FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels) for (j = 0; j < nrels; j++) { - if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator)) + if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j])) { - srelent = &srels[j]; + found = &locators[j]; break; } } @@ -3667,24 +3663,26 @@ FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels) RelFileLocator rlocator; rlocator = BufTagGetRelFileLocator(&bufHdr->tag); - srelent = bsearch((const void *) &(rlocator), - srels, nrels, sizeof(SMgrSortArray), + found = bsearch((const void *) &(rlocator), + locators, nrels, sizeof(RelFileLocator), rlocator_comparator); } /* buffer doesn't belong to any of the given relfilelocators; skip it */ - if (srelent == NULL) + if (found == NULL) continue; + /* FIXME: cache SMgrFileHandles for the rels, and pass to FlushBuffer */ + ReservePrivateRefCountEntry(); buf_state = LockBufHdr(bufHdr); - if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) && + if (BufTagMatchesRelFileLocator(&bufHdr->tag, found) && (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) { PinBuffer_Locked(bufHdr); LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); - FlushBuffer(bufHdr, srelent->srel); + FlushBuffer(bufHdr, NULL); LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); 
UnpinBuffer(bufHdr); } @@ -3692,7 +3690,7 @@ FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels) UnlockBufHdr(bufHdr, buf_state); } - pfree(srels); + pfree(locators); } /* --------------------------------------------------------------------- @@ -3708,7 +3706,7 @@ FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels) static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, - ForkNumber forkNum, bool permanent) + ForkNumber forknum, bool permanent) { Buffer srcBuf; Buffer dstBuf; @@ -3726,11 +3724,11 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, * can skip it when copying any fork of an unlogged relation other than * the init fork. */ - use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM); + use_wal = XLogIsNeeded() && (permanent || forknum == INIT_FORKNUM); /* Get number of blocks in the source relation. */ - nblocks = smgrnblocks(smgropen(srclocator, InvalidBackendId), - forkNum); + nblocks = smgrnblocks(smgropen(srclocator, InvalidBackendId, forknum)); + /* Nothing to copy; just return. */ if (nblocks == 0) @@ -3741,7 +3739,7 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, * relation before starting to copy block by block. */ memset(buf.data, 0, BLCKSZ); - smgrextend(smgropen(dstlocator, InvalidBackendId), forkNum, nblocks - 1, + smgrextend(smgropen(dstlocator, InvalidBackendId, forknum), nblocks - 1, buf.data, true); /* This is a bulk operation, so use buffer access strategies. */ @@ -3754,14 +3752,14 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, CHECK_FOR_INTERRUPTS(); /* Read block from source relation. */ - srcBuf = ReadBufferWithoutRelcache(srclocator, forkNum, blkno, + srcBuf = ReadBufferWithoutRelcache(srclocator, forknum, blkno, RBM_NORMAL, bstrategy_src, permanent); LockBuffer(srcBuf, BUFFER_LOCK_SHARE); srcPage = BufferGetPage(srcBuf); /* Use P_NEW to extend the destination relation. 
*/ - dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, blkno, + dstBuf = ReadBufferWithoutRelcache(dstlocator, forknum, blkno, RBM_NORMAL, bstrategy_dst, permanent); LockBuffer(dstBuf, BUFFER_LOCK_EXCLUSIVE); @@ -3799,7 +3797,6 @@ void CreateAndCopyRelationData(RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent) { - RelFileLocatorBackend rlocator; char relpersistence; /* Set the relpersistence. */ @@ -3819,34 +3816,30 @@ CreateAndCopyRelationData(RelFileLocator src_rlocator, permanent); /* copy those extra forks that exist */ - for (ForkNumber forkNum = MAIN_FORKNUM + 1; - forkNum <= MAX_FORKNUM; forkNum++) + for (ForkNumber forknum = MAIN_FORKNUM + 1; + forknum <= MAX_FORKNUM; forknum++) { - if (smgrexists(smgropen(src_rlocator, InvalidBackendId), forkNum)) + if (smgrexists(smgropen(src_rlocator, InvalidBackendId, forknum))) { - smgrcreate(smgropen(dst_rlocator, InvalidBackendId), forkNum, false); + smgrcreate(smgropen(dst_rlocator, InvalidBackendId, forknum), false); /* * WAL log creation if the relation is persistent, or this is the * init fork of an unlogged relation. */ - if (permanent || forkNum == INIT_FORKNUM) - log_smgrcreate(&dst_rlocator, forkNum); + if (permanent || forknum == INIT_FORKNUM) + log_smgrcreate(&dst_rlocator, forknum); /* Copy a fork's data, block by block. */ - RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum, + RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forknum, permanent); } } - /* close source and destination smgr if exists. 
*/ - rlocator.backend = InvalidBackendId; - rlocator.locator = src_rlocator; - smgrcloserellocator(rlocator); + smgrcloserellocator(src_rlocator, InvalidBackendId); - rlocator.locator = dst_rlocator; - smgrcloserellocator(rlocator); + smgrcloserellocator(dst_rlocator, InvalidBackendId); } /* --------------------------------------------------------------------- @@ -4967,7 +4960,7 @@ IssuePendingWritebacks(WritebackContext *context) { PendingWriteback *cur; PendingWriteback *next; - SMgrRelation reln; + SMgrFileHandle sfile; int ahead; BufferTag tag; RelFileLocator currlocator; @@ -5007,8 +5000,8 @@ IssuePendingWritebacks(WritebackContext *context) i += ahead; /* and finally tell the kernel to write the data to storage */ - reln = smgropen(currlocator, InvalidBackendId); - smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks); + sfile = smgropen(currlocator, InvalidBackendId, BufTagGetForkNum(&tag)); + smgrwriteback(sfile, tag.blockNum, nblocks); } context->nr_pending = 0; @@ -5030,3 +5023,29 @@ TestForOldSnapshot_impl(Snapshot snapshot, Relation relation) (errcode(ERRCODE_SNAPSHOT_TOO_OLD), errmsg("snapshot too old"))); } + +/* + * Check whether a buffer tag is currently present in the buffer mapping table. + * + * XXX Dubious semantics: no pin is taken, so the answer may be stale on return. + * Needed only for multixact's handling of inconsistent states.
+ */ +bool +BufferProbe(RelFileLocator rlocator, ForkNumber forknum, BlockNumber blockNum) +{ + BufferTag tag; + uint32 hash; + LWLock *partitionLock; + int buf_id; + + InitBufferTag(&tag, &rlocator, forknum, blockNum); + + hash = BufTableHashCode(&tag); + partitionLock = BufMappingPartitionLock(hash); + + LWLockAcquire(partitionLock, LW_SHARED); + buf_id = BufTableLookup(&tag, hash); + LWLockRelease(partitionLock); + + return buf_id >= 0; +} diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 30d67d1c40d..a5629532d72 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -61,14 +61,13 @@ static Block GetLocalBufferStorage(void); * No-op if prefetching isn't compiled in. */ PrefetchBufferResult -PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, - BlockNumber blockNum) +PrefetchLocalBuffer(SMgrFileHandle sfile, BlockNumber blockNum) { PrefetchBufferResult result = {InvalidBuffer, false}; BufferTag newTag; /* identity of requested block */ LocalBufferLookupEnt *hresult; - InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum); + InitBufferTag(&newTag, &sfile->smgr_locator.locator, sfile->smgr_locator.forknum, blockNum); /* Initialize local buffers if first request in this session */ if (LocalBufHash == NULL) @@ -87,7 +86,7 @@ PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, { #ifdef USE_PREFETCH /* Not in buffers, so initiate prefetch */ - smgrprefetch(smgr, forkNum, blockNum); + smgrprefetch(sfile, blockNum); result.initiated_io = true; #endif /* USE_PREFETCH */ } @@ -106,8 +105,7 @@ PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, * (hence, usage_count is always advanced). 
*/ BufferDesc * -LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, - bool *foundPtr) +LocalBufferAlloc(SMgrFileHandle sfile, BlockNumber blockNum, bool *foundPtr) { BufferTag newTag; /* identity of requested block */ LocalBufferLookupEnt *hresult; @@ -117,7 +115,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool found; uint32 buf_state; - InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum); + InitBufferTag(&newTag, &sfile->smgr_locator.locator, sfile->smgr_locator.forknum, blockNum); /* Initialize local buffers if first request in this session */ if (LocalBufHash == NULL) @@ -134,7 +132,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, Assert(BufferTagsEqual(&bufHdr->tag, &newTag)); #ifdef LBDEBUG fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n", - smgr->smgr_rlocator.locator.relNumber, forkNum, blockNum, -b - 1); + sfile->smgr_locator.locator.relNumber, sfile->smgr_locator.forknum, blockNum, -b - 1); #endif buf_state = pg_atomic_read_u32(&bufHdr->state); @@ -162,7 +160,7 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, #ifdef LBDEBUG fprintf(stderr, "LB ALLOC (%u,%d,%d) %d\n", - smgr->smgr_rlocator.locator.relNumber, forkNum, blockNum, + sfile->smgr_locator.locator.relNumber, sfile->smgr_locator.forknum, blockNum, -nextFreeLocalBuf - 1); #endif @@ -211,17 +209,16 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, */ if (buf_state & BM_DIRTY) { - SMgrRelation oreln; + SMgrFileHandle ofile; Page localpage = (char *) LocalBufHdrGetBlock(bufHdr); - /* Find smgr relation for buffer */ - oreln = smgropen(BufTagGetRelFileLocator(&bufHdr->tag), MyBackendId); + /* Find smgr file handle for buffer */ + ofile = smgropen(BufTagGetRelFileLocator(&bufHdr->tag), MyBackendId, BufTagGetForkNum(&bufHdr->tag)); PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); /* And write... 
*/ - smgrwrite(oreln, - BufTagGetForkNum(&bufHdr->tag), + smgrwrite(ofile, bufHdr->tag.blockNum, localpage, false); diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index a6b05331032..8636f6bec43 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -271,7 +271,7 @@ FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks) * If no FSM has been created yet for this relation, there's nothing to * truncate. */ - if (!smgrexists(RelationGetSmgr(rel), FSM_FORKNUM)) + if (!smgrexists(RelationGetSmgr(rel, FSM_FORKNUM))) return InvalidBlockNumber; /* Get the location in the FSM of the first removed heap block */ @@ -317,7 +317,7 @@ FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks) else { new_nfsmblocks = fsm_logical_to_physical(first_removed_address); - if (smgrnblocks(RelationGetSmgr(rel), FSM_FORKNUM) <= new_nfsmblocks) + if (smgrnblocks(RelationGetSmgr(rel, FSM_FORKNUM)) <= new_nfsmblocks) return InvalidBlockNumber; /* nothing to do; the FSM was already * smaller */ } @@ -532,14 +532,14 @@ fsm_readbuf(Relation rel, FSMAddress addr, bool extend) { BlockNumber blkno = fsm_logical_to_physical(addr); Buffer buf; - SMgrRelation reln; + SMgrFileHandle fsm_file; /* * Caution: re-using this smgr pointer could fail if the relcache entry * gets closed. It's safe as long as we only do smgr-level operations * between here and the last use of the pointer. */ - reln = RelationGetSmgr(rel); + fsm_file = RelationGetSmgr(rel, FSM_FORKNUM); /* * If we haven't cached the size of the FSM yet, check it first. Also @@ -547,19 +547,19 @@ fsm_readbuf(Relation rel, FSMAddress addr, bool extend) * value might be stale. (We send smgr inval messages on truncation, but * not on extension.) 
*/ - if (reln->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber || - blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM]) + if (fsm_file->smgr_cached_nblocks == InvalidBlockNumber || + blkno >= fsm_file->smgr_cached_nblocks) { /* Invalidate the cache so smgrnblocks asks the kernel. */ - reln->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber; - if (smgrexists(reln, FSM_FORKNUM)) - smgrnblocks(reln, FSM_FORKNUM); + fsm_file->smgr_cached_nblocks = InvalidBlockNumber; + if (smgrexists(fsm_file)) + smgrnblocks(fsm_file); else - reln->smgr_cached_nblocks[FSM_FORKNUM] = 0; + fsm_file->smgr_cached_nblocks = 0; } /* Handle requests beyond EOF */ - if (blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM]) + if (blkno >= fsm_file->smgr_cached_nblocks) { if (extend) fsm_extend(rel, blkno + 1); @@ -609,7 +609,7 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks) { BlockNumber fsm_nblocks_now; PGAlignedBlock pg; - SMgrRelation reln; + SMgrFileHandle fsm_file; PageInit((Page) pg.data, BLCKSZ, 0); @@ -630,29 +630,28 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks) * gets closed. It's safe as long as we only do smgr-level operations * between here and the last use of the pointer. */ - reln = RelationGetSmgr(rel); + fsm_file = RelationGetSmgr(rel, FSM_FORKNUM); /* * Create the FSM file first if it doesn't exist. If * smgr_cached_nblocks[FSM_FORKNUM] is positive then it must exist, no * need for an smgrexists call. */ - if ((reln->smgr_cached_nblocks[FSM_FORKNUM] == 0 || - reln->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber) && - !smgrexists(reln, FSM_FORKNUM)) - smgrcreate(reln, FSM_FORKNUM, false); + if ((fsm_file->smgr_cached_nblocks == 0 || + fsm_file->smgr_cached_nblocks == InvalidBlockNumber) && + !smgrexists(fsm_file)) + smgrcreate(fsm_file, false); /* Invalidate cache so that smgrnblocks() asks the kernel. 
*/ - reln->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber; - fsm_nblocks_now = smgrnblocks(reln, FSM_FORKNUM); + fsm_file->smgr_cached_nblocks = InvalidBlockNumber; + fsm_nblocks_now = smgrnblocks(fsm_file); /* Extend as needed. */ while (fsm_nblocks_now < fsm_nblocks) { PageSetChecksumInplace((Page) pg.data, fsm_nblocks_now); - smgrextend(reln, FSM_FORKNUM, fsm_nblocks_now, - pg.data, false); + smgrextend(fsm_file, fsm_nblocks_now, pg.data, false); fsm_nblocks_now++; } diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index b204ecdbc32..240f0e1a3ff 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -117,9 +117,7 @@ CalculateShmemSize(int *num_semaphores) size = add_size(size, XLogPrefetchShmemSize()); size = add_size(size, XLOGShmemSize()); size = add_size(size, XLogRecoveryShmemSize()); - size = add_size(size, CLOGShmemSize()); size = add_size(size, CommitTsShmemSize()); - size = add_size(size, SUBTRANSShmemSize()); size = add_size(size, TwoPhaseShmemSize()); size = add_size(size, BackgroundWorkerShmemSize()); size = add_size(size, MultiXactShmemSize()); @@ -241,9 +239,7 @@ CreateSharedMemoryAndSemaphores(void) XLOGShmemInit(); XLogPrefetchShmemInit(); XLogRecoveryShmemInit(); - CLOGShmemInit(); CommitTsShmemInit(); - SUBTRANSShmemInit(); MultiXactShmemInit(); InitBufferPool(); diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index df1c0d72e97..9c1674ebb0d 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -313,17 +313,9 @@ ((targethash) ^ ((uint32) PointerGetDatum((predicatelocktag)->myXact)) \ << LOG2_NUM_PREDICATELOCK_PARTITIONS) - -/* - * The SLRU buffer area through which we access the old xids. 
- */ -static SlruCtlData SerialSlruCtlData; - -#define SerialSlruCtl (&SerialSlruCtlData) - #define SERIAL_PAGESIZE BLCKSZ #define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo) -#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE) +#define SERIAL_ENTRIESPERPAGE ((SERIAL_PAGESIZE - SizeOfPageHeaderData) / SERIAL_ENTRYSIZE) /* * Set maximum pages based on the number needed to track all transactions. @@ -332,12 +324,13 @@ static SlruCtlData SerialSlruCtlData; #define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1) -#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \ - (SerialSlruCtl->shared->page_buffer[slotno] + \ +#define SerialValue(buffer, xid) (*((SerCommitSeqNo *) \ + (PageGetContents(BufferGetPage(buffer)) + \ ((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE)))) #define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE) + typedef struct SerialControlData { int headPage; /* newest initialized page */ @@ -849,10 +842,14 @@ SerialPagePrecedesLogicallyUnitTests(void) * requires burning ~2B XIDs in single-user mode, a negligible * possibility. Moreover, if it does happen, the consequence would be * mild, namely a new transaction failing in SimpleLruReadPage(). + * + * NOTE: With page headers added (fewer entries per page), the defect affects + * two pages. Verify that by asserting correct treatment of targetPage - 2. + * + */ headPage = oldestPage; targetPage = newestPage; - Assert(SerialPagePrecedesLogically(headPage, targetPage - 1)); + Assert(SerialPagePrecedesLogically(headPage, targetPage - 2)); #if 0 Assert(SerialPagePrecedesLogically(headPage, targetPage)); #endif @@ -867,17 +864,10 @@ SerialInit(void) { bool found; - /* - * Set up SLRU management of the pg_serial data.
- */ - SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically; - SimpleLruInit(SerialSlruCtl, "Serial", - NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial", - LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE); #ifdef USE_ASSERT_CHECKING SerialPagePrecedesLogicallyUnitTests(); #endif - SlruPagePrecedesUnitTests(SerialSlruCtl, SERIAL_ENTRIESPERPAGE); + SlruPagePrecedesUnitTests(SerialPagePrecedesLogically, SERIAL_ENTRIESPERPAGE); /* * Create or attach to the SerialControl structure. @@ -907,9 +897,9 @@ SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo) { TransactionId tailXid; int targetPage; - int slotno; int firstZeroPage; bool isNewPage; + Buffer buffer; Assert(TransactionIdIsValid(xid)); @@ -954,16 +944,23 @@ SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo) /* Initialize intervening pages. */ while (firstZeroPage != targetPage) { - (void) SimpleLruZeroPage(SerialSlruCtl, firstZeroPage); + buffer = ZeroSlruBuffer(SLRU_SERIAL_ID, firstZeroPage); + PageSetHeaderDataNonRel(BufferGetPage(buffer), firstZeroPage, InvalidXLogRecPtr, BLCKSZ, PG_METAPAGE_LAYOUT_VERSION); + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); firstZeroPage = SerialNextPage(firstZeroPage); } - slotno = SimpleLruZeroPage(SerialSlruCtl, targetPage); + buffer = ZeroSlruBuffer(SLRU_SERIAL_ID, targetPage); } else - slotno = SimpleLruReadPage(SerialSlruCtl, targetPage, true, xid); + { + buffer = ReadSlruBuffer(SLRU_SERIAL_ID, targetPage, RBM_NORMAL); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + } - SerialValue(slotno, xid) = minConflictCommitSeqNo; - SerialSlruCtl->shared->page_dirty[slotno] = true; + SerialValue(buffer, xid) = minConflictCommitSeqNo; + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); LWLockRelease(SerialSLRULock); } @@ -979,7 +976,7 @@ SerialGetMinConflictCommitSeqNo(TransactionId xid) TransactionId headXid; TransactionId tailXid; SerCommitSeqNo val; - int slotno; + Buffer buffer; Assert(TransactionIdIsValid(xid)); @@ -1001,9 +998,9 
@@ SerialGetMinConflictCommitSeqNo(TransactionId xid) * The following function must be called without holding SerialSLRULock, * but will return with that lock held, which must then be released. */ - slotno = SimpleLruReadPage_ReadOnly(SerialSlruCtl, - SerialPage(xid), xid); - val = SerialValue(slotno, xid); + buffer = ReadSlruBuffer(SLRU_SERIAL_ID, SerialPage(xid), RBM_NORMAL); + val = SerialValue(buffer, xid); + ReleaseBuffer(buffer); LWLockRelease(SerialSLRULock); return val; } @@ -1122,19 +1119,7 @@ CheckPointPredicate(void) LWLockRelease(SerialSLRULock); /* Truncate away pages that are no longer required */ - SimpleLruTruncate(SerialSlruCtl, tailPage); - - /* - * Write dirty SLRU pages to disk - * - * This is not actually necessary from a correctness point of view. We do - * it merely as a debugging aid. - * - * We're doing this after the truncation to avoid writing pages right - * before deleting the file in which they sit, which would be completely - * pointless. - */ - SimpleLruWriteAll(SerialSlruCtl, true); + SimpleLruTruncate(SLRU_SERIAL_ID, SerialPagePrecedesLogically, tailPage); } /*------------------------------------------------------------------------*/ @@ -1396,7 +1381,6 @@ PredicateLockShmemSize(void) /* Shared memory structures for SLRU tracking of old committed xids. 
*/ size = add_size(size, sizeof(SerialControlData)); - size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS, 0)); return size; } diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 8b617c7e79d..857cce9a28c 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -59,6 +59,27 @@ PageInit(Page page, Size pageSize, Size specialSize) /* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */ } +void +PageInitSLRU(Page page, Size pageSize, Size specialSize) +{ + PageHeader p = (PageHeader) page; + + specialSize = MAXALIGN(specialSize); + + Assert(pageSize == BLCKSZ); + Assert(pageSize > specialSize + SizeOfPageHeaderData); + + /* Make sure all fields of page are zero, as well as unused space */ + MemSet(p, 0, pageSize); + + p->pd_flags = 0; + p->pd_lower = SizeOfPageHeaderData; + p->pd_upper = pageSize - specialSize; + p->pd_special = pageSize - specialSize; + PageSetPageSizeAndVersion(page, pageSize, PG_METAPAGE_LAYOUT_VERSION); + /* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */ +} + /* * PageIsVerifiedExtended @@ -103,7 +124,7 @@ PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags) if (DataChecksumsEnabled()) { checksum = pg_checksum_page((char *) page, blkno); - + if (checksum != p->pd_checksum) checksum_failure = true; } diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 14b6fa0fd90..b9a41cb9427 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -66,10 +66,10 @@ * out to an unlinked old copy of a segment file that will eventually * disappear. * - * File descriptors are stored in the per-fork md_seg_fds arrays inside - * SMgrRelation. The length of these arrays is stored in md_num_open_segs. 
- * Note that a fork's md_num_open_segs having a specific value does not - * necessarily mean the relation doesn't have additional segments; we may + * File descriptors are stored in the md_seg_fds array inside + * SMgrFileData. The length of the array is stored in md_num_open_segs. + * Note that md_num_open_segs having a specific value does not + * necessarily mean the file doesn't have additional segments; we may * just not have opened the next segment yet. (We could not have "all * segments are in the array" as an invariant anyway, since another backend * could extend the relation while we aren't looking.) We do not have @@ -121,26 +121,18 @@ static MemoryContext MdCxt; /* context for all MdfdVec objects */ /* local routines */ -static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, - bool isRedo); -static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior); -static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, - MdfdVec *seg); -static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, - BlockNumber segno); -static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum, - BlockNumber segno); -static void _fdvec_resize(SMgrRelation reln, - ForkNumber forknum, - int nseg); -static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, - BlockNumber segno); -static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, +static void mdunlinkfork(SMgrFileLocator slocator, bool isRedo); +static MdfdVec *mdopenfork(SMgrFileHandle sfile, int behavior); +static void register_dirty_segment(SMgrFileHandle sfile, MdfdVec *seg); +static void register_unlink_segment(SMgrFileLocator slocator, BlockNumber segno); +static void register_forget_request(SMgrFileLocator slocator, BlockNumber segno); +static void _fdvec_resize(SMgrFileHandle sfile, int nseg); +static char *_mdfd_segpath(SMgrFileHandle sfile, BlockNumber segno); +static MdfdVec 
*_mdfd_openseg(SMgrFileHandle sfile, BlockNumber segno, int oflags); -static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, +static MdfdVec *_mdfd_getseg(SMgrFileHandle sfile, BlockNumber blkno, bool skipFsync, int behavior); -static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, - MdfdVec *seg); +static BlockNumber _mdnblocks(SMgrFileHandle sfile, MdfdVec *seg); /* @@ -160,7 +152,7 @@ mdinit(void) * Note: this will return true for lingering files, with pending deletions */ bool -mdexists(SMgrRelation reln, ForkNumber forknum) +mdexists(SMgrFileHandle sfile) { /* * Close it first, to ensure that we notice if the fork has been unlinked @@ -168,9 +160,9 @@ mdexists(SMgrRelation reln, ForkNumber forknum) * which already closes relations when dropping them. */ if (!InRecovery) - mdclose(reln, forknum); + mdclose(sfile); - return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL); + return (mdopenfork(sfile, EXTENSION_RETURN_NULL) != NULL); } /* @@ -179,16 +171,16 @@ mdexists(SMgrRelation reln, ForkNumber forknum) * If isRedo is true, it's okay for the relation to exist already. */ void -mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) +mdcreate(SMgrFileHandle sfile, bool isRedo) { MdfdVec *mdfd; char *path; File fd; - if (isRedo && reln->md_num_open_segs[forknum] > 0) + if (isRedo && sfile->md_num_open_segs > 0) return; /* created and opened already... */ - Assert(reln->md_num_open_segs[forknum] == 0); + Assert(sfile->md_num_open_segs == 0); /* * We may be using the target table space for the first time in this @@ -199,11 +191,14 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) * should be here and not in commands/tablespace.c? But that would imply * importing a lot of stuff that smgr.c oughtn't know, either. 
*/ - TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid, - reln->smgr_rlocator.locator.dbOid, - isRedo); + if (sfile->smgr_locator.locator.spcOid != SLRU_SPC_OID) + { + TablespaceCreateDbspace(sfile->smgr_locator.locator.spcOid, + sfile->smgr_locator.locator.dbOid, + isRedo); + } - path = relpath(reln->smgr_rlocator, forknum); + path = smgrfilepath(sfile->smgr_locator); fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); @@ -225,8 +220,8 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) pfree(path); - _fdvec_resize(reln, forknum, 1); - mdfd = &reln->md_seg_fds[forknum][0]; + _fdvec_resize(sfile, 1); + mdfd = &sfile->md_seg_fds[0]; mdfd->mdfd_vfd = fd; mdfd->mdfd_segno = 0; } @@ -293,16 +288,9 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) * we are usually not in a transaction anymore when this is called. */ void -mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo) +mdunlink(SMgrFileLocator slocator, bool isRedo) { - /* Now do the per-fork work */ - if (forknum == InvalidForkNumber) - { - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - mdunlinkfork(rlocator, forknum, isRedo); - } - else - mdunlinkfork(rlocator, forknum, isRedo); + mdunlinkfork(slocator, isRedo); } /* @@ -330,29 +318,29 @@ do_truncate(const char *path) } static void -mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo) +mdunlinkfork(SMgrFileLocator slocator, bool isRedo) { char *path; int ret; int save_errno; - path = relpath(rlocator, forknum); + path = smgrfilepath(slocator); /* * Truncate and then unlink the first segment, or just register a request * to unlink it later, as described in the comments for mdunlink(). 
*/ - if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM || - RelFileLocatorBackendIsTemp(rlocator)) + if (isRedo || IsBinaryUpgrade || slocator.forknum != MAIN_FORKNUM || + SMgrFileLocatorIsTemp(slocator)) { - if (!RelFileLocatorBackendIsTemp(rlocator)) + if (!SMgrFileLocatorIsTemp(slocator)) { /* Prevent other backends' fds from holding on to the disk space */ ret = do_truncate(path); /* Forget any pending sync requests for the first segment */ save_errno = errno; - register_forget_request(rlocator, forknum, 0 /* first seg */ ); + register_forget_request(slocator, 0 /* first seg */ ); errno = save_errno; } else @@ -379,7 +367,7 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo) /* Register request to unlink first segment later */ save_errno = errno; - register_unlink_segment(rlocator, forknum, 0 /* first seg */ ); + register_unlink_segment(slocator, 0 /* first seg */ ); errno = save_errno; } @@ -404,7 +392,7 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo) { sprintf(segpath, "%s.%u", path, segno); - if (!RelFileLocatorBackendIsTemp(rlocator)) + if (!SMgrFileLocatorIsTemp(slocator)) { /* * Prevent other backends' fds from holding on to the disk @@ -417,7 +405,7 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo) * Forget any pending sync requests for this segment before we * try to unlink. */ - register_forget_request(rlocator, forknum, segno); + register_forget_request(slocator, segno); } if (unlink(segpath) < 0) @@ -446,7 +434,7 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo) * causes intervening file space to become filled with zeroes. 
*/ void -mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, +mdextend(SMgrFileHandle sfile, BlockNumber blocknum, char *buffer, bool skipFsync) { off_t seekpos; @@ -455,7 +443,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, /* This assert is too expensive to have on normally ... */ #ifdef CHECK_WRITE_VS_EXTEND - Assert(blocknum >= mdnblocks(reln, forknum)); + Assert(blocknum >= mdnblocks(sfile)); #endif /* @@ -468,10 +456,10 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("cannot extend file \"%s\" beyond %u blocks", - relpath(reln->smgr_rlocator, forknum), + smgrfilepath(sfile->smgr_locator), InvalidBlockNumber))); - v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); + v = _mdfd_getseg(sfile, blocknum, skipFsync, EXTENSION_CREATE); seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); @@ -494,10 +482,10 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, errhint("Check free disk space."))); } - if (!skipFsync && !SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); + if (!skipFsync && !SmgrIsTemp(sfile)) + register_dirty_segment(sfile, v); - Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + Assert(_mdnblocks(sfile, v) <= ((BlockNumber) RELSEG_SIZE)); } /* @@ -511,17 +499,17 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * invent one out of whole cloth. 
*/ static MdfdVec * -mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) +mdopenfork(SMgrFileHandle sfile, int behavior) { MdfdVec *mdfd; char *path; File fd; /* No work if already open */ - if (reln->md_num_open_segs[forknum] > 0) - return &reln->md_seg_fds[forknum][0]; + if (sfile->md_num_open_segs > 0) + return &sfile->md_seg_fds[0]; - path = relpath(reln->smgr_rlocator, forknum); + path = smgrfilepath(sfile->smgr_locator); fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); @@ -540,12 +528,12 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) pfree(path); - _fdvec_resize(reln, forknum, 1); - mdfd = &reln->md_seg_fds[forknum][0]; + _fdvec_resize(sfile, 1); + mdfd = &sfile->md_seg_fds[0]; mdfd->mdfd_vfd = fd; mdfd->mdfd_segno = 0; - Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE)); + Assert(_mdnblocks(sfile, mdfd) <= ((BlockNumber) RELSEG_SIZE)); return mdfd; } @@ -554,20 +542,19 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) * mdopen() -- Initialize newly-opened relation. */ void -mdopen(SMgrRelation reln) +mdopen(SMgrFileHandle sfile) { /* mark it not open */ - for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) - reln->md_num_open_segs[forknum] = 0; + sfile->md_num_open_segs = 0; } /* * mdclose() -- Close the specified relation, if it isn't closed already. 
*/ void -mdclose(SMgrRelation reln, ForkNumber forknum) +mdclose(SMgrFileHandle sfile) { - int nopensegs = reln->md_num_open_segs[forknum]; + int nopensegs = sfile->md_num_open_segs; /* No work if already closed */ if (nopensegs == 0) @@ -576,10 +563,10 @@ mdclose(SMgrRelation reln, ForkNumber forknum) /* close segments starting from the end */ while (nopensegs > 0) { - MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1]; + MdfdVec *v = &sfile->md_seg_fds[nopensegs - 1]; FileClose(v->mdfd_vfd); - _fdvec_resize(reln, forknum, nopensegs - 1); + _fdvec_resize(sfile, nopensegs - 1); nopensegs--; } } @@ -588,13 +575,13 @@ mdclose(SMgrRelation reln, ForkNumber forknum) * mdprefetch() -- Initiate asynchronous read of the specified block of a relation */ bool -mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +mdprefetch(SMgrFileHandle sfile, BlockNumber blocknum) { #ifdef USE_PREFETCH off_t seekpos; MdfdVec *v; - v = _mdfd_getseg(reln, forknum, blocknum, false, + v = _mdfd_getseg(sfile, blocknum, false, InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL); if (v == NULL) return false; @@ -616,8 +603,7 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) * considerably more efficient than doing so individually. */ void -mdwriteback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks) +mdwriteback(SMgrFileHandle sfile, BlockNumber blocknum, BlockNumber nblocks) { /* * Issue flush requests in as few requests as possible; have to split at @@ -631,7 +617,7 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, int segnum_start, segnum_end; - v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ , + v = _mdfd_getseg(sfile, blocknum, true /* not used */ , EXTENSION_DONT_OPEN); /* @@ -668,20 +654,21 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum, * mdread() -- Read the specified block from a relation. 
*/ void -mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, +mdread(SMgrFileHandle sfile, BlockNumber blocknum, char *buffer) { off_t seekpos; int nbytes; MdfdVec *v; - TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, - reln->smgr_rlocator.locator.spcOid, - reln->smgr_rlocator.locator.dbOid, - reln->smgr_rlocator.locator.relNumber, - reln->smgr_rlocator.backend); + TRACE_POSTGRESQL_SMGR_MD_READ_START(sfile->smgr_forknum, + blocknum, + sfile->smgr_rlocator.locator.spcOid, + sfile->smgr_rlocator.locator.dbOid, + sfile->smgr_rlocator.locator.relNumber, + sfile->smgr_rlocator.backend); - v = _mdfd_getseg(reln, forknum, blocknum, false, + v = _mdfd_getseg(sfile, blocknum, false, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); @@ -690,11 +677,12 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ); - TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, - reln->smgr_rlocator.locator.spcOid, - reln->smgr_rlocator.locator.dbOid, - reln->smgr_rlocator.locator.relNumber, - reln->smgr_rlocator.backend, + TRACE_POSTGRESQL_SMGR_MD_READ_DONE(sfile->smgr_forknum, + blocknum, + sfile->smgr_rlocator.locator.spcOid, + sfile->smgr_rlocator.locator.dbOid, + sfile->smgr_rlocator.locator.relNumber, + sfile->smgr_rlocator.backend, nbytes, BLCKSZ); @@ -733,7 +721,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * use mdextend(). */ void -mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, +mdwrite(SMgrFileHandle sfile, BlockNumber blocknum, char *buffer, bool skipFsync) { off_t seekpos; @@ -742,16 +730,17 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, /* This assert is too expensive to have on normally ... 
*/ #ifdef CHECK_WRITE_VS_EXTEND - Assert(blocknum < mdnblocks(reln, forknum)); + Assert(blocknum < mdnblocks(sfile)); #endif - TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum, - reln->smgr_rlocator.locator.spcOid, - reln->smgr_rlocator.locator.dbOid, - reln->smgr_rlocator.locator.relNumber, - reln->smgr_rlocator.backend); + TRACE_POSTGRESQL_SMGR_MD_WRITE_START(sfile->smgr_forknum, + blocknum, + sfile->smgr_rlocator.locator.spcOid, + sfile->smgr_rlocator.locator.dbOid, + sfile->smgr_rlocator.locator.relNumber, + sfile->smgr_rlocator.backend); - v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, + v = _mdfd_getseg(sfile, blocknum, skipFsync, EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); @@ -760,11 +749,12 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE); - TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, - reln->smgr_rlocator.locator.spcOid, - reln->smgr_rlocator.locator.dbOid, - reln->smgr_rlocator.locator.relNumber, - reln->smgr_rlocator.backend, + TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(sfile->smgr_forknum, + blocknum, + sfile->smgr_rlocator.locator.spcOid, + sfile->smgr_rlocator.locator.dbOid, + sfile->smgr_rlocator.locator.relNumber, + sfile->smgr_rlocator.backend, nbytes, BLCKSZ); @@ -785,8 +775,8 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, errhint("Check free disk space."))); } - if (!skipFsync && !SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); + if (!skipFsync && !SmgrIsTemp(sfile)) + register_dirty_segment(sfile, v); } /* @@ -798,16 +788,16 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * are present in the array. 
*/ BlockNumber -mdnblocks(SMgrRelation reln, ForkNumber forknum) +mdnblocks(SMgrFileHandle sfile) { MdfdVec *v; BlockNumber nblocks; BlockNumber segno; - mdopenfork(reln, forknum, EXTENSION_FAIL); + mdopenfork(sfile, EXTENSION_FAIL); /* mdopen has opened the first segment */ - Assert(reln->md_num_open_segs[forknum] > 0); + Assert(sfile->md_num_open_segs > 0); /* * Start from the last open segments, to avoid redundant seeks. We have @@ -822,12 +812,12 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) * that's OK because the checkpointer never needs to compute relation * size.) */ - segno = reln->md_num_open_segs[forknum] - 1; - v = &reln->md_seg_fds[forknum][segno]; + segno = sfile->md_num_open_segs - 1; + v = &sfile->md_seg_fds[segno]; for (;;) { - nblocks = _mdnblocks(reln, forknum, v); + nblocks = _mdnblocks(sfile, v); if (nblocks > ((BlockNumber) RELSEG_SIZE)) elog(FATAL, "segment too big"); if (nblocks < ((BlockNumber) RELSEG_SIZE)) @@ -845,7 +835,7 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) * undermines _mdfd_getseg's attempts to notice and report an error * upon access to a missing segment. */ - v = _mdfd_openseg(reln, forknum, segno, 0); + v = _mdfd_openseg(sfile, segno, 0); if (v == NULL) return segno * ((BlockNumber) RELSEG_SIZE); } @@ -855,7 +845,7 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) * mdtruncate() -- Truncate relation to specified number of blocks. */ void -mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +mdtruncate(SMgrFileHandle sfile, BlockNumber nblocks) { BlockNumber curnblk; BlockNumber priorblocks; @@ -865,7 +855,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * NOTE: mdnblocks makes sure we have opened all active segments, so that * truncation loop will get them all! */ - curnblk = mdnblocks(reln, forknum); + curnblk = mdnblocks(sfile); if (nblocks > curnblk) { /* Bogus request ... 
but no complaint if InRecovery */ @@ -873,7 +863,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) return; ereport(ERROR, (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now", - relpath(reln->smgr_rlocator, forknum), + smgrfilepath(sfile->smgr_locator), nblocks, curnblk))); } if (nblocks == curnblk) @@ -883,14 +873,14 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * Truncate segments, starting at the last one. Starting at the end makes * managing the memory for the fd array easier, should there be errors. */ - curopensegs = reln->md_num_open_segs[forknum]; + curopensegs = sfile->md_num_open_segs; while (curopensegs > 0) { MdfdVec *v; priorblocks = (curopensegs - 1) * RELSEG_SIZE; - v = &reln->md_seg_fds[forknum][curopensegs - 1]; + v = &sfile->md_seg_fds[curopensegs - 1]; if (priorblocks > nblocks) { @@ -904,14 +894,14 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) errmsg("could not truncate file \"%s\": %m", FilePathName(v->mdfd_vfd)))); - if (!SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); + if (!SmgrIsTemp(sfile)) + register_dirty_segment(sfile, v); /* we never drop the 1st segment */ - Assert(v != &reln->md_seg_fds[forknum][0]); + Assert(v != &sfile->md_seg_fds[0]); FileClose(v->mdfd_vfd); - _fdvec_resize(reln, forknum, curopensegs - 1); + _fdvec_resize(sfile, curopensegs - 1); } else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks) { @@ -930,8 +920,8 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) errmsg("could not truncate file \"%s\" to %u blocks: %m", FilePathName(v->mdfd_vfd), nblocks))); - if (!SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); + if (!SmgrIsTemp(sfile)) + register_dirty_segment(sfile, v); } else { @@ -957,7 +947,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) * segment may survive recovery, reintroducing unwanted data into the table. 
*/ void -mdimmedsync(SMgrRelation reln, ForkNumber forknum) +mdimmedsync(SMgrFileHandle sfile) { int segno; int min_inactive_seg; @@ -966,9 +956,9 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) * NOTE: mdnblocks makes sure we have opened all active segments, so that * fsync loop will get them all! */ - mdnblocks(reln, forknum); + mdnblocks(sfile); - min_inactive_seg = segno = reln->md_num_open_segs[forknum]; + min_inactive_seg = segno = sfile->md_num_open_segs; /* * Temporarily open inactive segments, then close them after sync. There @@ -976,12 +966,12 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) * is harmless. We don't bother to clean them up and take a risk of * further trouble. The next mdclose() will soon close them. */ - while (_mdfd_openseg(reln, forknum, segno, 0) != NULL) + while (_mdfd_openseg(sfile, segno, 0) != NULL) segno++; while (segno > 0) { - MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1]; + MdfdVec *v = &sfile->md_seg_fds[segno - 1]; if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0) ereport(data_sync_elevel(ERROR), @@ -993,7 +983,7 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) if (segno > min_inactive_seg) { FileClose(v->mdfd_vfd); - _fdvec_resize(reln, forknum, segno - 1); + _fdvec_resize(sfile, segno - 1); } segno--; @@ -1010,14 +1000,14 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) * enough to be a performance problem). 
*/ static void -register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) +register_dirty_segment(SMgrFileHandle sfile, MdfdVec *seg) { FileTag tag; - INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno); + INIT_MD_FILETAG(tag, sfile->smgr_locator.locator, sfile->smgr_locator.forknum, seg->mdfd_segno); /* Temp relations should never be fsync'd */ - Assert(!SmgrIsTemp(reln)); + Assert(!SmgrIsTemp(sfile)); if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ )) { @@ -1036,15 +1026,14 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint */ static void -register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, - BlockNumber segno) +register_unlink_segment(SMgrFileLocator slocator, BlockNumber segno) { FileTag tag; - INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno); + INIT_MD_FILETAG(tag, slocator.locator, slocator.forknum, segno); /* Should never be used with temp relations */ - Assert(!RelFileLocatorBackendIsTemp(rlocator)); + Assert(!SMgrFileLocatorIsTemp(slocator)); RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ ); } @@ -1053,12 +1042,11 @@ register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, * register_forget_request() -- forget any fsyncs for a relation fork's segment */ static void -register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum, - BlockNumber segno) +register_forget_request(SMgrFileLocator slocator, BlockNumber segno) { FileTag tag; - INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno); + INIT_MD_FILETAG(tag, slocator.locator, slocator.forknum, segno); RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ ); } @@ -1081,57 +1069,25 @@ ForgetDatabaseSyncRequests(Oid dbid) RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ ); } -/* - * DropRelationFiles -- drop 
files of all given relations - */ -void -DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo) -{ - SMgrRelation *srels; - int i; - - srels = palloc(sizeof(SMgrRelation) * ndelrels); - for (i = 0; i < ndelrels; i++) - { - SMgrRelation srel = smgropen(delrels[i], InvalidBackendId); - - if (isRedo) - { - ForkNumber fork; - - for (fork = 0; fork <= MAX_FORKNUM; fork++) - XLogDropRelation(delrels[i], fork); - } - srels[i] = srel; - } - - smgrdounlinkall(srels, ndelrels, isRedo); - - for (i = 0; i < ndelrels; i++) - smgrclose(srels[i]); - pfree(srels); -} - /* * _fdvec_resize() -- Resize the fork's open segments array */ static void -_fdvec_resize(SMgrRelation reln, - ForkNumber forknum, +_fdvec_resize(SMgrFileHandle sfile, int nseg) { if (nseg == 0) { - if (reln->md_num_open_segs[forknum] > 0) + if (sfile->md_num_open_segs > 0) { - pfree(reln->md_seg_fds[forknum]); - reln->md_seg_fds[forknum] = NULL; + pfree(sfile->md_seg_fds); + sfile->md_seg_fds = NULL; } } - else if (reln->md_num_open_segs[forknum] == 0) + else if (sfile->md_num_open_segs == 0) { - reln->md_seg_fds[forknum] = + sfile->md_seg_fds = MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg); } else @@ -1142,12 +1098,12 @@ _fdvec_resize(SMgrRelation reln, * FileClose(), and the memory context internally will sometimes avoid * doing an actual reallocation. */ - reln->md_seg_fds[forknum] = - repalloc(reln->md_seg_fds[forknum], + sfile->md_seg_fds = + repalloc(sfile->md_seg_fds, sizeof(MdfdVec) * nseg); } - reln->md_num_open_segs[forknum] = nseg; + sfile->md_num_open_segs = nseg; } /* @@ -1155,12 +1111,12 @@ _fdvec_resize(SMgrRelation reln, * returned string is palloc'd. 
*/ static char * -_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno) +_mdfd_segpath(SMgrFileHandle sfile, BlockNumber segno) { char *path, *fullpath; - path = relpath(reln->smgr_rlocator, forknum); + path = smgrfilepath(sfile->smgr_locator); if (segno > 0) { @@ -1178,14 +1134,14 @@ _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno) * and make a MdfdVec object for it. Returns NULL on failure. */ static MdfdVec * -_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, +_mdfd_openseg(SMgrFileHandle sfile, BlockNumber segno, int oflags) { MdfdVec *v; File fd; char *fullpath; - fullpath = _mdfd_segpath(reln, forknum, segno); + fullpath = _mdfd_segpath(sfile, segno); /* open the file */ fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags); @@ -1199,16 +1155,16 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, * Segments are always opened in order from lowest to highest, so we must * be adding a new one at the end. */ - Assert(segno == reln->md_num_open_segs[forknum]); + Assert(segno == sfile->md_num_open_segs); - _fdvec_resize(reln, forknum, segno + 1); + _fdvec_resize(sfile, segno + 1); /* fill the entry */ - v = &reln->md_seg_fds[forknum][segno]; + v = &sfile->md_seg_fds[segno]; v->mdfd_vfd = fd; v->mdfd_segno = segno; - Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + Assert(_mdnblocks(sfile, v) <= ((BlockNumber) RELSEG_SIZE)); /* all done */ return v; @@ -1223,7 +1179,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, * EXTENSION_CREATE case. 
*/ static MdfdVec * -_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, +_mdfd_getseg(SMgrFileHandle sfile, BlockNumber blkno, bool skipFsync, int behavior) { MdfdVec *v; @@ -1238,9 +1194,9 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, targetseg = blkno / ((BlockNumber) RELSEG_SIZE); /* if an existing and opened segment, we're done */ - if (targetseg < reln->md_num_open_segs[forknum]) + if (targetseg < sfile->md_num_open_segs) { - v = &reln->md_seg_fds[forknum][targetseg]; + v = &sfile->md_seg_fds[targetseg]; return v; } @@ -1255,19 +1211,19 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, * 'behavior'). Start with either the last opened, or the first segment if * none was opened before. */ - if (reln->md_num_open_segs[forknum] > 0) - v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1]; + if (sfile->md_num_open_segs > 0) + v = &sfile->md_seg_fds[sfile->md_num_open_segs - 1]; else { - v = mdopenfork(reln, forknum, behavior); + v = mdopenfork(sfile, behavior); if (!v) return NULL; /* if behavior & EXTENSION_RETURN_NULL */ } - for (nextsegno = reln->md_num_open_segs[forknum]; + for (nextsegno = sfile->md_num_open_segs; nextsegno <= targetseg; nextsegno++) { - BlockNumber nblocks = _mdnblocks(reln, forknum, v); + BlockNumber nblocks = _mdnblocks(sfile, v); int flags = 0; Assert(nextsegno == v->mdfd_segno + 1); @@ -1296,7 +1252,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, { char *zerobuf = palloc0(BLCKSZ); - mdextend(reln, forknum, + mdextend(sfile, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, zerobuf, skipFsync); pfree(zerobuf); @@ -1327,11 +1283,11 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks", - _mdfd_segpath(reln, forknum, nextsegno), + _mdfd_segpath(sfile, nextsegno), blkno, nblocks))); } 
- v = _mdfd_openseg(reln, forknum, nextsegno, flags); + v = _mdfd_openseg(sfile, nextsegno, flags); if (v == NULL) { @@ -1341,7 +1297,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\" (target block %u): %m", - _mdfd_segpath(reln, forknum, nextsegno), + _mdfd_segpath(sfile, nextsegno), blkno))); } } @@ -1353,7 +1309,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, * Get number of blocks present in a single disk file */ static BlockNumber -_mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) +_mdnblocks(SMgrFileHandle sfile, MdfdVec *seg) { off_t len; @@ -1376,16 +1332,16 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) int mdsyncfiletag(const FileTag *ftag, char *path) { - SMgrRelation reln = smgropen(ftag->rlocator, InvalidBackendId); + SMgrFileHandle sfile = smgropen(ftag->rlocator, InvalidBackendId, ftag->forknum); File file; bool need_to_close; int result, save_errno; /* See if we already have the file open, or need to open it. 
*/ - if (ftag->segno < reln->md_num_open_segs[ftag->forknum]) + if (ftag->segno < sfile->md_num_open_segs) { - file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd; + file = sfile->md_seg_fds[ftag->segno].mdfd_vfd; strlcpy(path, FilePathName(file), MAXPGPATH); need_to_close = false; } @@ -1393,7 +1349,7 @@ mdsyncfiletag(const FileTag *ftag, char *path) { char *p; - p = _mdfd_segpath(reln, ftag->forknum, ftag->segno); + p = _mdfd_segpath(sfile, ftag->segno); strlcpy(path, p, MAXPGPATH); pfree(p); diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index c1a5febcbfd..de103320574 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -41,27 +41,24 @@ typedef struct f_smgr { void (*smgr_init) (void); /* may be NULL */ void (*smgr_shutdown) (void); /* may be NULL */ - void (*smgr_open) (SMgrRelation reln); - void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, - bool isRedo); - bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum, - bool isRedo); - void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, + void (*smgr_open) (SMgrFileHandle sfile); + void (*smgr_close) (SMgrFileHandle sfile); + void (*smgr_create) (SMgrFileHandle sfile, bool isRedo); + bool (*smgr_exists) (SMgrFileHandle sfile); + void (*smgr_unlink) (SMgrFileLocator slocator, bool isRedo); + void (*smgr_extend) (SMgrFileHandle sfile, BlockNumber blocknum, char *buffer, bool skipFsync); - bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, + bool (*smgr_prefetch) (SMgrFileHandle sfile, BlockNumber blocknum); - void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, + void (*smgr_read) (SMgrFileHandle sfile, BlockNumber blocknum, char *buffer); - void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, + void (*smgr_write) (SMgrFileHandle sfile, BlockNumber blocknum, char 
*buffer, bool skipFsync); - void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, + void (*smgr_writeback) (SMgrFileHandle sfile, BlockNumber blocknum, BlockNumber nblocks); - BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); - void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); + BlockNumber (*smgr_nblocks) (SMgrFileHandle sfile); + void (*smgr_truncate) (SMgrFileHandle sfile, BlockNumber nblocks); + void (*smgr_immedsync) (SMgrFileHandle sfile); } f_smgr; static const f_smgr smgrsw[] = { @@ -88,12 +85,12 @@ static const f_smgr smgrsw[] = { static const int NSmgr = lengthof(smgrsw); /* - * Each backend has a hashtable that stores all extant SMgrRelation objects. - * In addition, "unowned" SMgrRelation objects are chained together in a list. + * Each backend has a hashtable that stores all extant SMgrFileData objects. + * In addition, "unowned" SMgrFile objects are chained together in a list. */ -static HTAB *SMgrRelationHash = NULL; +static HTAB *SMgrFileHash = NULL; -static dlist_head unowned_relns; +static dlist_head unowned_sfiles; /* local function prototypes */ static void smgrshutdown(int code, Datum arg); @@ -142,50 +139,50 @@ smgrshutdown(int code, Datum arg) * * This does not attempt to actually open the underlying file. 
*/ -SMgrRelation -smgropen(RelFileLocator rlocator, BackendId backend) +SMgrFileHandle +smgropen(RelFileLocator rlocator, BackendId backend, ForkNumber forkNum) { - RelFileLocatorBackend brlocator; - SMgrRelation reln; + SMgrFileLocator slocator; + SMgrFileHandle sfile; bool found; - if (SMgrRelationHash == NULL) + if (SMgrFileHash == NULL) { /* First time through: initialize the hash table */ HASHCTL ctl; - ctl.keysize = sizeof(RelFileLocatorBackend); - ctl.entrysize = sizeof(SMgrRelationData); - SMgrRelationHash = hash_create("smgr relation table", 400, + ctl.keysize = sizeof(SMgrFileLocator); + ctl.entrysize = sizeof(SMgrFileData); + SMgrFileHash = hash_create("smgr relation table", 400, &ctl, HASH_ELEM | HASH_BLOBS); - dlist_init(&unowned_relns); + dlist_init(&unowned_sfiles); } /* Look up or create an entry */ - brlocator.locator = rlocator; - brlocator.backend = backend; - reln = (SMgrRelation) hash_search(SMgrRelationHash, - (void *) &brlocator, - HASH_ENTER, &found); + slocator.locator = rlocator; + slocator.backend = backend; + slocator.forknum = forkNum; + sfile = (SMgrFileHandle) hash_search(SMgrFileHash, + (void *) &slocator, + HASH_ENTER, &found); /* Initialize it if not present before */ if (!found) { /* hash_search already filled in the lookup key */ - reln->smgr_owner = NULL; - reln->smgr_targblock = InvalidBlockNumber; - for (int i = 0; i <= MAX_FORKNUM; ++i) - reln->smgr_cached_nblocks[i] = InvalidBlockNumber; - reln->smgr_which = 0; /* we only have md.c at present */ + sfile->smgr_owner = NULL; + sfile->smgr_targblock = InvalidBlockNumber; + sfile->smgr_cached_nblocks = InvalidBlockNumber; + sfile->smgr_which = 0; /* we only have md.c at present */ /* implementation-specific initialization */ - smgrsw[reln->smgr_which].smgr_open(reln); + smgrsw[sfile->smgr_which].smgr_open(sfile); /* it has no owner yet */ - dlist_push_tail(&unowned_relns, &reln->node); + dlist_push_tail(&unowned_sfiles, &sfile->node); } - return reln; + return sfile; } /* @@ 
-195,7 +192,7 @@ smgropen(RelFileLocator rlocator, BackendId backend) * the only such owners exist in the relcache. */ void -smgrsetowner(SMgrRelation *owner, SMgrRelation reln) +smgrsetowner(SMgrFileHandle *owner, SMgrFileHandle sfile) { /* We don't support "disowning" an SMgrRelation here, use smgrclearowner */ Assert(owner != NULL); @@ -206,68 +203,66 @@ smgrsetowner(SMgrRelation *owner, SMgrRelation reln) * depending on the order of processing. It's ok to close the old * relcache entry early in that case.) * - * If there isn't an old owner, then the reln should be in the unowned + * If there isn't an old owner, then the sfile should be in the unowned * list, and we need to remove it. */ - if (reln->smgr_owner) - *(reln->smgr_owner) = NULL; + if (sfile->smgr_owner) + *(sfile->smgr_owner) = NULL; else - dlist_delete(&reln->node); + dlist_delete(&sfile->node); /* Now establish the ownership relationship. */ - reln->smgr_owner = owner; - *owner = reln; + sfile->smgr_owner = owner; + *owner = sfile; } /* - * smgrclearowner() -- Remove long-lived reference to an SMgrRelation object + * smgrclearowner() -- Remove long-lived reference to an SMgrFileHandle object * if one exists */ void -smgrclearowner(SMgrRelation *owner, SMgrRelation reln) +smgrclearowner(SMgrFileHandle *owner, SMgrFileHandle sfile) { /* Do nothing if the SMgrRelation object is not owned by the owner */ - if (reln->smgr_owner != owner) + if (sfile->smgr_owner != owner) return; /* unset the owner's reference */ *owner = NULL; /* unset our reference to the owner */ - reln->smgr_owner = NULL; + sfile->smgr_owner = NULL; /* add to list of unowned relations */ - dlist_push_tail(&unowned_relns, &reln->node); + dlist_push_tail(&unowned_sfiles, &sfile->node); } /* - * smgrexists() -- Does the underlying file for a fork exist? + * smgrexists() -- Does the underlying file exist? 
*/ bool -smgrexists(SMgrRelation reln, ForkNumber forknum) +smgrexists(SMgrFileHandle sfile) { - return smgrsw[reln->smgr_which].smgr_exists(reln, forknum); + return smgrsw[sfile->smgr_which].smgr_exists(sfile); } /* - * smgrclose() -- Close and delete an SMgrRelation object. + * smgrclose() -- Close and delete an SMgrFile object. */ void -smgrclose(SMgrRelation reln) +smgrclose(SMgrFileHandle sfile) { - SMgrRelation *owner; - ForkNumber forknum; + SMgrFileHandle *owner; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[reln->smgr_which].smgr_close(reln, forknum); + smgrsw[sfile->smgr_which].smgr_close(sfile); - owner = reln->smgr_owner; + owner = sfile->smgr_owner; if (!owner) - dlist_delete(&reln->node); + dlist_delete(&sfile->node); - if (hash_search(SMgrRelationHash, - (void *) &(reln->smgr_rlocator), + if (hash_search(SMgrFileHash, + (void *) &(sfile->smgr_locator), HASH_REMOVE, NULL) == NULL) elog(ERROR, "SMgrRelation hashtable corrupted"); @@ -284,14 +279,11 @@ smgrclose(SMgrRelation reln) * * The object remains valid. 
*/ -void -smgrrelease(SMgrRelation reln) +static void +smgrrelease(SMgrFileHandle sfile) { - for (ForkNumber forknum = 0; forknum <= MAX_FORKNUM; forknum++) - { - smgrsw[reln->smgr_which].smgr_close(reln, forknum); - reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; - } + smgrsw[sfile->smgr_which].smgr_close(sfile); + sfile->smgr_cached_nblocks = InvalidBlockNumber; } /* @@ -303,16 +295,16 @@ void smgrreleaseall(void) { HASH_SEQ_STATUS status; - SMgrRelation reln; + SMgrFileHandle sfile; /* Nothing to do if hashtable not set up */ - if (SMgrRelationHash == NULL) + if (SMgrFileHash == NULL) return; - hash_seq_init(&status, SMgrRelationHash); + hash_seq_init(&status, SMgrFileHash); - while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL) - smgrrelease(reln); + while ((sfile = (SMgrFileHandle) hash_seq_search(&status)) != NULL) + smgrrelease(sfile); } /* @@ -322,16 +314,16 @@ void smgrcloseall(void) { HASH_SEQ_STATUS status; - SMgrRelation reln; + SMgrFileHandle sfile; /* Nothing to do if hashtable not set up */ - if (SMgrRelationHash == NULL) + if (SMgrFileHash == NULL) return; - hash_seq_init(&status, SMgrRelationHash); + hash_seq_init(&status, SMgrFileHash); - while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL) - smgrclose(reln); + while ((sfile = (SMgrFileHandle) hash_seq_search(&status)) != NULL) + smgrclose(sfile); } /* @@ -343,111 +335,112 @@ smgrcloseall(void) * such entry exists already. 
*/ void -smgrcloserellocator(RelFileLocatorBackend rlocator) +smgrcloserellocator(RelFileLocator rlocator, BackendId backend) { - SMgrRelation reln; + SMgrFileHandle sfile; /* Nothing to do if hashtable not set up */ - if (SMgrRelationHash == NULL) + if (SMgrFileHash == NULL) return; - reln = (SMgrRelation) hash_search(SMgrRelationHash, - (void *) &rlocator, - HASH_FIND, NULL); - if (reln != NULL) - smgrclose(reln); + for (int i = 0; i <= MAX_FORKNUM; i++) + { + SMgrFileLocator slocator = { rlocator, backend, i }; + + sfile = (SMgrFileHandle) hash_search(SMgrFileHash, + (void *) &slocator, + HASH_FIND, NULL); + if (sfile != NULL) + smgrclose(sfile); + } } /* - * smgrcreate() -- Create a new relation. + * smgrcreate() -- Create a new file. * - * Given an already-created (but presumably unused) SMgrRelation, + * Given an already-created (but presumably unused) SMgrFileHandle, * cause the underlying disk file or other storage for the fork * to be created. */ void -smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) +smgrcreate(SMgrFileHandle sfile, bool isRedo) { - smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo); + smgrsw[sfile->smgr_which].smgr_create(sfile, isRedo); } /* - * smgrdosyncall() -- Immediately sync all forks of all given relations + * smgrunlink_multi() -- Immediately unlink given forks of given relation * - * All forks of all given relations are synced out to the store. + * The given forks of the relation are removed from the store. This + * should not be used during transactional operations, since it can't be + * undone. + * + * This handles multiple forks in one call, because the cache invalidation + * happens at relation granularity. If we had an smgrunlink() function + * to unlink just a single smgr file, and the caller wanted to delete + * multiple forks of a single relation, each call would send a new + * cache invalidation event, which would be wasteful. 
* - * This is equivalent to FlushRelationBuffers() for each smgr relation, - * then calling smgrimmedsync() for all forks of each relation, but it's - * significantly quicker so should be preferred when possible. + * If isRedo is true, it is okay for the underlying file(s) to be gone + * already. */ void -smgrdosyncall(SMgrRelation *rels, int nrels) +smgrunlink_multi(RelFileLocator rlocator, BackendId backend, ForkNumber *forks, int nforks, bool isRedo) { + int which; int i = 0; - ForkNumber forknum; - if (nrels == 0) - return; + which = 0; /* we only have md.c at present */ - FlushRelationsAllBuffers(rels, nrels); + /* Close the forks at smgr level */ + smgrcloserellocator(rlocator, backend); /* - * Sync the physical file(s). + * Send a shared-inval message to force other backends to close any + * dangling smgr references they may have for these rels. We should do + * this before starting the actual unlinking, in case we fail partway + * through that step. Note that the sinval messages will eventually come + * back to this backend, too, and thereby provide a backstop that we + * closed our own smgr rel. */ - for (i = 0; i < nrels; i++) - { - int which = rels[i]->smgr_which; + CacheInvalidateSmgr(rlocator, backend); - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - { - if (smgrsw[which].smgr_exists(rels[i], forknum)) - smgrsw[which].smgr_immedsync(rels[i], forknum); - } + /* + * Delete the physical file(s). + * + * Note: smgr_unlink must treat deletion failure as a WARNING, not an + * ERROR, because we've already decided to commit or abort the current + * xact. + */ + for (i = 0; i < nforks; i++) + { + SMgrFileLocator slocator = { rlocator, backend, forks[i] }; + smgrsw[which].smgr_unlink(slocator, isRedo); } } /* - * smgrdounlinkall() -- Immediately unlink all forks of all given relations - * - * All forks of all given relations are removed from the store. This - * should not be used during transactional operations, since it can't be - * undone. 
+ * smgrdounlink() -- Immediately unlink a file * * If isRedo is true, it is okay for the underlying file(s) to be gone * already. + * + * To remove a relation transactionally, see RelationDropStorage() instead. + * This will cause cache invalidation of all forks of the relation, not just + * this one. */ void -smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) +smgrunlink(SMgrFileHandle sfile, bool isRedo) { - int i = 0; - RelFileLocatorBackend *rlocators; - ForkNumber forknum; - - if (nrels == 0) - return; - - /* - * Get rid of any remaining buffers for the relations. bufmgr will just - * drop them without bothering to write the contents. - */ - DropRelationsAllBuffers(rels, nrels); - - /* - * create an array which contains all relations to be dropped, and close - * each relation's forks at the smgr level while at it - */ - rlocators = palloc(sizeof(RelFileLocatorBackend) * nrels); - for (i = 0; i < nrels; i++) - { - RelFileLocatorBackend rlocator = rels[i]->smgr_rlocator; - int which = rels[i]->smgr_which; + SMgrFileLocator locator; + int which; - rlocators[i] = rlocator; + /* remember before closing */ + which = sfile->smgr_which; + locator = sfile->smgr_locator; - /* Close the forks at smgr level */ - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_close(rels[i], forknum); - } + /* Close the file at smgr level */ + smgrclose(sfile); /* * Send a shared-inval message to force other backends to close any @@ -457,8 +450,7 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) * back to this backend, too, and thereby provide a backstop that we * closed our own smgr rel. */ - for (i = 0; i < nrels; i++) - CacheInvalidateSmgr(rlocators[i]); + CacheInvalidateSmgr(locator.locator, locator.backend); /* * Delete the physical file(s). @@ -467,16 +459,7 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) * ERROR, because we've already decided to commit or abort the current * xact. 
*/ - - for (i = 0; i < nrels; i++) - { - int which = rels[i]->smgr_which; - - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_unlink(rlocators[i], forknum, isRedo); - } - - pfree(rlocators); + smgrsw[which].smgr_unlink(locator, isRedo); } @@ -490,21 +473,21 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) * causes intervening file space to become filled with zeroes. */ void -smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, +smgrextend(SMgrFileHandle sfile, BlockNumber blocknum, char *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum, - buffer, skipFsync); + smgrsw[sfile->smgr_which].smgr_extend(sfile, blocknum, + buffer, skipFsync); /* * Normally we expect this to increase nblocks by one, but if the cached * value isn't as expected, just invalidate it so the next call asks the * kernel. */ - if (reln->smgr_cached_nblocks[forknum] == blocknum) - reln->smgr_cached_nblocks[forknum] = blocknum + 1; + if (sfile->smgr_cached_nblocks == blocknum) + sfile->smgr_cached_nblocks = blocknum + 1; else - reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; + sfile->smgr_cached_nblocks = InvalidBlockNumber; } /* @@ -515,13 +498,13 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * record). */ bool -smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +smgrprefetch(SMgrFileHandle sfile, BlockNumber blocknum) { - return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum); + return smgrsw[sfile->smgr_which].smgr_prefetch(sfile, blocknum); } /* - * smgrread() -- read a particular block from a relation into the supplied + * smgrread() -- read a particular block from a file into the supplied * buffer. * * This routine is called from the buffer manager in order to @@ -529,10 +512,9 @@ smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) * return pages in the format that POSTGRES expects. 
*/ void -smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - char *buffer) +smgrread(SMgrFileHandle sfile, BlockNumber blocknum, char *buffer) { - smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer); + smgrsw[sfile->smgr_which].smgr_read(sfile, blocknum, buffer); } /* @@ -551,11 +533,11 @@ smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * do not require fsync. */ void -smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, +smgrwrite(SMgrFileHandle sfile, BlockNumber blocknum, char *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, - buffer, skipFsync); + smgrsw[sfile->smgr_which].smgr_write(sfile, blocknum, + buffer, skipFsync); } @@ -564,11 +546,11 @@ smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * blocks. */ void -smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, +smgrwriteback(SMgrFileHandle sfile, BlockNumber blocknum, BlockNumber nblocks) { - smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum, - nblocks); + smgrsw[sfile->smgr_which].smgr_writeback(sfile, blocknum, + nblocks); } /* @@ -576,18 +558,18 @@ smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * supplied relation. */ BlockNumber -smgrnblocks(SMgrRelation reln, ForkNumber forknum) +smgrnblocks(SMgrFileHandle sfile) { BlockNumber result; /* Check and return if we get the cached value for the number of blocks. */ - result = smgrnblocks_cached(reln, forknum); + result = smgrnblocks_cached(sfile); if (result != InvalidBlockNumber) return result; - result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum); + result = smgrsw[sfile->smgr_which].smgr_nblocks(sfile); - reln->smgr_cached_nblocks[forknum] = result; + sfile->smgr_cached_nblocks = result; return result; } @@ -600,38 +582,41 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum) * fork size is not cached. 
*/ BlockNumber -smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum) +smgrnblocks_cached(SMgrFileHandle sfile) { /* * For now, we only use cached values in recovery due to lack of a shared * invalidation mechanism for changes in file size. */ - if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber) - return reln->smgr_cached_nblocks[forknum]; + if (InRecovery && sfile->smgr_cached_nblocks != InvalidBlockNumber) + return sfile->smgr_cached_nblocks; return InvalidBlockNumber; } /* - * smgrtruncate() -- Truncate the given forks of supplied relation to - * each specified numbers of blocks + * smgrtruncate_multi() -- Truncate the given forks of supplied relation to + * each specified numbers of blocks * * The truncation is done immediately, so this can't be rolled back. * * The caller must hold AccessExclusiveLock on the relation, to ensure that * other backends receive the smgr invalidation event that this function sends * before they access any forks of the relation again. + * + * Like smgrunlink_multi(), this handles multiple forks in one call because + * the cache invalidation happens at relation granularity. + * + * NB: The caller is responsible for dropping buffers! Before v16, this + * function did it. */ void -smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks) +smgrtruncate_multi(RelFileLocator rlocator, BackendId backend, ForkNumber *forks, + int nforks, BlockNumber *nblocks) { int i; - /* - * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will - * just drop them without bothering to write the contents. - */ - DropRelationBuffers(reln, forknum, nforks, nblocks); + Assert(nforks < MAX_FORKNUM + 1); /* * Send a shared-inval message to force other backends to close any smgr @@ -643,15 +628,19 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb * is a performance-critical path.) 
As in the unlink code, we want to be * sure the message is sent before we start changing things on-disk. */ - CacheInvalidateSmgr(reln->smgr_rlocator); + CacheInvalidateSmgr(rlocator, backend); - /* Do the truncation */ + /* Do the truncations */ for (i = 0; i < nforks; i++) { + SMgrFileHandle sfile; + + sfile = smgropen(rlocator, backend, forks[i]); + /* Make the cached size is invalid if we encounter an error. */ - reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber; + sfile->smgr_cached_nblocks = InvalidBlockNumber; - smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]); + smgrsw[sfile->smgr_which].smgr_truncate(sfile, nblocks[i]); /* * We might as well update the local smgr_cached_nblocks values. The @@ -660,7 +649,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb * smgr_vm_nblocks, and these ones too at the next command boundary. * But these ensure they aren't outright wrong until then. */ - reln->smgr_cached_nblocks[forknum[i]] = nblocks[i]; + sfile->smgr_cached_nblocks = nblocks[i]; } } @@ -688,9 +677,9 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb * otherwise the sync is not very meaningful. */ void -smgrimmedsync(SMgrRelation reln, ForkNumber forknum) +smgrimmedsync(SMgrFileHandle sfile) { - smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum); + smgrsw[sfile->smgr_which].smgr_immedsync(sfile); } /* @@ -714,14 +703,14 @@ AtEOXact_SMgr(void) * Zap all unowned SMgrRelations. We rely on smgrclose() to remove each * one from the list. 
*/ - dlist_foreach_modify(iter, &unowned_relns) + dlist_foreach_modify(iter, &unowned_sfiles) { - SMgrRelation rel = dlist_container(SMgrRelationData, node, - iter.cur); + SMgrFileHandle sfile = dlist_container(SMgrFileData, node, + iter.cur); - Assert(rel->smgr_owner == NULL); + Assert(sfile->smgr_owner == NULL); - smgrclose(rel); + smgrclose(sfile); } } diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c index 9d6a9e91090..3572078ba79 100644 --- a/src/backend/storage/sync/sync.c +++ b/src/backend/storage/sync/sync.c @@ -18,9 +18,7 @@ #include #include -#include "access/commit_ts.h" -#include "access/clog.h" -#include "access/multixact.h" +#include "access/slru.h" #include "access/xlog.h" #include "access/xlogutils.h" #include "commands/tablespace.h" @@ -106,22 +104,6 @@ static const SyncOps syncsw[] = { .sync_unlinkfiletag = mdunlinkfiletag, .sync_filetagmatches = mdfiletagmatches }, - /* pg_xact */ - [SYNC_HANDLER_CLOG] = { - .sync_syncfiletag = clogsyncfiletag - }, - /* pg_commit_ts */ - [SYNC_HANDLER_COMMIT_TS] = { - .sync_syncfiletag = committssyncfiletag - }, - /* pg_multixact/offsets */ - [SYNC_HANDLER_MULTIXACT_OFFSET] = { - .sync_syncfiletag = multixactoffsetssyncfiletag - }, - /* pg_multixact/members */ - [SYNC_HANDLER_MULTIXACT_MEMBER] = { - .sync_syncfiletag = multixactmemberssyncfiletag - } }; /* diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c index eb5782f82a4..bd435215dca 100644 --- a/src/backend/utils/cache/inval.c +++ b/src/backend/utils/cache/inval.c @@ -661,11 +661,12 @@ LocalExecuteInvalidationMessage(SharedInvalidationMessage *msg) * We could have smgr entries for relations of other databases, so no * short-circuit test is possible here. 
*/ - RelFileLocatorBackend rlocator; + RelFileLocator rlocator; + BackendId backend; - rlocator.locator = msg->sm.rlocator; - rlocator.backend = (msg->sm.backend_hi << 16) | (int) msg->sm.backend_lo; - smgrcloserellocator(rlocator); + rlocator = msg->sm.rlocator; + backend = (msg->sm.backend_hi << 16) | (int) msg->sm.backend_lo; + smgrcloserellocator(rlocator, backend); } else if (msg->id == SHAREDINVALRELMAP_ID) { @@ -1459,14 +1460,14 @@ CacheInvalidateRelcacheByRelid(Oid relid) * Thus, the maximum possible backend ID is 2^23-1. */ void -CacheInvalidateSmgr(RelFileLocatorBackend rlocator) +CacheInvalidateSmgr(RelFileLocator rlocator, BackendId backend) { SharedInvalidationMessage msg; msg.sm.id = SHAREDINVALSMGR_ID; - msg.sm.backend_hi = rlocator.backend >> 16; - msg.sm.backend_lo = rlocator.backend & 0xffff; - msg.sm.rlocator = rlocator.locator; + msg.sm.backend_hi = backend >> 16; + msg.sm.backend_lo = backend & 0xffff; + msg.sm.rlocator = rlocator; /* check AddCatcacheInvalidationMessage() for an explanation */ VALGRIND_MAKE_MEM_DEFINED(&msg, sizeof(msg)); diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 450e5124a5a..9e5cd7922ce 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -419,7 +419,7 @@ AllocateRelationDesc(Form_pg_class relp) relation = (Relation) palloc0(sizeof(RelationData)); /* make sure relation is marked as having no open file yet */ - relation->rd_smgr = NULL; + MemSet(relation->rd_smgr, 0, sizeof(relation->rd_smgr)); /* * Copy the relation tuple form @@ -1248,7 +1248,7 @@ retry: RelationInitPhysicalAddr(relation); /* make sure relation is marked as having no open file yet */ - relation->rd_smgr = NULL; + MemSet(relation->rd_smgr, 0, sizeof(relation->rd_smgr)); /* * now we can free the memory allocated for pg_class_tuple @@ -1877,7 +1877,7 @@ formrdesc(const char *relationName, Oid relationReltype, relation = (Relation) palloc0(sizeof(RelationData)); /* make sure 
relation is marked as having no open file yet */ - relation->rd_smgr = NULL; + MemSet(relation->rd_smgr, 0, sizeof(relation->rd_smgr)); /* * initialize reference count: 1 because it is nailed in cache @@ -2701,7 +2701,8 @@ RelationClearRelation(Relation relation, bool rebuild) } /* rd_smgr must not be swapped, due to back-links from smgr level */ - SWAPFIELD(SMgrRelation, rd_smgr); + for (int i = 0; i <= MAX_FORKNUM; i++) + SWAPFIELD(SMgrFileHandle, rd_smgr[i]); /* rd_refcnt must be preserved */ SWAPFIELD(int, rd_refcnt); /* isnailed shouldn't change */ @@ -3532,7 +3533,7 @@ RelationBuildLocalRelation(const char *relname, rel = (Relation) palloc0(sizeof(RelationData)); /* make sure relation is marked as having no open file yet */ - rel->rd_smgr = NULL; + MemSet(rel->rd_smgr, 0, sizeof(rel->rd_smgr)); /* mark it nailed if appropriate */ rel->rd_isnailed = nailit; @@ -3764,7 +3765,6 @@ RelationSetNewRelfilenumber(Relation relation, char persistence) */ if (IsBinaryUpgrade) { - SMgrRelation srel; /* * During a binary upgrade, we use this code path to ensure that @@ -3781,9 +3781,14 @@ RelationSetNewRelfilenumber(Relation relation, char persistence) * fails at this stage, the new cluster will need to be recreated * anyway. 
*/ - srel = smgropen(relation->rd_locator, relation->rd_backend); - smgrdounlinkall(&srel, 1, false); - smgrclose(srel); + ForkNumber forks[MAX_FORKNUM + 1]; + for (int i = 0; i <= MAX_FORKNUM; i ++) + { + smgropen(relation->rd_locator, relation->rd_backend, i); + forks[i] = i; + } + + smgrunlink_multi(relation->rd_locator, relation->rd_backend, forks, MAX_FORKNUM + 1, false); } else { @@ -3811,7 +3816,7 @@ RelationSetNewRelfilenumber(Relation relation, char persistence) else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind)) { /* handle these directly, at least for now */ - SMgrRelation srel; + SMgrFileHandle srel; srel = RelationCreateStorage(newrlocator, persistence, true); smgrclose(srel); @@ -6298,7 +6303,7 @@ load_relcache_init_file(bool shared) /* * Reset transient-state fields in the relcache entry */ - rel->rd_smgr = NULL; + MemSet(rel->rd_smgr, 0, sizeof(rel->rd_smgr)); if (rel->rd_isnailed) rel->rd_refcnt = 1; else diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index 57bd6690ca0..f9137e36411 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -151,9 +151,20 @@ static void MemoryContextStatsPrint(MemoryContext context, void *passthru, * You should not do memory allocations within a critical section, because * an out-of-memory error will be escalated to a PANIC. To enforce that * rule, the allocation functions Assert that. + * + * FIXME: bypass this for the critical section in RecordTransactionCommit() + * for now. It does a lot of things that can allocate: + * - calls TransactionIdCommitTree, which pins buffers, which requires + * space in the ResourceOwner for the pin (ResourceOwnerEnlargeBuffers()) + * - same for TransactionTreeSetCommitTsData() call. 
+ * - reading a page can require flushing other pages, which in turn
+ *   can call CompactCheckpointerRequestQueue(), which allocates
+ * - reading a page calls smgropen(), which allocates the SMgrFile entry
+ *   if it's not open already
  */
 #define AssertNotInCriticalSection(context) \
-	Assert(CritSectionCount == 0 || (context)->allowInCritSection)
+	Assert(CritSectionCount == 0 || (context)->allowInCritSection || \
+		   (MyProc != NULL && (MyProc->delayChkptFlags & DELAY_CHKPT_START) != 0))
 
 /*
  * Call the given function in the MemoryContextMethods for the memory context
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index 079fbda8389..6b76a6bf7e5 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -20,6 +20,7 @@
 #endif
 
 #include "access/visibilitymapdefs.h"
+#include "access/slrudefs.h"
 #include "common/file_perm.h"
 #include "pg_upgrade.h"
 #include "storage/bufpage.h"
@@ -139,7 +140,6 @@ copyFile(const char *src, const char *dst,
 #endif							/* WIN32 */
 }
 
-
 /*
  * linkFile()
  *
@@ -316,6 +316,211 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile,
 	close(src_fd);
 }
 
+
+/* Usable payload bytes of a converted SLRU page: the block minus its header. */
+#define MAXBUFSIZE (BLCKSZ - SizeOfPageHeaderData)
+
+/*
+ * init_new_slru_file()
+ *
+ * Extend a freshly created segment file to a full SLRU segment
+ * (SLRU_PAGES_PER_SEGMENT blocks of BLCKSZ bytes) so that pages which are
+ * never written read back as zeroes.  Returns 0 on success, -1 on failure.
+ */
+static int
+init_new_slru_file(FILE *fp)
+{
+	if (fp == NULL)
+		return -1;
+
+	return ftruncate(fileno(fp), (off_t) BLCKSZ * SLRU_PAGES_PER_SEGMENT);
+}
+
+/*
+ * open_new_slru_segment()
+ *
+ * Create segment file number 'segno' (named as four hex digits) in directory
+ * 'new_path' and pre-size it.  Returns the open stream, or NULL on failure.
+ */
+static FILE *
+open_new_slru_segment(const char *new_path, int segno)
+{
+	char		path[MAXPGPATH];
+	FILE	   *fp;
+
+	snprintf(path, sizeof(path), "%s/%04X", new_path, segno);
+
+	fp = fopen(path, "wb");
+	if (fp == NULL)
+		return NULL;
+
+	if (init_new_slru_file(fp) < 0)
+	{
+		fclose(fp);
+		return NULL;
+	}
+
+	return fp;
+}
+
+/*
+ * write_slru_page()
+ *
+ * Fill in the page header of 'page' and write the whole block into page slot
+ * 'pageno' of the segment open on 'fp'.  The checksum is computed last, over
+ * the page's final contents, using 'blkno' as the absolute block number.
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+write_slru_page(FILE *fp, char *page, size_t pageno, BlockNumber blkno)
+{
+	PageHeader	phdr = (PageHeader) page;
+
+	phdr->pd_lower = SizeOfPageHeaderData;
+	phdr->pd_upper = BLCKSZ;
+	phdr->pd_special = BLCKSZ;
+	phdr->pd_checksum = pg_checksum_page(page, blkno);
+
+	if (fseek(fp, (long) (pageno * BLCKSZ), SEEK_SET) != 0)
+		return -1;
+	if (fwrite(page, BLCKSZ, 1, fp) != 1)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * copy_to_new_format()
+ *
+ * Convert the SLRU segment files in the old cluster's old_subdir to the
+ * page-header format in the new cluster's new_subdir.  The old files are read
+ * in segment-number order as one stream of element_size-byte items.  Each new
+ * page holds a page header followed by as many whole items as fit, and each
+ * new segment file holds SLRU_PAGES_PER_SEGMENT pages.
+ *
+ * A page is stamped and written only once it is complete (or the input is
+ * exhausted), so its checksum covers its final contents even when its items
+ * come from two different old files.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int
+copy_to_new_format(const char *old_subdir, const char *new_subdir, int element_size)
+{
+	char		old_path[MAXPGPATH];
+	char		new_path[MAXPGPATH];
+	PGAlignedBlock pg_buf;
+	struct dirent **all_dirents;
+	int			total_read_files;
+	FILE	   *old_fp = NULL;
+	FILE	   *new_fp = NULL;
+	size_t		item_size;
+	size_t		n_items;		/* items per new page */
+	size_t		page_items = 0; /* items buffered in pg_buf so far */
+	size_t		seg_pages = 0;	/* pages written to the current segment */
+	int			write_file_segno = 0;
+	BlockNumber blkno = 0;		/* absolute block number, for checksums */
+	int			result = -1;
+
+	/* at least one item must fit on a page */
+	if (element_size <= 0 || (size_t) element_size > MAXBUFSIZE)
+		return -1;
+
+	item_size = (size_t) element_size;
+	n_items = MAXBUFSIZE / item_size;
+
+	snprintf(old_path, sizeof(old_path), "%s/%s", old_cluster.pgdata, old_subdir);
+	snprintf(new_path, sizeof(new_path), "%s/%s", new_cluster.pgdata, new_subdir);
+
+	all_dirents = get_sorted_hex_files(old_path, &total_read_files);
+	if (all_dirents == NULL)
+		return -1;
+
+	new_fp = open_new_slru_segment(new_path, write_file_segno);
+	if (new_fp == NULL)
+		goto done;
+
+	memset(pg_buf.data, 0, BLCKSZ);
+
+	for (int i = 0; i < total_read_files; i++)
+	{
+		const char *fname = all_dirents[i]->d_name;
+		char		read_file[MAXPGPATH];
+
+		if (strcmp(fname, ".") == 0 || strcmp(fname, "..") == 0)
+			continue;
+
+		snprintf(read_file, sizeof(read_file), "%s/%s", old_path, fname);
+
+		old_fp = fopen(read_file, "rb");
+		if (old_fp == NULL)
+			goto done;
+
+		for (;;)
+		{
+			char	   *dst = pg_buf.data + SizeOfPageHeaderData +
+				page_items * item_size;
+
+			page_items += fread(dst, item_size, n_items - page_items, old_fp);
+
+			if (page_items < n_items)
+			{
+				/* short read: I/O error, or end of this old file */
+				if (ferror(old_fp))
+					goto done;
+				break;
+			}
+
+			/* page is full: write it out and start a fresh one */
+			if (write_slru_page(new_fp, pg_buf.data, seg_pages, blkno) < 0)
+				goto done;
+
+			blkno++;
+			seg_pages++;
+			page_items = 0;
+			memset(pg_buf.data, 0, BLCKSZ);
+
+			if (seg_pages == SLRU_PAGES_PER_SEGMENT)
+			{
+				/* segment is full: continue in the next one */
+				FILE	   *full_fp = new_fp;
+
+				new_fp = NULL;
+				if (fclose(full_fp) != 0)
+					goto done;
+
+				write_file_segno++;
+				seg_pages = 0;
+
+				new_fp = open_new_slru_segment(new_path, write_file_segno);
+				if (new_fp == NULL)
+					goto done;
+			}
+		}
+
+		fclose(old_fp);
+		old_fp = NULL;
+	}
+
+	/* write the final, partially filled page (zero padded) */
+	if (page_items > 0 &&
+		write_slru_page(new_fp, pg_buf.data, seg_pages, blkno) < 0)
+		goto done;
+
+	result = 0;
+
+done:
+	if (old_fp != NULL)
+		fclose(old_fp);
+	if (new_fp != NULL && fclose(new_fp) != 0)
+		result = -1;
+
+	/* free memory malloc'd by scandir while sorting */
+	cleanup_dirents(all_dirents, total_read_files);
+
+	return result;
+}
+
 void
 check_file_clone(void)
 {
diff --git a/src/bin/pg_upgrade/function.c b/src/bin/pg_upgrade/function.c
index 93d975864ba..9141c30b165 100644
--- a/src/bin/pg_upgrade/function.c
+++ b/src/bin/pg_upgrade/function.c
@@ -7,6 +7,7 @@
  *	src/bin/pg_upgrade/function.c
  */
 
+
 #include "postgres_fe.h"
 
 #include "access/transam.h"
@@ -42,6 +43,34 @@ library_name_compare(const void *p1, const void *p2)
 		((const LibraryInfo *) p2)->dbnum;
 }
 
+/*
+ * Map a directory entry name to its sort key: the value of the name read
+ * as a hexadecimal segment number, or -1 for "." and ".." so that they sort
+ * ahead of every segment file.
+ */
+static int
+hex_file_number(const char *fname)
+{
+	if (strcmp(fname, ".") == 0 || strcmp(fname, "..") == 0)
+		return -1;
+
+	return (int) strtol(fname, NULL, 16);
+}
+
+/*
+ * scandir() comparator ordering directory entries by hex segment number.
+ *
+ * Returns a negative, zero or positive value like any qsort-style comparator;
+ * equal keys compare as 0, which the sort needs for a consistent ordering.
+ */
+static int
+file_name_compare(const struct dirent **de_1, const struct dirent **de_2)
+{
+	int			n1 = hex_file_number((*de_1)->d_name);
+	int			n2 = hex_file_number((*de_2)->d_name);
+
+	return (n1 > n2) - (n1 < n2);
+}
 
 /*
  * get_loadable_libraries()
@@ -109,6 +138,44 @@ get_loadable_libraries(void)
 }
 
 
+/*
+ * get_sorted_hex_files()
+ *
+ * Given the path of a directory, return a scandir()-allocated array of its
+ * entries ordered by hexadecimal file name (e.g. '000A'), and store the
+ * number of entries in *size.  No entries are filtered out, so "." and ".."
+ * are included (sorted first).  On failure returns NULL with *size < 0.
+ * Release the result with cleanup_dirents().
+ */
+struct dirent **
+get_sorted_hex_files(char *dr, int *size)
+{
+	struct dirent **entry_list = NULL;
+
+	*size = scandir(dr, &entry_list, NULL, file_name_compare);
+	if (*size < 0)
+		return NULL;
+
+	return entry_list;
+}
+
+/*
+ * cleanup_dirents()
+ *
+ * Free an entry array returned by get_sorted_hex_files(), along with each
+ * of its total_read_files entries.  A NULL array is a no-op.
+ */
+void
+cleanup_dirents(struct dirent **all_dirents, int total_read_files)
+{
+	if (all_dirents == NULL)
+		return;
+
+	for (int i = 0; i < total_read_files; i++)
+		free(all_dirents[i]);
+	free(all_dirents);
+}
+
 /*
  * check_loadable_libraries()
  *
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 115faa222e3..bf01b1df5e1 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -38,27 +38,35 @@
 
 #include "postgres_fe.h"
 
+#include
 #include
+#include
 
 #ifdef HAVE_LANGINFO_H
 #include
 #endif
 
+#include "access/slrudefs.h"
 #include "catalog/pg_class_d.h"
 #include "common/file_perm.h"
 #include "common/logging.h"
 #include "common/restricted_token.h"
 #include "fe_utils/string_utils.h"
 #include "pg_upgrade.h"
+#include "storage/bufpage.h"
 
 static void prepare_new_cluster(void);
 static void prepare_new_globals(void);
 static void create_new_objects(void);
 static void copy_xact_xlog_xid(void);
 static void set_frozenxids(bool minmxid_only);
 static void make_outputdirs(char *pgdata);
 static void setup(char *argv0, bool *live_check);
+
+#define MAXBUFSIZE (BLCKSZ - SizeOfPageHeaderData)
+#define SLRU_PAGES_PER_SEGMENT 32
+
ClusterInfo old_cluster, new_cluster; OSInfo os_info; @@ -573,11 +581,36 @@ copy_xact_xlog_xid(void) * Copy old commit logs to new data dir. pg_clog has been renamed to * pg_xact in post-10 clusters. */ - copy_subdir_files(GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ? - "pg_clog" : "pg_xact", - GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ? - "pg_clog" : "pg_xact"); + /* A catalog version equal to CLOG_FORMATCHANGE_CAT_VER counts as the new format, so exactly one of the three branches below runs. */ + if (old_cluster.controldata.cat_ver < CLOG_FORMATCHANGE_CAT_VER && + new_cluster.controldata.cat_ver >= CLOG_FORMATCHANGE_CAT_VER) + { + int ret; + ret = copy_to_new_format(GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ? + "pg_clog" : "pg_xact", + "pg_xact", 1); + + if (ret < 0) + { + pg_fatal("could not reformat clog files"); + } + } + if (old_cluster.controldata.cat_ver >= CLOG_FORMATCHANGE_CAT_VER && + new_cluster.controldata.cat_ver >= CLOG_FORMATCHANGE_CAT_VER) + { + copy_subdir_files("pg_xact", "pg_xact"); + } + + if (old_cluster.controldata.cat_ver < CLOG_FORMATCHANGE_CAT_VER && + new_cluster.controldata.cat_ver < CLOG_FORMATCHANGE_CAT_VER) + { + copy_subdir_files(GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ? + "pg_clog" : "pg_xact", + GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ? + "pg_clog" : "pg_xact"); + } + prep_status("Setting oldest XID for new cluster"); exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/pg_resetwal\" -f -u %u \"%s\"", @@ -633,23 +666,44 @@ copy_xact_xlog_xid(void) } else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) { + if (copy_to_new_format("pg_multixact/offsets", "pg_multixact/offsets", MULTIXACT_OFFSET_ENTRY_SIZE) < 0 || + copy_to_new_format("pg_multixact/members", "pg_multixact/members", MULTIXACT_MEMBER_ENTRY_SIZE) < 0) + pg_fatal("could not reformat multixact files"); + + prep_status("Setting next multixact ID and offset for new cluster"); + /* + * we preserve all files and contents, so we must preserve both "next" + * counters here and the oldest multi present on system. 
+ */ + exec_prog(UTILITY_LOG_FILE, NULL, true, true, + "\"%s/pg_resetwal\" -O %u -m %u,%u \"%s\"", + new_cluster.bindir, + old_cluster.controldata.chkpnt_nxtmxoff, + old_cluster.controldata.chkpnt_nxtmulti, + old_cluster.controldata.chkpnt_oldstMulti, + new_cluster.pgdata); + check_ok(); + + } + /*else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER) + { + * * Remove offsets/0000 file created by initdb that no longer matches * the new multi-xid value. "members" starts at zero so no need to * remove it. - */ + * remove_new_subdir("pg_multixact/offsets", false); prep_status("Setting oldest multixact ID in new cluster"); - - /* + * * We don't preserve files in this case, but it's important that the * oldest multi is set to the latest value used by the old system, so * that multixact.c returns the empty set for multis that might be * present on disk. We set next multi to the value following that; it * might end up wrapped around (i.e. 0) if the old cluster had * next=MaxMultiXactId, but multixact.c can cope with that just fine. - */ + * exec_prog(UTILITY_LOG_FILE, NULL, true, true, "\"%s/pg_resetwal\" -m %u,%u \"%s\"", new_cluster.bindir, @@ -657,7 +711,8 @@ copy_xact_xlog_xid(void) old_cluster.controldata.chkpnt_nxtmulti, new_cluster.pgdata); check_ok(); - } + + }*/ /* now reset the wal archives in the new cluster */ prep_status("Resetting WAL archives"); @@ -669,7 +724,6 @@ copy_xact_xlog_xid(void) check_ok(); } - /* * set_frozenxids() * diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 31589b0fdc4..26bd56cc429 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -113,7 +114,16 @@ extern char *output_files[]; * version to this value. pg_upgrade behavior depends on whether old and new * server versions are both newer than this, or only the new one is. 
*/ -#define MULTIXACT_FORMATCHANGE_CAT_VER 201301231 + +/* + * page header format change + */ +#define MULTIXACT_FORMATCHANGE_CAT_VER 202209141 + +/* + * page header format change + */ +#define CLOG_FORMATCHANGE_CAT_VER 202210141 /* * large object chunk size added to pg_controldata, @@ -122,7 +132,7 @@ extern char *output_files[]; #define LARGE_OBJECT_SIZE_PG_CONTROL_VER 942 /* - * change in JSONB format during 9.4 beta + * addition of page header */ #define JSONB_FORMAT_CHANGE_CAT_VER 201409291 @@ -378,6 +388,9 @@ void rewriteVisibilityMap(const char *fromfile, const char *tofile, void check_file_clone(void); void check_hard_link(void); + +int copy_to_new_format(const char *old_subdir, const char *new_subdir, int element_size); + /* fopen_priv() is no longer different from fopen() */ #define fopen_priv(path, mode) fopen(path, mode) @@ -385,6 +398,8 @@ void check_hard_link(void); void get_loadable_libraries(void); void check_loadable_libraries(void); +void cleanup_dirents(struct dirent ** all_dirents, int total_read_files); +struct dirent** get_sorted_hex_files(char * dr, int * size); /* info.c */ diff --git a/src/common/relpath.c b/src/common/relpath.c index 1b6b620ce83..4ec0e36d556 100644 --- a/src/common/relpath.c +++ b/src/common/relpath.c @@ -22,6 +22,16 @@ #include "common/relpath.h" #include "storage/backendid.h" +/* + * SLRU ID to path mapping + */ +#define PG_SLRU(symname,name,path,synchronize) \ + path, + +static char *slru_dirs[] = +{ +#include "access/slrulist.h" +}; /* * Lookup table of fork name by fork number. @@ -129,7 +139,7 @@ GetDatabasePath(Oid dbOid, Oid spcOid) } /* - * GetRelationPath - construct path to a relation's file + * GetSMgrFilePath - construct path to a relation's file * * Result is a palloc'd string. * @@ -138,12 +148,27 @@ GetDatabasePath(Oid dbOid, Oid spcOid) * the trouble considering BackendId is just int anyway. 
*/ char * -GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber, +GetSMgrFilePath(Oid dbOid, Oid spcOid, RelFileNumber relNumber, int backendId, ForkNumber forkNumber) { char *path; - if (spcOid == GLOBALTABLESPACE_OID) + if (spcOid == SLRU_SPC_OID) + { + if (dbOid >= lengthof(slru_dirs) || forkNumber != 0 || backendId != InvalidBackendId) + { +#ifndef FRONTEND + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid SLRU file locator %u/%u/%u/%u/%u", + spcOid, dbOid, relNumber, backendId, forkNumber))); +#else + return NULL; +#endif + } + path = psprintf("%s/%04X", slru_dirs[dbOid], relNumber); + } + else if (spcOid == GLOBALTABLESPACE_OID) { /* Shared system relations live in {datadir}/global */ Assert(dbOid == 0); diff --git a/src/include/access/clog.h b/src/include/access/clog.h index 543f2e2643a..af16a2afcb9 100644 --- a/src/include/access/clog.h +++ b/src/include/access/clog.h @@ -40,18 +40,12 @@ extern void TransactionIdSetTreeStatus(TransactionId xid, int nsubxids, TransactionId *subxids, XidStatus status, XLogRecPtr lsn); extern XidStatus TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn); -extern Size CLOGShmemBuffers(void); -extern Size CLOGShmemSize(void); -extern void CLOGShmemInit(void); extern void BootStrapCLOG(void); extern void StartupCLOG(void); extern void TrimCLOG(void); -extern void CheckPointCLOG(void); extern void ExtendCLOG(TransactionId newestXact); extern void TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid); -extern int clogsyncfiletag(const FileTag *ftag, char *path); - /* XLOG stuff */ #define CLOG_ZEROPAGE 0x00 #define CLOG_TRUNCATE 0x10 diff --git a/src/include/access/commit_ts.h b/src/include/access/commit_ts.h index 7662f8e1a9c..4e986fc023d 100644 --- a/src/include/access/commit_ts.h +++ b/src/include/access/commit_ts.h @@ -27,7 +27,6 @@ extern bool TransactionIdGetCommitTsData(TransactionId xid, extern TransactionId GetLatestCommitTsData(TimestampTz *ts, RepOriginId *nodeid); 
-extern Size CommitTsShmemBuffers(void); extern Size CommitTsShmemSize(void); extern void CommitTsShmemInit(void); extern void BootStrapCommitTs(void); @@ -41,8 +40,6 @@ extern void SetCommitTsLimit(TransactionId oldestXact, TransactionId newestXact); extern void AdvanceOldestCommitTsXid(TransactionId oldestXact); -extern int committssyncfiletag(const FileTag *ftag, char *path); - /* XLOG stuff */ #define COMMIT_TS_ZEROPAGE 0x00 #define COMMIT_TS_TRUNCATE 0x10 diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 4cbe17de7bd..96f8323f4e5 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -118,9 +118,6 @@ extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2); extern bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2); -extern int multixactoffsetssyncfiletag(const FileTag *ftag, char *path); -extern int multixactmemberssyncfiletag(const FileTag *ftag, char *path); - extern void AtEOXact_MultiXact(void); extern void AtPrepare_MultiXact(void); extern void PostPrepare_MultiXact(TransactionId xid); diff --git a/src/include/access/slru.h b/src/include/access/slru.h index 130c41c8632..04fc078e8c3 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * slru.h - * Simple LRU buffering for transaction status logfiles + * Buffering for transaction status logfiles * * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -14,9 +14,36 @@ #define SLRU_H #include "access/xlogdefs.h" +#include "catalog/pg_tablespace_d.h" +#include "storage/buf.h" +#include "storage/bufmgr.h" #include "storage/lwlock.h" +#include "storage/smgr.h" #include "storage/sync.h" +/* Pseudo tablespace ID used for SLRU data; keep equal to SLRU_SPC_OID. */ +#define SLRU_SPC_ID 9 + +/* Pseudo database IDs used by each cache. 
*/ +#define PG_SLRU(symname,name,path, synchronize) \ + symname, + +typedef enum SlruIds +{ +#include "access/slrulist.h" + SLRU_NEXT_ID +} SlruIds; +#undef PG_SLRU + +typedef bool (*SlruPagePrecedesFunction) (int, int); + +static inline RelFileLocator +SlruRelFileLocator(uint32 slru_db_id, uint32 segment_id) +{ + RelFileLocator rlocator = {SLRU_SPC_ID, slru_db_id, segment_id}; + return rlocator; +} + /* * Define SLRU segment size. A page is the same BLCKSZ as is used everywhere @@ -33,142 +60,40 @@ */ #define SLRU_PAGES_PER_SEGMENT 32 -/* - * Page status codes. Note that these do not include the "dirty" bit. - * page_dirty can be true only in the VALID or WRITE_IN_PROGRESS states; - * in the latter case it implies that the page has been re-dirtied since - * the write started. - */ -typedef enum -{ - SLRU_PAGE_EMPTY, /* buffer is not in use */ - SLRU_PAGE_READ_IN_PROGRESS, /* page is being read in */ - SLRU_PAGE_VALID, /* page is valid and not being written */ - SLRU_PAGE_WRITE_IN_PROGRESS /* page is being written out */ -} SlruPageStatus; - -/* - * Shared-memory state - */ -typedef struct SlruSharedData -{ - LWLock *ControlLock; - - /* Number of buffers managed by this SLRU structure */ - int num_slots; - - /* - * Arrays holding info for each buffer slot. Page number is undefined - * when status is EMPTY, as is page_lru_count. - */ - char **page_buffer; - SlruPageStatus *page_status; - bool *page_dirty; - int *page_number; - int *page_lru_count; - LWLockPadded *buffer_locks; - - /* - * Optional array of WAL flush LSNs associated with entries in the SLRU - * pages. If not zero/NULL, we must flush WAL before writing pages (true - * for pg_xact, false for multixact, pg_subtrans, pg_notify). group_lsn[] - * has lsn_groups_per_page entries per buffer slot, each containing the - * highest LSN known for a contiguous group of SLRU entries on that slot's - * page. 
- */ - XLogRecPtr *group_lsn; - int lsn_groups_per_page; - - /*---------- - * We mark a page "most recently used" by setting - * page_lru_count[slotno] = ++cur_lru_count; - * The oldest page is therefore the one with the highest value of - * cur_lru_count - page_lru_count[slotno] - * The counts will eventually wrap around, but this calculation still - * works as long as no page's age exceeds INT_MAX counts. - *---------- - */ - int cur_lru_count; - - /* - * latest_page_number is the page number of the current end of the log; - * this is not critical data, since we use it only to avoid swapping out - * the latest page. - */ - int latest_page_number; - - /* SLRU's index for statistics purposes (might not be unique) */ - int slru_stats_idx; -} SlruSharedData; - -typedef SlruSharedData *SlruShared; - -/* - * SlruCtlData is an unshared structure that points to the active information - * in shared memory. - */ -typedef struct SlruCtlData -{ - SlruShared shared; - - /* - * Which sync handler function to use when handing sync requests over to - * the checkpointer. SYNC_HANDLER_NONE to disable fsync (eg pg_notify). - */ - SyncRequestHandler sync_handler; - - /* - * Decide whether a page is "older" for truncation and as a hint for - * evicting pages in LRU order. Return true if every entry of the first - * argument is older than every entry of the second argument. Note that - * !PagePrecedes(a,b) && !PagePrecedes(b,a) need not imply a==b; it also - * arises when some entries are older and some are not. For SLRUs using - * SimpleLruTruncate(), this must use modular arithmetic. (For others, - * the behavior of this callback has no functional implications.) Use - * SlruPagePrecedesUnitTests() in SLRUs meeting its criteria. - */ - bool (*PagePrecedes) (int, int); - - /* - * Dir is set during SimpleLruInit and does not change thereafter. Since - * it's always the same, it doesn't need to be in shared memory. 
- */ - char Dir[64]; -} SlruCtlData; - -typedef SlruCtlData *SlruCtl; - - -extern Size SimpleLruShmemSize(int nslots, int nlsns); -extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, - LWLock *ctllock, const char *subdir, int tranche_id, - SyncRequestHandler sync_handler); -extern int SimpleLruZeroPage(SlruCtl ctl, int pageno); -extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, - TransactionId xid); -extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, - TransactionId xid); -extern void SimpleLruWritePage(SlruCtl ctl, int slotno); -extern void SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied); #ifdef USE_ASSERT_CHECKING -extern void SlruPagePrecedesUnitTests(SlruCtl ctl, int per_page); +extern void SlruPagePrecedesUnitTests(SlruPagePrecedesFunction PagePrecedes, + int per_page); #else #define SlruPagePrecedesUnitTests(ctl, per_page) do {} while (0) #endif -extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage); -extern bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno); +extern void SimpleLruTruncate(int slru_id, SlruPagePrecedesFunction PagePrecedes, + int cutoffPage); +extern bool SimpleLruDoesPhysicalPageExist(int slru_id, int pageno); -typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage, +typedef bool (*SlruScanCallback) (int slru_id, + SlruPagePrecedesFunction PagePrecedes, + char *filename, int segpage, void *data); -extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data); -extern void SlruDeleteSegment(SlruCtl ctl, int segno); - -extern int SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path); +extern bool SlruScanDirectory(int slru_id, SlruPagePrecedesFunction PagePrecedes, + SlruScanCallback callback, void *data); +extern void SlruDeleteSegment(int slru_id, int segno); /* SlruScanDirectory public callbacks */ -extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, +extern bool SlruScanDirCbReportPresence(int 
slru_id, + SlruPagePrecedesFunction PagePrecedes, + char *filename, int segpage, void *data); -extern bool SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, +extern bool SlruScanDirCbDeleteAll(int slru_id, SlruPagePrecedesFunction PagePrecedes, + char *filename, int segpage, void *data); +/* Buffer access */ +extern Buffer ReadSlruBuffer(int slru_id, int pageno, ReadBufferMode mode); +extern Buffer ZeroSlruBuffer(int slru_id, int pageno); +extern bool ProbeSlruBuffer(int slru_id, int pageno); + +/* Interfaces used by stats view */ +extern Oid SlruRelIdByName(const char *name); +extern const char *SlruName(int slru_id); + #endif /* SLRU_H */ diff --git a/src/include/access/slrudefs.h b/src/include/access/slrudefs.h new file mode 100644 index 00000000000..49cd78d923d --- /dev/null +++ b/src/include/access/slrudefs.h @@ -0,0 +1,22 @@ +/*------------------------------------------------------------------------- + * + * slrudefs.h + * macros for accessing contents of "slru" pages + * + * + * Copyright (c) 2021-2022, PostgreSQL Global Development Group + * + * src/include/access/slrudefs.h + * + *-------------------------------------------------------------------------- + */ +#ifndef SLRUDEFS_H +#define SLRUDEFS_H + +#define SLRU_PAGES_PER_SEGMENT 32 + +#define MULTIXACT_MEMBER_ENTRY_SIZE 20 + +#define MULTIXACT_OFFSET_ENTRY_SIZE 8 + +#endif /* SLRUDEFS_H */ diff --git a/src/include/access/slrulist.h b/src/include/access/slrulist.h new file mode 100644 index 00000000000..c1289a1326c --- /dev/null +++ b/src/include/access/slrulist.h @@ -0,0 +1,30 @@ +/*--------------------------------------------------------------------------- + * slrulist.h + * + * The SLRU list is kept in its own source file for possible + * use by automatic tools. The exact representation of an SLRU is determined + * by the PG_SLRU macro, which is not defined in this file; it can be + * defined by the caller for special purposes. 
+ * + * Portions Copyright (c) 1996-2022, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/slrulist.h + *--------------------------------------------------------------------------- + */ + +/* there is deliberately not an #ifndef SLRULIST_H here */ + +/* + * List of SLRU entries. Note that the order of entries defines the + * numerical value of each SLRU's ID, which is used in in-memory structures. + */ + +/* symbol name, textual name, path, synchronize */ +PG_SLRU(SLRU_CLOG_ID, "Xact", "pg_xact", true) +PG_SLRU(SLRU_SUBTRANS_ID, "Subtrans", "pg_subtrans", false) +PG_SLRU(SLRU_MULTIXACT_OFFSET_ID, "MultiXactOffset", "pg_multixact/offsets", true) +PG_SLRU(SLRU_MULTIXACT_MEMBER_ID, "MultiXactMember", "pg_multixact/members", true) +PG_SLRU(SLRU_COMMIT_TS_ID, "CommitTs", "pg_commit_ts", true) +PG_SLRU(SLRU_SERIAL_ID, "Serial", "pg_serial", false) +PG_SLRU(SLRU_NOTIFY_ID, "Notify", "pg_notify", false) diff --git a/src/include/access/subtrans.h b/src/include/access/subtrans.h index f94e116640b..4685a05bc92 100644 --- a/src/include/access/subtrans.h +++ b/src/include/access/subtrans.h @@ -18,11 +18,8 @@ extern void SubTransSetParent(TransactionId xid, TransactionId parent); extern TransactionId SubTransGetParent(TransactionId xid); extern TransactionId SubTransGetTopmostTransaction(TransactionId xid); -extern Size SUBTRANSShmemSize(void); -extern void SUBTRANSShmemInit(void); extern void BootStrapSUBTRANS(void); extern void StartupSUBTRANS(TransactionId oldestActiveXID); -extern void CheckPointSUBTRANS(void); extern void ExtendSUBTRANS(TransactionId newestXact); extern void TruncateSUBTRANS(TransactionId oldestXact); diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 501a434b904..5af804509ce 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,8 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202212092 + + 
+#define CATALOG_VERSION_NO 202212021 #endif diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h index 9964c312aa2..04114305547 100644 --- a/src/include/catalog/storage.h +++ b/src/include/catalog/storage.h @@ -22,19 +22,20 @@ /* GUC variables */ extern PGDLLIMPORT int wal_skip_threshold; -extern SMgrRelation RelationCreateStorage(RelFileLocator rlocator, - char relpersistence, - bool register_delete); +extern SMgrFileHandle RelationCreateStorage(RelFileLocator rlocator, + char relpersistence, + bool register_delete); extern void RelationDropStorage(Relation rel); extern void RelationPreserveStorage(RelFileLocator rlocator, bool atCommit); extern void RelationPreTruncate(Relation rel); extern void RelationTruncate(Relation rel, BlockNumber nblocks); -extern void RelationCopyStorage(SMgrRelation src, SMgrRelation dst, - ForkNumber forkNum, char relpersistence); +extern void RelationCopyStorage(SMgrFileHandle src, SMgrFileHandle dst, + char relpersistence); extern bool RelFileLocatorSkippingWAL(RelFileLocator rlocator); extern Size EstimatePendingSyncsSpace(void); extern void SerializePendingSyncs(Size maxSize, char *startAddress); extern void RestorePendingSyncs(char *startAddress); +extern void DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo); /* * These functions used to be in storage/smgr/smgr.c, which explains the diff --git a/src/include/common/relpath.h b/src/include/common/relpath.h index 4bbd94393c8..606d0e8c64f 100644 --- a/src/include/common/relpath.h +++ b/src/include/common/relpath.h @@ -63,6 +63,9 @@ typedef enum ForkNumber #define FORKNAMECHARS 4 /* max chars for a fork name */ +/* Pseudo tablespace ID used for SLRUs. 
*/ +#define SLRU_SPC_OID 9 + extern PGDLLIMPORT const char *const forkNames[]; extern ForkNumber forkname_to_number(const char *forkName); @@ -73,25 +76,25 @@ extern int forkname_chars(const char *str, ForkNumber *fork); */ extern char *GetDatabasePath(Oid dbOid, Oid spcOid); -extern char *GetRelationPath(Oid dbOid, Oid spcOid, RelFileNumber relNumber, +extern char *GetSMgrFilePath(Oid dbOid, Oid spcOid, RelFileNumber relNumber, int backendId, ForkNumber forkNumber); /* * Wrapper macros for GetRelationPath. Beware of multiple - * evaluation of the RelFileLocator or RelFileLocatorBackend argument! + * evaluation of the RelFileLocator or SMgrFileLocator argument! */ /* First argument is a RelFileLocator */ #define relpathbackend(rlocator, backend, forknum) \ - GetRelationPath((rlocator).dbOid, (rlocator).spcOid, (rlocator).relNumber, \ + GetSMgrFilePath((rlocator).dbOid, (rlocator).spcOid, (rlocator).relNumber, \ backend, forknum) /* First argument is a RelFileLocator */ #define relpathperm(rlocator, forknum) \ relpathbackend(rlocator, InvalidBackendId, forknum) -/* First argument is a RelFileLocatorBackend */ -#define relpath(rlocator, forknum) \ - relpathbackend((rlocator).locator, (rlocator).backend, forknum) +/* First argument is a SMgrFileLocator */ +#define smgrfilepath(slocator) \ + GetSMgrFilePath((slocator).locator.dbOid, (slocator).locator.spcOid, (slocator).locator.relNumber, (slocator).backend, (slocator).forknum) #endif /* RELPATH_H */ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 406db6be783..e676692a30a 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -281,6 +281,11 @@ typedef union BufferDescPadded char pad[BUFFERDESC_PAD_TO_SIZE]; } BufferDescPadded; +#define BufferGetExternalLSN(bufHdr) \ + BufferExternalLSNs[(bufHdr)->buf_id] +#define BufferSetExternalLSN(bufHdr, lsn) \ + BufferExternalLSNs[(bufHdr)->buf_id] = (lsn) + /* * The PendingWriteback & 
WritebackContext structure are used to keep * information about pending flush requests to be issued to the OS. @@ -307,6 +312,7 @@ typedef struct WritebackContext /* in buf_init.c */ extern PGDLLIMPORT BufferDescPadded *BufferDescriptors; extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray; +extern PGDLLIMPORT XLogRecPtr *BufferExternalLSNs; extern PGDLLIMPORT WritebackContext BackendWritebackContext; /* in localbuf.c */ @@ -413,10 +419,9 @@ extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id); extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode); /* localbuf.c */ -extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, - ForkNumber forkNum, +extern PrefetchBufferResult PrefetchLocalBuffer(SMgrFileHandle smgr, BlockNumber blockNum); -extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, +extern BufferDesc *LocalBufferAlloc(SMgrFileHandle smgr, BlockNumber blockNum, bool *foundPtr); extern void MarkLocalBufferDirty(Buffer buffer); extern void DropRelationLocalBuffers(RelFileLocator rlocator, diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index e1bd22441b0..4cddb59f500 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -42,8 +42,11 @@ typedef enum RBM_ZERO_AND_CLEANUP_LOCK, /* Like RBM_ZERO_AND_LOCK, but locks the page * in "cleanup" mode */ RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */ - RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL + RBM_NORMAL_NO_LOG, /* Don't log page as invalid during WAL * replay; otherwise same as RBM_NORMAL */ + + RBM_TRIM /*Read for TRIM functions in CLOG / MultiXact. + Don't validate checksum or zero. */ } ReadBufferMode; /* @@ -59,7 +62,7 @@ typedef struct PrefetchBufferResult struct WritebackContext; /* forward declared, to avoid including smgr.h here */ -struct SMgrRelationData; +struct SMgrFileData; /* in globals.c ... 
this duplicates miscadmin.h */ extern PGDLLIMPORT int NBuffers; @@ -110,8 +113,7 @@ extern PGDLLIMPORT int32 *LocalRefCount; /* * prototypes for functions in bufmgr.c */ -extern PrefetchBufferResult PrefetchSharedBuffer(struct SMgrRelationData *smgr_reln, - ForkNumber forkNum, +extern PrefetchBufferResult PrefetchSharedBuffer(struct SMgrFileData *smgr_file, BlockNumber blockNum); extern PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum); @@ -125,13 +127,19 @@ extern Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent); +extern Buffer ReadBufferWithoutRelcacheWithHit(RelFileLocator rlocator, + ForkNumber forkNum, BlockNumber blockNum, + ReadBufferMode mode, BufferAccessStrategy strategy, + bool permanent, bool *hit); extern void ReleaseBuffer(Buffer buffer); extern void UnlockReleaseBuffer(Buffer buffer); extern void MarkBufferDirty(Buffer buffer); extern void IncrBufferRefCount(Buffer buffer); extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum); - +extern bool BufferProbe(RelFileLocator rlocator, ForkNumber forkNum, + BlockNumber blockNum); + extern void InitBufferPoolAccess(void); extern void AtEOXact_Buffers(bool isCommit); extern void PrintBufferLeakWarning(Buffer buffer); @@ -139,18 +147,26 @@ extern void CheckPointBuffers(int flags); extern BlockNumber BufferGetBlockNumber(Buffer buffer); extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum); -extern void FlushOneBuffer(Buffer buffer); -extern void FlushRelationBuffers(Relation rel); -extern void FlushRelationsAllBuffers(struct SMgrRelationData **smgrs, int nrels); extern void CreateAndCopyRelationData(RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent); + +extern void FlushOneBuffer(Buffer buffer); +extern void FlushRelationBuffers(Relation rel); +extern void 
FlushRelationsAllBuffers(RelFileLocator *locators, int nlocators); extern void FlushDatabaseBuffers(Oid dbid); -extern void DropRelationBuffers(struct SMgrRelationData *smgr_reln, + +extern void DropRelationBuffers(RelFileLocator rlocator, BackendId backend, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock); -extern void DropRelationsAllBuffers(struct SMgrRelationData **smgr_reln, - int nlocators); + +typedef struct RelFileLocatorBackend +{ + RelFileLocator locator; + BackendId backend; +} RelFileLocatorBackend; + +extern void DropRelationsAllBuffers(RelFileLocatorBackend *locators, int nlocators); extern void DropDatabaseBuffers(Oid dbid); #define RelationGetNumberOfBlocks(reln) \ diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 2708c4b683d..3871a386d4b 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -201,6 +201,7 @@ typedef PageHeaderData *PageHeader; * handling pages. */ #define PG_PAGE_LAYOUT_VERSION 4 +#define PG_METAPAGE_LAYOUT_VERSION 1 #define PG_DATA_CHECKSUM_VERSION 1 /* ---------------------------------------------------------------- @@ -302,6 +303,20 @@ PageSetPageSizeAndVersion(Page page, Size size, uint8 version) ((PageHeader) page)->pd_pagesize_version = size | version; } +/* + * PageSetHeaderDataMinimal + * Sets the LSN, page size and version, and checksum + */ +#define PageSetHeaderDataNonRel(page, pageno, lsn, size, version) \ +( \ + PageSetLSN(page, lsn), \ + PageSetPageSizeAndVersion(page, size, version), \ + PageClearHasFreeLinePointers(page), \ + PageSetChecksumInplace(page, pageno) \ +) + + + /* ---------------- * page special data functions * ---------------- @@ -486,6 +501,8 @@ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)), "BLCKSZ has to be a multiple of sizeof(size_t)"); extern void PageInit(Page page, Size pageSize, Size specialSize); +extern void PageInitSLRU(Page page, Size pageSize, Size specialSize); + extern bool 
PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags); extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size, OffsetNumber offsetNumber, int flags); diff --git a/src/include/storage/md.h b/src/include/storage/md.h index 10aa1b0109b..bcb87d56295 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -21,28 +21,26 @@ /* md storage manager functionality */ extern void mdinit(void); -extern void mdopen(SMgrRelation reln); -extern void mdclose(SMgrRelation reln, ForkNumber forknum); -extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); -extern bool mdexists(SMgrRelation reln, ForkNumber forknum); -extern void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo); -extern void mdextend(SMgrRelation reln, ForkNumber forknum, +extern void mdopen(SMgrFileHandle sfile); +extern void mdclose(SMgrFileHandle sfile); +extern void mdcreate(SMgrFileHandle sfile, bool isRedo); +extern bool mdexists(SMgrFileHandle sfile); +extern void mdunlink(SMgrFileLocator slocator, bool isRedo); +extern void mdextend(SMgrFileHandle sfile, BlockNumber blocknum, char *buffer, bool skipFsync); -extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum, +extern bool mdprefetch(SMgrFileHandle sfile, BlockNumber blocknum); -extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, +extern void mdread(SMgrFileHandle sfile, BlockNumber blocknum, char *buffer); -extern void mdwrite(SMgrRelation reln, ForkNumber forknum, +extern void mdwrite(SMgrFileHandle sfile, BlockNumber blocknum, char *buffer, bool skipFsync); -extern void mdwriteback(SMgrRelation reln, ForkNumber forknum, +extern void mdwriteback(SMgrFileHandle sfile, BlockNumber blocknum, BlockNumber nblocks); -extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); -extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); -extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); +extern 
BlockNumber mdnblocks(SMgrFileHandle sfile); +extern void mdtruncate(SMgrFileHandle sfile, BlockNumber nblocks); +extern void mdimmedsync(SMgrFileHandle sfile); extern void ForgetDatabaseSyncRequests(Oid dbid); -extern void DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo); /* md sync callbacks */ extern int mdsyncfiletag(const FileTag *ftag, char *path); diff --git a/src/include/storage/relfilelocator.h b/src/include/storage/relfilelocator.h index 10f41f3abb3..296cb1b8e51 100644 --- a/src/include/storage/relfilelocator.h +++ b/src/include/storage/relfilelocator.h @@ -53,6 +53,8 @@ * Note: various places use RelFileLocator in hashtable keys. Therefore, * there *must not* be any unused padding bytes in this struct. That * should be safe as long as all the fields are of type Oid. + * + * See also SMgrFileLocator in smgr.h. */ typedef struct RelFileLocator { @@ -62,38 +64,15 @@ typedef struct RelFileLocator } RelFileLocator; /* - * Augmenting a relfilelocator with the backend ID provides all the information - * we need to locate the physical storage. The backend ID is InvalidBackendId - * for regular relations (those accessible to more than one backend), or the - * owning backend's ID for backend-local relations. Backend-local relations - * are always transient and removed in case of a database crash; they are - * never WAL-logged or fsync'd. - */ -typedef struct RelFileLocatorBackend -{ - RelFileLocator locator; - BackendId backend; -} RelFileLocatorBackend; - -#define RelFileLocatorBackendIsTemp(rlocator) \ - ((rlocator).backend != InvalidBackendId) - -/* - * Note: RelFileLocatorEquals and RelFileLocatorBackendEquals compare relNumber + * Note: RelFileLocatorEquals compares relNumber * first since that is most likely to be different in two unequal * RelFileLocators. It is probably redundant to compare spcOid if the other * fields are found equal, but do it anyway to be sure. Likewise for checking - * the backend ID in RelFileLocatorBackendEquals. 
+ * the backend ID when comparing SMgrFileLocators (see smgr.h). */ #define RelFileLocatorEquals(locator1, locator2) \ ((locator1).relNumber == (locator2).relNumber && \ (locator1).dbOid == (locator2).dbOid && \ (locator1).spcOid == (locator2).spcOid) -#define RelFileLocatorBackendEquals(locator1, locator2) \ - ((locator1).locator.relNumber == (locator2).locator.relNumber && \ - (locator1).locator.dbOid == (locator2).locator.dbOid && \ - (locator1).backend == (locator2).backend && \ - (locator1).locator.spcOid == (locator2).locator.spcOid) - #endif /* RELFILELOCATOR_H */ diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a07715356ba..0ed569b2836 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -18,31 +18,56 @@ #include "storage/block.h" #include "storage/relfilelocator.h" + /* - * smgr.c maintains a table of SMgrRelation objects, which are essentially - * cached file handles. An SMgrRelation is created (if not already present) + * SMgrFileLocator contains all the information needed to locate the physical + * storage of a relation fork, or some other file that is managed by the buffer + * manager. + * + * The backend ID is InvalidBackendId for regular relations (those accessible + * to more than one backend), or the owning backend's ID for backend-local + * relations. Backend-local relations are always transient and removed in + * case of a database crash; they are never WAL-logged or fsync'd. + */ +typedef struct SMgrFileLocator +{ + RelFileLocator locator; + BackendId backend; + ForkNumber forknum; +} SMgrFileLocator; + +#define SMgrFileLocatorIsTemp(slocator) \ + ((slocator).backend != InvalidBackendId) + +/* + * smgr.c maintains a table of SMgrFileData objects, which are essentially + * cached file handles. An SMgrFile is created (if not already present) * by smgropen(), and destroyed by smgrclose(). Note that neither of these * operations imply I/O, they just create or destroy a hashtable entry.
* (But smgrclose() may release associated resources, such as OS-level file * descriptors.) * - * An SMgrRelation may have an "owner", which is just a pointer to it from - * somewhere else; smgr.c will clear this pointer if the SMgrRelation is + * An SMgrFile may have an "owner", which is just a pointer to it from + * somewhere else; smgr.c will clear this pointer if the SMgrFile is * closed. We use this to avoid dangling pointers from relcache to smgr * without having to make the smgr explicitly aware of relcache. There - * can't be more than one "owner" pointer per SMgrRelation, but that's + * can't be more than one "owner" pointer per SMgrFile, but that's * all we need. * - * SMgrRelations that do not have an "owner" are considered to be transient, + * SMgrFiles that do not have an "owner" are considered to be transient, * and are deleted at end of transaction. + * + * A file that is represented by an SMgrFile can be managed by the buffer + * manager. Currently, it's only used for relation files, but could be used + * for SLRUs and other things in the future. */ -typedef struct SMgrRelationData +typedef struct SMgrFileData { - /* rlocator is the hashtable lookup key, so it must be first! */ - RelFileLocatorBackend smgr_rlocator; /* relation physical identifier */ + /* locator is the hashtable lookup key, so must be first! */ + SMgrFileLocator smgr_locator; /* file physical identifier */ /* pointer to owning pointer, or NULL if none */ - struct SMgrRelationData **smgr_owner; + struct SMgrFileData **smgr_owner; /* * The following fields are reset to InvalidBlockNumber upon a cache flush @@ -51,7 +76,7 @@ typedef struct SMgrRelationData * invalidation for fork extension.
*/ BlockNumber smgr_targblock; /* current insertion target block */ - BlockNumber smgr_cached_nblocks[MAX_FORKNUM + 1]; /* last known size */ + BlockNumber smgr_cached_nblocks; /* last known size */ /* additional public fields may someday exist here */ @@ -65,46 +90,46 @@ typedef struct SMgrRelationData - * for md.c; per-fork arrays of the number of open segments + * for md.c; the number of open segments * (md_num_open_segs) and the segments themselves (md_seg_fds). */ - int md_num_open_segs[MAX_FORKNUM + 1]; - struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1]; + int md_num_open_segs; + struct _MdfdVec *md_seg_fds; - /* if unowned, list link in list of all unowned SMgrRelations */ + /* if unowned, list link in list of all unowned SMgrFiles */ dlist_node node; -} SMgrRelationData; +} SMgrFileData; -typedef SMgrRelationData *SMgrRelation; +typedef SMgrFileData *SMgrFileHandle; #define SmgrIsTemp(smgr) \ - RelFileLocatorBackendIsTemp((smgr)->smgr_rlocator) + SMgrFileLocatorIsTemp((smgr)->smgr_locator) extern void smgrinit(void); -extern SMgrRelation smgropen(RelFileLocator rlocator, BackendId backend); -extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); -extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln); -extern void smgrclearowner(SMgrRelation *owner, SMgrRelation reln); -extern void smgrclose(SMgrRelation reln); +extern SMgrFileHandle smgropen(RelFileLocator rlocator, BackendId backend, ForkNumber forkNum); +extern bool smgrexists(SMgrFileHandle sfile); +extern void smgrsetowner(SMgrFileHandle *owner, SMgrFileHandle sfile); +extern void smgrclearowner(SMgrFileHandle *owner, SMgrFileHandle sfile); +extern void smgrclose(SMgrFileHandle sfile); extern void smgrcloseall(void); -extern void smgrcloserellocator(RelFileLocatorBackend rlocator); -extern void smgrrelease(SMgrRelation reln); extern void smgrreleaseall(void); -extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); -extern void smgrdosyncall(SMgrRelation *rels, int nrels); -extern void
smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo); -extern void smgrextend(SMgrRelation reln, ForkNumber forknum, +extern void smgrcreate(SMgrFileHandle sfile, bool isRedo); +extern void smgrextend(SMgrFileHandle sfile, BlockNumber blocknum, char *buffer, bool skipFsync); -extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); -extern void smgrread(SMgrRelation reln, ForkNumber forknum, +extern bool smgrprefetch(SMgrFileHandle sfile, BlockNumber blocknum); +extern void smgrread(SMgrFileHandle sfile, BlockNumber blocknum, char *buffer); -extern void smgrwrite(SMgrRelation reln, ForkNumber forknum, +extern void smgrwrite(SMgrFileHandle sfile, BlockNumber blocknum, char *buffer, bool skipFsync); -extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum, +extern void smgrwriteback(SMgrFileHandle sfile, BlockNumber blocknum, BlockNumber nblocks); -extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); -extern BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum); -extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, - int nforks, BlockNumber *nblocks); -extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); +extern BlockNumber smgrnblocks(SMgrFileHandle sfile); +extern BlockNumber smgrnblocks_cached(SMgrFileHandle sfile); +extern void smgrimmedsync(SMgrFileHandle sfile); +extern void smgrunlink(SMgrFileHandle sfile, bool isRedo); + +extern void smgrtruncate_multi(RelFileLocator rlocator, BackendId backend, ForkNumber *forks, int nforks, BlockNumber *nblocks); +extern void smgrunlink_multi(RelFileLocator rlocator, BackendId backend, ForkNumber *forks, int nforks, bool isRedo); + +extern void smgrcloserellocator(RelFileLocator rlocator, BackendId backend); + extern void AtEOXact_SMgr(void); extern bool ProcessBarrierSmgrRelease(void); diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h index 23748b72caf..c494c01056f 100644 --- 
a/src/include/utils/inval.h +++ b/src/include/utils/inval.h @@ -48,7 +48,7 @@ extern void CacheInvalidateRelcacheByTuple(HeapTuple classTuple); extern void CacheInvalidateRelcacheByRelid(Oid relid); -extern void CacheInvalidateSmgr(RelFileLocatorBackend rlocator); +extern void CacheInvalidateSmgr(RelFileLocator rlocator, BackendId backend); extern void CacheInvalidateRelmap(Oid databaseId); diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index f383a2fca9e..290838ab8c5 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -54,7 +54,7 @@ typedef LockInfoData *LockInfo; typedef struct RelationData { RelFileLocator rd_locator; /* relation physical identifier */ - SMgrRelation rd_smgr; /* cached file handle, or NULL */ + SMgrFileHandle rd_smgr[MAX_FORKNUM + 1]; /* cached file handles, or NULLs */ int rd_refcnt; /* reference count */ BackendId rd_backend; /* owning backend id, if temporary relation */ bool rd_islocaltemp; /* rel is a temp rel of this session */ @@ -562,16 +562,16 @@ typedef struct ViewOptions * Note: since a relcache flush can cause the file handle to be closed again, * it's unwise to hold onto the pointer returned by this function for any * long period. Recommended practice is to just re-execute RelationGetSmgr - * each time you need to access the SMgrRelation. It's quite cheap in + * each time you need to access the SMgrFileHandle. It's quite cheap in * comparison to whatever an smgr function is going to do.
*/ -static inline SMgrRelation -RelationGetSmgr(Relation rel) +static inline SMgrFileHandle +RelationGetSmgr(Relation rel, ForkNumber forkNum) { - if (unlikely(rel->rd_smgr == NULL)) - smgrsetowner(&(rel->rd_smgr), smgropen(rel->rd_locator, rel->rd_backend)); - return rel->rd_smgr; + if (unlikely(rel->rd_smgr[forkNum] == NULL)) + smgrsetowner(&(rel->rd_smgr[forkNum]), smgropen(rel->rd_locator, rel->rd_backend, forkNum)); + return rel->rd_smgr[forkNum]; } /* * RelationCloseSmgr @@ -580,13 +580,17 @@ RelationGetSmgr(Relation rel) static inline void RelationCloseSmgr(Relation relation) { - if (relation->rd_smgr != NULL) - smgrclose(relation->rd_smgr); - - /* smgrclose should unhook from owner pointer */ - Assert(relation->rd_smgr == NULL); + for (int i = 0; i <= MAX_FORKNUM; i++) + { + if (relation->rd_smgr[i] != NULL) + { + smgrclose(relation->rd_smgr[i]); + /* smgrclose should unhook from owner pointer */ + Assert(relation->rd_smgr[i] == NULL); + } + } } #endif /* !FRONTEND */ /* * RelationGetTargetBlock @@ -597,7 +601,7 @@ RelationCloseSmgr(Relation relation) * so there's no need to re-open the smgr handle if it's not currently open. */ #define RelationGetTargetBlock(relation) \ - ( (relation)->rd_smgr != NULL ? (relation)->rd_smgr->smgr_targblock : InvalidBlockNumber ) + ( (relation)->rd_smgr[MAIN_FORKNUM] != NULL ? (relation)->rd_smgr[MAIN_FORKNUM]->smgr_targblock : InvalidBlockNumber ) /* * RelationSetTargetBlock @@ -605,7 +609,7 @@ RelationCloseSmgr(Relation relation) */ #define RelationSetTargetBlock(relation, targblock) \ do { \ - RelationGetSmgr(relation)->smgr_targblock = (targblock); \ + RelationGetSmgr(relation, MAIN_FORKNUM)->smgr_targblock = (targblock); \ } while (0) /* -- 2.38.1