Proposal to add page headers to SLRU pages
Hi all,
PostgreSQL currently maintains several data structures in the SLRU cache. The current SLRU pages do not have any header, so it is impossible to checksum a page and verify its integrity. It is very difficult to debug issues caused by corrupted SLRU pages. Also, without a page header, page LSN is tracked in an ad-hoc fashion using LSN groups, which requires additional data structure in the shared memory. At eBay, we are building on the patch shared by Rishu Bagga in [1]/messages/by-id/EFAAC0BE-27E9-4186-B925-79B7C696D5AC@amazon.com, which adds the standard PageHeaderData to each SLRU page. We believe that adding the standard page header to each SLRU page is the correct approach for the long run. It adds a checksum to each SLRU page, tracks page LSN as if it is a standard page and eases future page enhancements.
The enclosed patch changes the address calculation logic for all 7 SLRUs in the following 6 files:
src/backend/access/transam/clog.c
src/backend/access/transam/commit_ts.c
src/backend/access/transam/multixact.c
src/backend/access/transam/subtrans.c
src/backend/commands/async.c
src/backend/storage/lmgr/predicate.c
The patch enables page checksum with changes to the following 2 files:
src/backend/access/transam/slru.c
src/bin/pg_checksums/pg_checksums.c
The patch removes the group LSNs defined for each SLRU cache. See changes to:
src/include/access/slru.h
The patch adds a few helper macros in the following files:
src/backend/storage/page/bufpage.c
src/include/storage/bufpage.h
The patch updates some test cases:
src/bin/pg_resetwal/t/001_basic.pl
src/test/modules/test_slru/test_slru.c
I am still working on patching the pg_upgrade. Just love to hear your thoughts on the idea and the current patch.
Discussed with: Anton Shyrabokau and Shawn Debnath
[1]: /messages/by-id/EFAAC0BE-27E9-4186-B925-79B7C696D5AC@amazon.com
Regards,
Yong
Attachments:
slru_add_page_header.patchapplication/octet-stream; name=slru_add_page_header.patchDownload
src/backend/access/transam/clog.c | 52 ++++++++-------
src/backend/access/transam/commit_ts.c | 26 +++++---
src/backend/access/transam/multixact.c | 59 ++++++++++-------
src/backend/access/transam/slru.c | 113 +++++++++++----------------------
src/backend/access/transam/subtrans.c | 12 ++--
src/backend/commands/async.c | 27 ++++----
src/backend/storage/lmgr/predicate.c | 16 +++--
src/backend/storage/page/bufpage.c | 25 ++++++++
src/bin/pg_checksums/pg_checksums.c | 9 +++
src/bin/pg_resetwal/t/001_basic.pl | 6 +-
src/include/access/slru.h | 15 +----
src/include/storage/bufpage.h | 7 ++
src/test/modules/test_slru/test_slru.c | 13 ++--
13 files changed, 197 insertions(+), 183 deletions(-)
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index cc60eab1e2..639bb90a1c 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -41,6 +41,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/proc.h"
#include "storage/sync.h"
@@ -59,7 +60,7 @@
/* We need two bits per xact, so four xacts fit in a byte */
#define CLOG_BITS_PER_XACT 2
#define CLOG_XACTS_PER_BYTE 4
-#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+#define CLOG_XACTS_PER_PAGE (CapacityOfPageContents * CLOG_XACTS_PER_BYTE)
#define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1)
@@ -77,13 +78,6 @@ TransactionIdToPage(TransactionId xid)
#define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE)
#define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE)
-/* We store the latest async LSN for each group of transactions */
-#define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */
-#define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)
-
-#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \
- ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
-
/*
* The number of subtransactions below which we consider to apply clog group
* update optimization. Testing reveals that the number higher than this can
@@ -101,7 +95,7 @@ static SlruCtlData XactCtlData;
static int ZeroCLOGPage(int64 pageno, bool writeXlog);
static bool CLOGPagePrecedes(int64 page1, int64 page2);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact,
Oid oldestXactDb);
static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
@@ -583,8 +577,9 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
char *byteptr;
char byteval;
char curval;
+ Page page = XactCtl->shared->page_buffer[slotno];
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ byteptr = PageGetContents(page) + byteno;
curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
/*
@@ -613,7 +608,7 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*byteptr = byteval;
/*
- * Update the group LSN if the transaction completion LSN is higher.
+ * Update the page LSN if the transaction completion LSN is higher.
*
* Note: lsn will be invalid when supplied during InRecovery processing,
* so we don't need to do anything special to avoid LSN updates during
@@ -622,10 +617,8 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*/
if (!XLogRecPtrIsInvalid(lsn))
{
- int lsnindex = GetLSNIndex(slotno, xid);
-
- if (XactCtl->shared->group_lsn[lsnindex] < lsn)
- XactCtl->shared->group_lsn[lsnindex] = lsn;
+ if (PageGetLSN(page) < lsn)
+ PageSetLSN(page, lsn);
}
}
@@ -651,19 +644,19 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
int byteno = TransactionIdToByte(xid);
int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
int slotno;
- int lsnindex;
+ Page page;
char *byteptr;
XidStatus status;
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ page = XactCtl->shared->page_buffer[slotno];
+ byteptr = PageGetContents(page) + byteno;
status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
- lsnindex = GetLSNIndex(slotno, xid);
- *lsn = XactCtl->shared->group_lsn[lsnindex];
+ *lsn = PageGetLSN(page);
LWLockRelease(XactSLRULock);
@@ -698,14 +691,14 @@ CLOGShmemBuffers(void)
Size
CLOGShmemSize(void)
{
- return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE);
+ return SimpleLruShmemSize(CLOGShmemBuffers());
}
void
CLOGShmemInit(void)
{
XactCtl->PagePrecedes = CLOGPagePrecedes;
- SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
+ SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(),
XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER,
SYNC_HANDLER_CLOG, false);
SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE);
@@ -747,11 +740,17 @@ static int
ZeroCLOGPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(XactCtl, pageno);
+ page = XactCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -807,12 +806,12 @@ TrimCLOG(void)
char *byteptr;
slotno = SimpleLruReadPage(XactCtl, pageno, false, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ byteptr = PageGetContents(XactCtl->shared->page_buffer[slotno]) + byteno;
/* Zero so-far-unused positions in the current byte */
*byteptr &= (1 << bshift) - 1;
/* Zero the rest of the page */
- MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
+ MemSet(byteptr + 1, 0, CapacityOfPageContents - byteno - 1);
XactCtl->shared->page_dirty[slotno] = true;
}
@@ -836,7 +835,6 @@ CheckPointCLOG(void)
TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that CLOG has room for a newly-allocated XID.
*
@@ -958,12 +956,12 @@ CLOGPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
+ return XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index 7c642f7b59..3f01f10df5 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -31,6 +31,7 @@
#include "funcapi.h"
#include "miscadmin.h"
#include "pg_trace.h"
+#include "storage/bufpage.h"
#include "storage/shmem.h"
#include "utils/builtins.h"
#include "utils/snapmgr.h"
@@ -63,7 +64,7 @@ typedef struct CommitTimestampEntry
sizeof(RepOriginId))
#define COMMIT_TS_XACTS_PER_PAGE \
- (BLCKSZ / SizeOfCommitTimestampEntry)
+ (CapacityOfPageContents / SizeOfCommitTimestampEntry)
/*
@@ -120,7 +121,7 @@ static int ZeroCommitTsPage(int64 pageno, bool writeXlog);
static bool CommitTsPagePrecedes(int64 page1, int64 page2);
static void ActivateCommitTs(void);
static void DeactivateCommitTs(void);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid);
/*
@@ -254,11 +255,12 @@ TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
CommitTimestampEntry entry;
Assert(TransactionIdIsNormal(xid));
+ Assert(xid == slotno * COMMIT_TS_XACTS_PER_PAGE + entryno);
entry.time = ts;
entry.nodeid = nodeid;
- memcpy(CommitTsCtl->shared->page_buffer[slotno] +
+ memcpy(PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
SizeOfCommitTimestampEntry * entryno,
&entry, SizeOfCommitTimestampEntry);
}
@@ -337,7 +339,7 @@ TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts,
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid);
memcpy(&entry,
- CommitTsCtl->shared->page_buffer[slotno] +
+ PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
SizeOfCommitTimestampEntry * entryno,
SizeOfCommitTimestampEntry);
@@ -515,7 +517,7 @@ CommitTsShmemBuffers(void)
Size
CommitTsShmemSize(void)
{
- return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) +
+ return SimpleLruShmemSize(CommitTsShmemBuffers()) +
sizeof(CommitTimestampShared);
}
@@ -529,7 +531,7 @@ CommitTsShmemInit(void)
bool found;
CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
- SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0,
+ SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(),
CommitTsSLRULock, "pg_commit_ts",
LWTRANCHE_COMMITTS_BUFFER,
SYNC_HANDLER_COMMIT_TS,
@@ -582,11 +584,17 @@ static int
ZeroCommitTsPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(CommitTsCtl, pageno);
+ page = CommitTsCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -946,12 +954,12 @@ CommitTsPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
+ return XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index db3423f12e..edc56a763e 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -83,6 +83,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "postmaster/autovacuum.h"
+#include "storage/bufpage.h"
#include "storage/lmgr.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
@@ -106,7 +107,7 @@
*/
/* We need four bytes per offset */
-#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
+#define MULTIXACT_OFFSETS_PER_PAGE (CapacityOfPageContents / sizeof(MultiXactOffset))
#define MultiXactIdToOffsetPage(xid) \
((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
@@ -138,7 +139,7 @@
/* size in bytes of a complete group */
#define MULTIXACT_MEMBERGROUP_SIZE \
(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
-#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (CapacityOfPageContents / MULTIXACT_MEMBERGROUP_SIZE)
#define MULTIXACT_MEMBERS_PER_PAGE \
(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
@@ -366,7 +367,7 @@ static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
MultiXactOffset start, uint32 distance);
static bool SetOffsetVacuumLimit(bool is_startup);
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
-static void WriteMZeroPageXlogRec(int64 pageno, uint8 info);
+static XLogRecPtr WriteMZeroPageXlogRec(int64 pageno, uint8 info);
static void WriteMTruncateXlogRec(Oid oldestMultiDB,
MultiXactId startTruncOff,
MultiXactId endTruncOff,
@@ -884,7 +885,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
* take the trouble to generalize the slru.c error reporting code.
*/
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
*offptr = offset;
@@ -921,12 +922,12 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
}
memberptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
*memberptr = members[i].xid;
flagsptr = (uint32 *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
flagsval = *flagsptr;
flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
@@ -1348,7 +1349,7 @@ retry:
entryno = MultiXactIdToOffsetEntry(multi);
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
@@ -1381,7 +1382,7 @@ retry:
if (pageno != prev_pageno)
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
nextMXOffset = *offptr;
@@ -1424,7 +1425,7 @@ retry:
}
xactptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
if (!TransactionIdIsValid(*xactptr))
{
@@ -1435,7 +1436,7 @@ retry:
flagsoff = MXOffsetToFlagsOffset(offset);
bshift = MXOffsetToFlagsBitShift(offset);
- flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ flagsptr = (uint32 *) (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
ptr[truelength].xid = *xactptr;
ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
@@ -1834,8 +1835,8 @@ MultiXactShmemSize(void)
mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
size = SHARED_MULTIXACT_STATE_SIZE;
- size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS, 0));
- size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS));
+ size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS));
return size;
}
@@ -1851,14 +1852,14 @@ MultiXactShmemInit(void)
MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
SimpleLruInit(MultiXactOffsetCtl,
- "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0,
+ "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS,
MultiXactOffsetSLRULock, "pg_multixact/offsets",
LWTRANCHE_MULTIXACTOFFSET_BUFFER,
SYNC_HANDLER_MULTIXACT_OFFSET,
false);
SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
SimpleLruInit(MultiXactMemberCtl,
- "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0,
+ "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS,
MultiXactMemberSLRULock, "pg_multixact/members",
LWTRANCHE_MULTIXACTMEMBER_BUFFER,
SYNC_HANDLER_MULTIXACT_MEMBER,
@@ -1933,11 +1934,17 @@ static int
ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
+ page = MultiXactOffsetCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -1949,11 +1956,17 @@ static int
ZeroMultiXactMemberPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
+ page = MultiXactMemberCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -2071,10 +2084,10 @@ TrimMultiXact(void)
MultiXactOffset *offptr;
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
- MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
+ MemSet(offptr, 0, CapacityOfPageContents - (entryno * sizeof(MultiXactOffset)));
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
}
@@ -2104,9 +2117,9 @@ TrimMultiXact(void)
memberoff = MXOffsetToMemberOffset(offset);
slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
xidptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
- MemSet(xidptr, 0, BLCKSZ - memberoff);
+ MemSet(xidptr, 0, CapacityOfPageContents - memberoff);
/*
* Note: we don't need to zero out the flag bits in the remaining
@@ -2758,7 +2771,7 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
LWLockRelease(MultiXactOffsetSLRULock);
@@ -3192,12 +3205,12 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
* Write an xlog record reflecting the zeroing of either a MEMBERs or
* OFFSETs page (info shows which)
*/
-static void
+static XLogRecPtr
WriteMZeroPageXlogRec(int64 pageno, uint8 info)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_MULTIXACT_ID, info);
+ return XLogInsert(RM_MULTIXACT_ID, info);
}
/*
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 7a371d9034..d7f3deea7d 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -57,6 +57,7 @@
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/shmem.h"
@@ -154,13 +155,13 @@ typedef enum
SLRU_WRITE_FAILED,
SLRU_FSYNC_FAILED,
SLRU_CLOSE_FAILED,
+ SLRU_DATA_CORRUPTED,
} SlruErrorCause;
static SlruErrorCause slru_errcause;
static int slru_errno;
-static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno);
@@ -179,7 +180,7 @@ static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno);
*/
Size
-SimpleLruShmemSize(int nslots, int nlsns)
+SimpleLruShmemSize(int nslots)
{
Size sz;
@@ -192,9 +193,6 @@ SimpleLruShmemSize(int nslots, int nlsns)
sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
- if (nlsns > 0)
- sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
-
return BUFFERALIGN(sz) + BLCKSZ * nslots;
}
@@ -204,14 +202,13 @@ SimpleLruShmemSize(int nslots, int nlsns)
* ctl: address of local (unshared) control structure.
* name: name of SLRU. (This is user-visible, pick with care!)
* nslots: number of page slots to use.
- * nlsns: number of LSN groups per page (set to zero if not relevant).
* ctllock: LWLock to use to control access to the shared control structure.
* subdir: PGDATA-relative subdirectory that will contain the files.
* tranche_id: LWLock tranche ID to use for the SLRU's per-buffer LWLocks.
* sync_handler: which set of functions to use to handle sync requests
*/
void
-SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
+SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
LWLock *ctllock, const char *subdir, int tranche_id,
SyncRequestHandler sync_handler, bool long_segment_names)
{
@@ -219,7 +216,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
bool found;
shared = (SlruShared) ShmemInitStruct(name,
- SimpleLruShmemSize(nslots, nlsns),
+ SimpleLruShmemSize(nslots),
&found);
if (!IsUnderPostmaster)
@@ -236,7 +233,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
shared->ControlLock = ctllock;
shared->num_slots = nslots;
- shared->lsn_groups_per_page = nlsns;
shared->cur_lru_count = 0;
@@ -261,12 +257,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
shared->buffer_locks = (LWLockPadded *) (ptr + offset);
offset += MAXALIGN(nslots * sizeof(LWLockPadded));
- if (nlsns > 0)
- {
- shared->group_lsn = (XLogRecPtr *) (ptr + offset);
- offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
- }
-
ptr += BUFFERALIGN(offset);
for (slotno = 0; slotno < nslots; slotno++)
{
@@ -281,7 +271,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
}
/* Should fit to estimated shmem size */
- Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
+ Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots));
}
else
Assert(found);
@@ -323,11 +313,8 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
shared->page_dirty[slotno] = true;
SlruRecentlyUsed(shared, slotno);
- /* Set the buffer to zeroes */
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
-
- /* Set the LSNs for this new page to zero */
- SimpleLruZeroLSNs(ctl, slotno);
+ /* Initialize the page. */
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
/* Assume this page is now the latest active page */
shared->latest_page_number = pageno;
@@ -338,26 +325,6 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
return slotno;
}
-/*
- * Zero all the LSNs we store for this slru page.
- *
- * This should be called each time we create a new page, and each time we read
- * in a page from disk into an existing buffer. (Such an old page cannot
- * have any interesting LSNs, since we'd have flushed them before writing
- * the page in the first place.)
- *
- * This assumes that InvalidXLogRecPtr is bitwise-all-0.
- */
-static void
-SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
-{
- SlruShared shared = ctl->shared;
-
- if (shared->lsn_groups_per_page > 0)
- MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
- shared->lsn_groups_per_page * sizeof(XLogRecPtr));
-}
-
/*
* Wait for any active I/O on a page slot to finish. (This does not
* guarantee that new I/O hasn't been started before we return, though.
@@ -478,9 +445,6 @@ SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok,
/* Do the read */
ok = SlruPhysicalReadPage(ctl, pageno, slotno);
- /* Set the LSNs for this newly read-in page to zero */
- SimpleLruZeroLSNs(ctl, slotno);
-
/* Re-acquire control lock and update page state */
LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
@@ -740,7 +704,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
ereport(LOG,
(errmsg("file \"%s\" doesn't exist, reading as zeroes",
path)));
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
return true;
}
@@ -763,6 +727,13 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
return false;
}
+ if (!PageIsVerifiedExtended(shared->page_buffer[slotno], pageno, PIV_REPORT_STAT))
+ {
+ slru_errcause = SLRU_DATA_CORRUPTED;
+ slru_errno = 0;
+ return false;
+ }
+
return true;
}
@@ -789,6 +760,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
off_t offset = rpageno * BLCKSZ;
char path[MAXPGPATH];
int fd = -1;
+ Page page = shared->page_buffer[slotno];
+ XLogRecPtr lsn;
/* update the stats counter of written pages */
pgstat_count_slru_page_written(shared->slru_stats_idx);
@@ -798,41 +771,18 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
* write out data before associated WAL records. This is the same action
* performed during FlushBuffer() in the main buffer manager.
*/
- if (shared->group_lsn != NULL)
+ lsn = PageGetLSN(page);
+ if (!XLogRecPtrIsInvalid(lsn))
{
/*
- * We must determine the largest async-commit LSN for the page. This
- * is a bit tedious, but since this entire function is a slow path
- * anyway, it seems better to do this here than to maintain a per-page
- * LSN variable (which'd need an extra comparison in the
- * transaction-commit path).
+ * As noted above, elog(ERROR) is not acceptable here, so if
+ * XLogFlush were to fail, we must PANIC. This isn't much of a
+ * restriction because XLogFlush is just about all critical
+ * section anyway, but let's make sure.
*/
- XLogRecPtr max_lsn;
- int lsnindex,
- lsnoff;
-
- lsnindex = slotno * shared->lsn_groups_per_page;
- max_lsn = shared->group_lsn[lsnindex++];
- for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
- {
- XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
-
- if (max_lsn < this_lsn)
- max_lsn = this_lsn;
- }
-
- if (!XLogRecPtrIsInvalid(max_lsn))
- {
- /*
- * As noted above, elog(ERROR) is not acceptable here, so if
- * XLogFlush were to fail, we must PANIC. This isn't much of a
- * restriction because XLogFlush is just about all critical
- * section anyway, but let's make sure.
- */
- START_CRIT_SECTION();
- XLogFlush(max_lsn);
- END_CRIT_SECTION();
- }
+ START_CRIT_SECTION();
+ XLogFlush(lsn);
+ END_CRIT_SECTION();
}
/*
@@ -899,6 +849,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
}
}
+ PageSetChecksumInplace(shared->page_buffer[slotno], pageno);
+
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
@@ -1019,6 +971,13 @@ SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
errdetail("Could not close file \"%s\": %m.",
path)));
break;
+ case SLRU_DATA_CORRUPTED:
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("Invalid page from file \"%s\" at offset %d.",
+ path, offset)));
+ break;
default:
/* can't get here, we trust */
elog(ERROR, "unrecognized SimpleLru error cause: %d",
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 64673eaef6..0e07281979 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -32,6 +32,7 @@
#include "access/subtrans.h"
#include "access/transam.h"
#include "pg_trace.h"
+#include "storage/bufpage.h"
#include "utils/snapmgr.h"
@@ -49,7 +50,7 @@
*/
/* We need four bytes per xact */
-#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId))
+#define SUBTRANS_XACTS_PER_PAGE (CapacityOfPageContents / sizeof(TransactionId))
/*
* Although we return an int64 the actual value can't currently exceed
@@ -93,7 +94,7 @@ SubTransSetParent(TransactionId xid, TransactionId parent)
LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
/*
@@ -133,7 +134,7 @@ SubTransGetParent(TransactionId xid)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
parent = *ptr;
@@ -193,14 +194,14 @@ SubTransGetTopmostTransaction(TransactionId xid)
Size
SUBTRANSShmemSize(void)
{
- return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0);
+ return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS);
}
void
SUBTRANSShmemInit(void)
{
SubTransCtl->PagePrecedes = SubTransPagePrecedes;
- SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0,
+ SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS,
SubtransSLRULock, "pg_subtrans",
LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE,
false);
@@ -305,7 +306,6 @@ CheckPointSUBTRANS(void)
TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that SUBTRANS has room for a newly-allocated XID.
*
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 264f25a8f9..6b1ff0f515 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -140,6 +140,7 @@
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
@@ -162,7 +163,7 @@
* than that, so changes in that data structure won't affect user-visible
* restrictions.
*/
-#define NOTIFY_PAYLOAD_MAX_LENGTH (BLCKSZ - NAMEDATALEN - 128)
+#define NOTIFY_PAYLOAD_MAX_LENGTH (CapacityOfPageContents - NAMEDATALEN - 128)
/*
* Struct representing an entry in the global notify queue
@@ -311,7 +312,7 @@ static SlruCtlData NotifyCtlData;
#define NotifyCtl (&NotifyCtlData)
#define QUEUE_PAGESIZE BLCKSZ
-
+#define QUEUE_PAGE_CAPACITY (QUEUE_PAGESIZE - MAXALIGN(SizeOfPageHeaderData))
#define QUEUE_FULL_WARN_INTERVAL 5000 /* warn at most once every 5s */
/*
@@ -492,7 +493,7 @@ AsyncShmemSize(void)
size = mul_size(MaxBackends + 1, sizeof(QueueBackendStatus));
size = add_size(size, offsetof(AsyncQueueControl, backend));
- size = add_size(size, SimpleLruShmemSize(NUM_NOTIFY_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_NOTIFY_BUFFERS));
return size;
}
@@ -541,7 +542,7 @@ AsyncShmemInit(void)
* names are used in order to avoid wraparound.
*/
NotifyCtl->PagePrecedes = asyncQueuePagePrecedes;
- SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS, 0,
+ SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS,
NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER,
SYNC_HANDLER_NONE, true);
@@ -1301,14 +1302,14 @@ asyncQueueAdvance(volatile QueuePosition *position, int entryLength)
* written or read.
*/
offset += entryLength;
- Assert(offset <= QUEUE_PAGESIZE);
+ Assert(offset <= QUEUE_PAGE_CAPACITY);
/*
* In a second step check if another entry can possibly be written to the
* page. If so, stay here, we have reached the next position. If not, then
* we need to move on to the next page.
*/
- if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGESIZE)
+ if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGE_CAPACITY)
{
pageno++;
offset = 0;
@@ -1408,7 +1409,7 @@ asyncQueueAddEntries(ListCell *nextNotify)
offset = QUEUE_POS_OFFSET(queue_head);
/* Check whether the entry really fits on the current page */
- if (offset + qe.length <= QUEUE_PAGESIZE)
+ if (offset + qe.length <= QUEUE_PAGE_CAPACITY)
{
/* OK, so advance nextNotify past this item */
nextNotify = lnext(pendingNotifies->events, nextNotify);
@@ -1420,14 +1421,14 @@ asyncQueueAddEntries(ListCell *nextNotify)
* only check dboid and since it won't match any reader's database
* OID, they will ignore this entry and move on.
*/
- qe.length = QUEUE_PAGESIZE - offset;
+ qe.length = QUEUE_PAGE_CAPACITY - offset;
qe.dboid = InvalidOid;
qe.data[0] = '\0'; /* empty channel */
qe.data[1] = '\0'; /* empty payload */
}
/* Now copy qe into the shared buffer page */
- memcpy(NotifyCtl->shared->page_buffer[slotno] + offset,
+ memcpy(PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + offset,
&qe,
qe.length);
@@ -1947,10 +1948,10 @@ asyncQueueReadAllNotifications(void)
else
{
/* fetch all the rest of the page */
- copysize = QUEUE_PAGESIZE - curoffset;
+ copysize = QUEUE_PAGE_CAPACITY - curoffset;
}
- memcpy(page_buffer.buf + curoffset,
- NotifyCtl->shared->page_buffer[slotno] + curoffset,
+ memcpy(PageGetContents(page_buffer.buf) + curoffset,
+ PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + curoffset,
copysize);
/* Release lock that we got from SimpleLruReadPage_ReadOnly() */
LWLockRelease(NotifySLRULock);
@@ -2021,7 +2022,7 @@ asyncQueueProcessPageEntries(volatile QueuePosition *current,
if (QUEUE_POS_EQUAL(thisentry, stop))
break;
- qe = (AsyncQueueEntry *) (page_buffer + QUEUE_POS_OFFSET(thisentry));
+ qe = (AsyncQueueEntry *) (PageGetContents(page_buffer) + QUEUE_POS_OFFSET(thisentry));
/*
* Advance *current over this message, possibly to the next page. As
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index ff8df7c0bc..39129bce98 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -204,6 +204,7 @@
#include "pgstat.h"
#include "port/pg_lfind.h"
#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
#include "storage/predicate.h"
#include "storage/predicate_internals.h"
#include "storage/proc.h"
@@ -322,8 +323,8 @@ static SlruCtlData SerialSlruCtlData;
#define SerialSlruCtl (&SerialSlruCtlData)
#define SERIAL_PAGESIZE BLCKSZ
-#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
-#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE)
+#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
+#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE - MAXALIGN(SizeOfPageHeaderData) / SERIAL_ENTRYSIZE)
/*
* Set maximum pages based on the number needed to track all transactions.
@@ -333,7 +334,7 @@ static SlruCtlData SerialSlruCtlData;
#define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1)
#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \
- (SerialSlruCtl->shared->page_buffer[slotno] + \
+ (PageGetContents(SerialSlruCtl->shared->page_buffer[slotno]) + \
((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE))))
#define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE)
@@ -785,10 +786,13 @@ SerialPagePrecedesLogicallyUnitTests(void)
* requires burning ~2B XIDs in single-user mode, a negligible
* possibility. Moreover, if it does happen, the consequence would be
* mild, namely a new transaction failing in SimpleLruReadPage().
+ *
+ * NOTE: After adding the page header, the defect affects two pages.
+ * We now assert correct treatment of its second to prior page.
*/
headPage = oldestPage;
targetPage = newestPage;
- Assert(SerialPagePrecedesLogically(headPage, targetPage - 1));
+ Assert(SerialPagePrecedesLogically(headPage, targetPage - 2));
#if 0
Assert(SerialPagePrecedesLogically(headPage, targetPage));
#endif
@@ -808,7 +812,7 @@ SerialInit(void)
*/
SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
SimpleLruInit(SerialSlruCtl, "Serial",
- NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
+ NUM_SERIAL_BUFFERS, SerialSLRULock, "pg_serial",
LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE,
false);
#ifdef USE_ASSERT_CHECKING
@@ -1348,7 +1352,7 @@ PredicateLockShmemSize(void)
/* Shared memory structures for SLRU tracking of old committed xids. */
size = add_size(size, sizeof(SerialControlData));
- size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS));
return size;
}
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 9a302ddc30..4c002490a2 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -59,6 +59,31 @@ PageInit(Page page, Size pageSize, Size specialSize)
/* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */
}
+/*
+ * PageInitSLRU
+ * Initializes the contents of an SLRU page.
+ * Note that we don't calculate an initial checksum here; that's not done
+ * until it's time to write.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_METAPAGE_LAYOUT_VERSION);
+}
/*
* PageIsVerifiedExtended
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 6543d9ce08..cfbc239843 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -16,6 +16,7 @@
#include <dirent.h>
#include <limits.h>
+#include <stdbool.h>
#include <time.h>
#include <sys/stat.h>
#include <unistd.h>
@@ -589,12 +590,20 @@ main(int argc, char *argv[])
{
total_size = scan_directory(DataDir, "global", true);
total_size += scan_directory(DataDir, "base", true);
+ total_size += scan_directory(DataDir, "pg_commit_ts", true);
+ total_size += scan_directory(DataDir, "pg_multixact", true);
+ total_size += scan_directory(DataDir, "pg_serial", true);
total_size += scan_directory(DataDir, "pg_tblspc", true);
+ total_size += scan_directory(DataDir, "pg_xact", true);
}
(void) scan_directory(DataDir, "global", false);
(void) scan_directory(DataDir, "base", false);
+ (void) scan_directory(DataDir, "pg_commit_ts", false);
+ (void) scan_directory(DataDir, "pg_multixact", false);
+ (void) scan_directory(DataDir, "pg_serial", false);
(void) scan_directory(DataDir, "pg_tblspc", false);
+ (void) scan_directory(DataDir, "pg_xact", false);
if (showprogress)
progress_report(true);
diff --git a/src/bin/pg_resetwal/t/001_basic.pl b/src/bin/pg_resetwal/t/001_basic.pl
index 18d0882cb1..ae74828e44 100644
--- a/src/bin/pg_resetwal/t/001_basic.pl
+++ b/src/bin/pg_resetwal/t/001_basic.pl
@@ -206,7 +206,7 @@ push @cmd,
sprintf("%d,%d", hex($files[0]) == 0 ? 3 : hex($files[0]), hex($files[-1]));
@files = get_slru_files('pg_multixact/offsets');
-$mult = 32 * $blcksz / 4;
+$mult = 32 * ($blcksz - 24) / 4;
# -m argument is "new,old"
push @cmd, '-m',
sprintf("%d,%d",
@@ -214,11 +214,11 @@ push @cmd, '-m',
hex($files[0]) == 0 ? 1 : hex($files[0] * $mult));
@files = get_slru_files('pg_multixact/members');
-$mult = 32 * int($blcksz / 20) * 4;
+$mult = 32 * int(($blcksz - 24) / 20) * 4;
push @cmd, '-O', (hex($files[-1]) + 1) * $mult;
@files = get_slru_files('pg_xact');
-$mult = 32 * $blcksz * 4;
+$mult = 32 * ($blcksz - 24) * 4;
push @cmd,
'-u', (hex($files[0]) == 0 ? 3 : hex($files[0]) * $mult),
'-x', ((hex($files[-1]) + 1) * $mult);
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 091e2202c9..24733166b8 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -68,17 +68,6 @@ typedef struct SlruSharedData
int *page_lru_count;
LWLockPadded *buffer_locks;
- /*
- * Optional array of WAL flush LSNs associated with entries in the SLRU
- * pages. If not zero/NULL, we must flush WAL before writing pages (true
- * for pg_xact, false for multixact, pg_subtrans, pg_notify). group_lsn[]
- * has lsn_groups_per_page entries per buffer slot, each containing the
- * highest LSN known for a contiguous group of SLRU entries on that slot's
- * page.
- */
- XLogRecPtr *group_lsn;
- int lsn_groups_per_page;
-
/*----------
* We mark a page "most recently used" by setting
* page_lru_count[slotno] = ++cur_lru_count;
@@ -147,8 +136,8 @@ typedef struct SlruCtlData
typedef SlruCtlData *SlruCtl;
-extern Size SimpleLruShmemSize(int nslots, int nlsns);
-extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
+extern Size SimpleLruShmemSize(int nslots);
+extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
LWLock *ctllock, const char *subdir, int tranche_id,
SyncRequestHandler sync_handler,
bool long_segment_names);
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 424ecba028..251d9523fa 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -201,6 +201,7 @@ typedef PageHeaderData *PageHeader;
* handling pages.
*/
#define PG_PAGE_LAYOUT_VERSION 4
+#define PG_SLRU_PAGE_LAYOUT_VERSION 1
#define PG_DATA_CHECKSUM_VERSION 1
/* ----------------------------------------------------------------
@@ -257,6 +258,11 @@ PageGetContents(Page page)
return (char *) page + MAXALIGN(SizeOfPageHeaderData);
}
+/*
+ * Space available for storing page contents.
+ */
+#define SizeOfPageContents (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
/* ----------------
* functions to access page size info
* ----------------
@@ -486,6 +492,7 @@ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
"BLCKSZ has to be a multiple of sizeof(size_t)");
extern void PageInit(Page page, Size pageSize, Size specialSize);
+extern void PageInitSLRU(Page page, Size pageSize, Size specialSize);
extern bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags);
extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
OffsetNumber offsetNumber, int flags);
diff --git a/src/test/modules/test_slru/test_slru.c b/src/test/modules/test_slru/test_slru.c
index d0fb9444e8..fea12e0080 100644
--- a/src/test/modules/test_slru/test_slru.c
+++ b/src/test/modules/test_slru/test_slru.c
@@ -17,6 +17,7 @@
#include "access/slru.h"
#include "access/transam.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
@@ -76,8 +77,8 @@ test_slru_page_write(PG_FUNCTION_ARGS)
TestSlruCtl->shared->page_status[slotno] = SLRU_PAGE_VALID;
/* write given data to the page, up to the limit of the page */
- strncpy(TestSlruCtl->shared->page_buffer[slotno], data,
- BLCKSZ - 1);
+ strncpy(PageGetContents(TestSlruCtl->shared->page_buffer[slotno]), data,
+ CapacityOfPageContents - 1);
SimpleLruWritePage(TestSlruCtl, slotno);
LWLockRelease(TestSLRULock);
@@ -104,7 +105,7 @@ test_slru_page_read(PG_FUNCTION_ARGS)
LWLockAcquire(TestSLRULock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(TestSlruCtl, pageno,
write_ok, InvalidTransactionId);
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(TestSLRULock);
PG_RETURN_TEXT_P(cstring_to_text(data));
@@ -122,7 +123,7 @@ test_slru_page_readonly(PG_FUNCTION_ARGS)
pageno,
InvalidTransactionId);
Assert(LWLockHeldByMe(TestSLRULock));
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(TestSLRULock);
PG_RETURN_TEXT_P(cstring_to_text(data));
@@ -202,7 +203,7 @@ test_slru_shmem_request(void)
prev_shmem_request_hook();
/* reserve shared memory for the test SLRU */
- RequestAddinShmemSpace(SimpleLruShmemSize(NUM_TEST_BUFFERS, 0));
+ RequestAddinShmemSpace(SimpleLruShmemSize(NUM_TEST_BUFFERS));
}
static bool
@@ -238,7 +239,7 @@ test_slru_shmem_startup(void)
TestSlruCtl->PagePrecedes = test_slru_page_precedes_logically;
SimpleLruInit(TestSlruCtl, "TestSLRU",
- NUM_TEST_BUFFERS, 0, TestSLRULock, slru_dir_name,
+ NUM_TEST_BUFFERS, TestSLRULock, slru_dir_name,
test_tranche_id, SYNC_HANDLER_NONE, long_segment_names);
}
Hi Yong!
+1 to the idea to protect SLRUs from corruption. I'm slightly leaning towards the idea of separating checksums from data pages, but anyway this checksums are better than no checksums.
On 7 Dec 2023, at 10:06, Li, Yong <yoli@ebay.com> wrote:
I am still working on patching the pg_upgrade. Just love to hear your thoughts on the idea and the current patch.
FWIW you can take upgrade code from this patch [0]/messages/by-id/B8F85F94-ECB0-484E-96B8-21D928C8E711@yandex-team.ru doing all the same stuff :)
Best regards, Andrey Borodin.
[0]: /messages/by-id/B8F85F94-ECB0-484E-96B8-21D928C8E711@yandex-team.ru
Hi,
+1 to the idea to protect SLRUs from corruption. I'm slightly leaning towards the idea of separating checksums from data pages, but anyway this checksums are better than no checksums.
On 7 Dec 2023, at 10:06, Li, Yong <yoli@ebay.com> wrote:
I am still working on patching the pg_upgrade. Just love to hear your thoughts on the idea and the current patch.
FWIW you can take upgrade code from this patch [0] doing all the same stuff :)
Sounds like a half-measure to me. If we really want to go down this
rabbit hole IMO SLRU should be moved to shared buffers as proposed
elsewhere [1]/messages/by-id/EFAAC0BE-27E9-4186-B925-79B7C696D5AC@amazon.com.
[1]: /messages/by-id/EFAAC0BE-27E9-4186-B925-79B7C696D5AC@amazon.com
--
Best regards,
Aleksander Alekseev
<br /><br /><br />07.12.2023, 19:17, "Aleksander Alekseev" <aleksander@timescale.com>:<br /><blockquote><p>Hi,<br /><br /></p><blockquote class="210e7a848e8fcb45wmi-quote"> +1 to the idea to protect SLRUs from corruption. I'm slightly leaning towards the idea of separating checksums from data pages, but anyway this checksums are better than no checksums.<br /><br /> On 7 Dec 2023, at 10:06, Li, Yong <<a href="mailto:yoli@ebay.com">yoli@ebay.com</a>> wrote:<br /><br /> I am still working on patching the pg_upgrade. Just love to hear your thoughts on the idea and the current patch.<br /><br /> FWIW you can take upgrade code from this patch [0] doing all the same stuff :)<br /></blockquote><p><br />Sounds like a half-measure to me. If we really want to go down this<br />rabbit hole IMO SLRU should be moved to shared buffers as proposed<br />elsewhere [1].<br /></p></blockquote>Thread that I cited stopped in 2018 for this exact reason. 5 years ago. Is this argument still valid?<div>Meanwhile checksums of buffer pages also reside on a page :)</div><div><br /></div><div>Best regards, Andrey Borodin.</div>
Sounds like a half-measure to me. If we really want to go down this
rabbit hole IMO SLRU should be moved to shared buffers as proposed
elsewhere [1].
Thread that I cited stopped in 2018 for this exact reason. 5 years ago. Is this argument still valid?
Meanwhile checksums of buffer pages also reside on a page :)
I would love to have seen more progress on the set of threads that proposed
the page header and integration of SLRU into buffer cache. The changes were
large, and unfortunately as a result, it didn't get the detailed review
that it needed. The complex nature of the feature allowed for more branches
to be split from the main thread with alternative approaches. Athough this is
great to see, it did result in the set of core requirements around LSN and
checksum tracking via page headers to not get into PG 16.
What is being proposed now is the simple and core functionality of introducing
page headers to SLRU pages while continuing to be in the SLRU cache. This
allows the whole project to be iterative and reviewers to better reason about
the smaller set of changes being introduced into the codebase.
Once the set of on-disk changes are in, we can follow up on optimizations.
It may be moving to buffer cache or reviewing Dilip's approach in [1]/messages/by-id/CAFiTN-vzDvNz=ExGXz6gdyjtzGixKSqs0mKHMmaQ8sOSEFZ33A@mail.gmail.com, we
will have the option to be flexible in our approach.
[1]: /messages/by-id/CAFiTN-vzDvNz=ExGXz6gdyjtzGixKSqs0mKHMmaQ8sOSEFZ33A@mail.gmail.com
Shawn
On Thu, Dec 7, 2023 at 1:28 PM Debnath, Shawn <sdn@ebay.com> wrote:
What is being proposed now is the simple and core functionality of introducing
page headers to SLRU pages while continuing to be in the SLRU cache. This
allows the whole project to be iterative and reviewers to better reason about
the smaller set of changes being introduced into the codebase.Once the set of on-disk changes are in, we can follow up on optimizations.
It may be moving to buffer cache or reviewing Dilip's approach in [1], we
will have the option to be flexible in our approach.
I basically agree with this. I don't think we should let the perfect
be the enemy of the good. Shooting down this patch because it doesn't
do everything that we want is a recipe for getting nothing done at
all.
That said, I don't think that the original post on this thread
provides a sufficiently clear and detailed motivation for making this
change. For this to eventually be committed, it's going to need (among
other things) a commit message that articulates a convincing rationale
for whatever changes it makes. Here's what the original email said:
It adds a checksum to each SLRU page, tracks page LSN as if it is a standard page and eases future page enhancements.
Of those three things, in my opinion, the first is good and the other
two are too vague. I assume that most people who would be likely to
read a commit message would understand the value of pages having
checksums. But I can't immediately think of what the value of tracking
the page LSN as if it were a standard page might be, so that probably
needs more explanation. Similarly, at least one or two of the future
page enhancements that might be eased should be spelled out, and/or
the ways in which they would be made easier should be articulated.
--
Robert Haas
EDB: http://www.enterprisedb.com
Given so many different approaches were discussed, I have started a wiki to record and collaborate all efforts towards SLRU improvements. The wiki provides a concise overview of all the ideas discussed and can serve as a portal for all historical discussions. Currently, the wiki summarizes four recent threads ranging from identifier format change to page header change, to moving SLRU into the main buffer pool, to reduce lock contention on SLRU latches. We can keep the patch related discussions in this thread and use the wiki as a live document for larger scale collaborations.
The wiki page is here: https://wiki.postgresql.org/wiki/SLRU_improvements
Regarding the benefits of this patch, here is a detailed explanation:
1. Checksum is added to each page, allowing us to verify if a page has been corrupted when read from the disk.
2. The ad-hoc LSN group structure is removed from the SLRU cache control data and is replaced by the page LSN in the page header. This allows us to use the same WAL protocol as used by pages in the main buffer pool: flush all redo logs up to the page LSN before flushing the page itself. If we move SLRU caches into the main buffer pool, this change fits naturally.
3. It leaves further optimizations open. We can continue to pursue the goal of moving SLRU into the main buffer pool, or we can follow the lock partition idea. This change by itself does not conflict with either proposal.
Also, the patch is now complete and is ready for review. All check-world tests including tap tests passed with this patch.
Regards,
Yong
From: Robert Haas <robertmhaas@gmail.com>
Date: Friday, December 8, 2023 at 03:51
To: Debnath, Shawn <sdn@ebay.com>
Cc: Andrey Borodin <x4mmm@yandex-team.ru>, PostgreSQL Hackers <pgsql-hackers@lists.postgresql.org>, Aleksander Alekseev <aleksander@timescale.com>, Li, Yong <yoli@ebay.com>, Shyrabokau, Anton <antons@ebay.com>, Bagga, Rishu <bagrishu@amazon.com>
Subject: Re: Proposal to add page headers to SLRU pages
External Email
On Thu, Dec 7, 2023 at 1:28 PM Debnath, Shawn <sdn@ebay.com> wrote:
What is being proposed now is the simple and core functionality of introducing
page headers to SLRU pages while continuing to be in the SLRU cache. This
allows the whole project to be iterative and reviewers to better reason about
the smaller set of changes being introduced into the codebase.Once the set of on-disk changes are in, we can follow up on optimizations.
It may be moving to buffer cache or reviewing Dilip's approach in [1], we
will have the option to be flexible in our approach.
I basically agree with this. I don't think we should let the perfect
be the enemy of the good. Shooting down this patch because it doesn't
do everything that we want is a recipe for getting nothing done at
all.
That said, I don't think that the original post on this thread
provides a sufficiently clear and detailed motivation for making this
change. For this to eventually be committed, it's going to need (among
other things) a commit message that articulates a convincing rationale
for whatever changes it makes. Here's what the original email said:
It adds a checksum to each SLRU page, tracks page LSN as if it is a standard page and eases future page enhancements.
Of those three things, in my opinion, the first is good and the other
two are too vague. I assume that most people who would be likely to
read a commit message would understand the value of pages having
checksums. But I can't immediately think of what the value of tracking
the page LSN as if it were a standard page might be, so that probably
needs more explanation. Similarly, at least one or two of the future
page enhancements that might be eased should be spelled out, and/or
the ways in which they would be made easier should be articulated.
Attachments:
slru_page_header_v1.patchapplication/octet-stream; name=slru_page_header_v1.patchDownload
src/backend/access/transam/clog.c | 52 ++++-----
src/backend/access/transam/commit_ts.c | 26 +++--
src/backend/access/transam/multixact.c | 63 ++++++----
src/backend/access/transam/slru.c | 113 ++++++------------
src/backend/access/transam/subtrans.c | 12 +-
src/backend/commands/async.c | 27 ++---
src/backend/storage/lmgr/predicate.c | 16 ++-
src/backend/storage/page/bufpage.c | 25 ++++
src/bin/pg_checksums/pg_checksums.c | 9 ++
src/bin/pg_resetwal/t/001_basic.pl | 6 +-
src/bin/pg_upgrade/Makefile | 1 +
src/bin/pg_upgrade/meson.build | 1 +
src/bin/pg_upgrade/nls.mk | 1 +
src/bin/pg_upgrade/pg_upgrade.c | 23 +++-
src/bin/pg_upgrade/pg_upgrade.h | 10 ++
src/bin/pg_upgrade/slru.c | 205 +++++++++++++++++++++++++++++++++
src/include/access/slru.h | 15 +--
src/include/catalog/catversion.h | 2 +-
src/include/storage/bufpage.h | 7 ++
src/test/modules/test_slru/test_slru.c | 13 ++-
20 files changed, 435 insertions(+), 192 deletions(-)
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index cc60eab1e2..33a0bcaea9 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -41,6 +41,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/proc.h"
#include "storage/sync.h"
@@ -59,7 +60,7 @@
/* We need two bits per xact, so four xacts fit in a byte */
#define CLOG_BITS_PER_XACT 2
#define CLOG_XACTS_PER_BYTE 4
-#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+#define CLOG_XACTS_PER_PAGE (SizeOfPageContents * CLOG_XACTS_PER_BYTE)
#define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1)
@@ -77,13 +78,6 @@ TransactionIdToPage(TransactionId xid)
#define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE)
#define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE)
-/* We store the latest async LSN for each group of transactions */
-#define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */
-#define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)
-
-#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \
- ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
-
/*
* The number of subtransactions below which we consider to apply clog group
* update optimization. Testing reveals that the number higher than this can
@@ -101,7 +95,7 @@ static SlruCtlData XactCtlData;
static int ZeroCLOGPage(int64 pageno, bool writeXlog);
static bool CLOGPagePrecedes(int64 page1, int64 page2);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact,
Oid oldestXactDb);
static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
@@ -583,8 +577,9 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
char *byteptr;
char byteval;
char curval;
+ Page page = XactCtl->shared->page_buffer[slotno];
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ byteptr = PageGetContents(page) + byteno;
curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
/*
@@ -613,7 +608,7 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*byteptr = byteval;
/*
- * Update the group LSN if the transaction completion LSN is higher.
+ * Update the page LSN if the transaction completion LSN is higher.
*
* Note: lsn will be invalid when supplied during InRecovery processing,
* so we don't need to do anything special to avoid LSN updates during
@@ -622,10 +617,8 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*/
if (!XLogRecPtrIsInvalid(lsn))
{
- int lsnindex = GetLSNIndex(slotno, xid);
-
- if (XactCtl->shared->group_lsn[lsnindex] < lsn)
- XactCtl->shared->group_lsn[lsnindex] = lsn;
+ if (PageGetLSN(page) < lsn)
+ PageSetLSN(page, lsn);
}
}
@@ -651,19 +644,19 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
int byteno = TransactionIdToByte(xid);
int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
int slotno;
- int lsnindex;
+ Page page;
char *byteptr;
XidStatus status;
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ page = XactCtl->shared->page_buffer[slotno];
+ byteptr = PageGetContents(page) + byteno;
status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
- lsnindex = GetLSNIndex(slotno, xid);
- *lsn = XactCtl->shared->group_lsn[lsnindex];
+ *lsn = PageGetLSN(page);
LWLockRelease(XactSLRULock);
@@ -698,14 +691,14 @@ CLOGShmemBuffers(void)
Size
CLOGShmemSize(void)
{
- return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE);
+ return SimpleLruShmemSize(CLOGShmemBuffers());
}
void
CLOGShmemInit(void)
{
XactCtl->PagePrecedes = CLOGPagePrecedes;
- SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
+ SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(),
XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER,
SYNC_HANDLER_CLOG, false);
SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE);
@@ -747,11 +740,17 @@ static int
ZeroCLOGPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(XactCtl, pageno);
+ page = XactCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -807,12 +806,12 @@ TrimCLOG(void)
char *byteptr;
slotno = SimpleLruReadPage(XactCtl, pageno, false, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ byteptr = PageGetContents(XactCtl->shared->page_buffer[slotno]) + byteno;
/* Zero so-far-unused positions in the current byte */
*byteptr &= (1 << bshift) - 1;
/* Zero the rest of the page */
- MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
+ MemSet(byteptr + 1, 0, SizeOfPageContents - byteno - 1);
XactCtl->shared->page_dirty[slotno] = true;
}
@@ -836,7 +835,6 @@ CheckPointCLOG(void)
TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that CLOG has room for a newly-allocated XID.
*
@@ -958,12 +956,12 @@ CLOGPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
+ return XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index 7c642f7b59..6b36ee1022 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -31,6 +31,7 @@
#include "funcapi.h"
#include "miscadmin.h"
#include "pg_trace.h"
+#include "storage/bufpage.h"
#include "storage/shmem.h"
#include "utils/builtins.h"
#include "utils/snapmgr.h"
@@ -63,7 +64,7 @@ typedef struct CommitTimestampEntry
sizeof(RepOriginId))
#define COMMIT_TS_XACTS_PER_PAGE \
- (BLCKSZ / SizeOfCommitTimestampEntry)
+ (SizeOfPageContents / SizeOfCommitTimestampEntry)
/*
@@ -120,7 +121,7 @@ static int ZeroCommitTsPage(int64 pageno, bool writeXlog);
static bool CommitTsPagePrecedes(int64 page1, int64 page2);
static void ActivateCommitTs(void);
static void DeactivateCommitTs(void);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid);
/*
@@ -254,11 +255,12 @@ TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
CommitTimestampEntry entry;
Assert(TransactionIdIsNormal(xid));
+ Assert(xid == slotno * COMMIT_TS_XACTS_PER_PAGE + entryno);
entry.time = ts;
entry.nodeid = nodeid;
- memcpy(CommitTsCtl->shared->page_buffer[slotno] +
+ memcpy(PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
SizeOfCommitTimestampEntry * entryno,
&entry, SizeOfCommitTimestampEntry);
}
@@ -337,7 +339,7 @@ TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts,
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid);
memcpy(&entry,
- CommitTsCtl->shared->page_buffer[slotno] +
+ PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
SizeOfCommitTimestampEntry * entryno,
SizeOfCommitTimestampEntry);
@@ -515,7 +517,7 @@ CommitTsShmemBuffers(void)
Size
CommitTsShmemSize(void)
{
- return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) +
+ return SimpleLruShmemSize(CommitTsShmemBuffers()) +
sizeof(CommitTimestampShared);
}
@@ -529,7 +531,7 @@ CommitTsShmemInit(void)
bool found;
CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
- SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0,
+ SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(),
CommitTsSLRULock, "pg_commit_ts",
LWTRANCHE_COMMITTS_BUFFER,
SYNC_HANDLER_COMMIT_TS,
@@ -582,11 +584,17 @@ static int
ZeroCommitTsPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(CommitTsCtl, pageno);
+ page = CommitTsCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -946,12 +954,12 @@ CommitTsPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
+ return XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index db3423f12e..0970aa8855 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -83,6 +83,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "postmaster/autovacuum.h"
+#include "storage/bufpage.h"
#include "storage/lmgr.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
@@ -106,7 +107,7 @@
*/
/* We need four bytes per offset */
-#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
+#define MULTIXACT_OFFSETS_PER_PAGE (SizeOfPageContents / sizeof(MultiXactOffset))
#define MultiXactIdToOffsetPage(xid) \
((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
@@ -119,8 +120,8 @@
* additional flag bits for each TransactionId. To do this without getting
* into alignment issues, we store four bytes of flags, and then the
* corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
- * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
- * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
+ * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 408 groups
+ * per page. This wastes 8 bytes per page, but that's OK -- simplicity (and
* performance) trumps space efficiency here.
*
* Note that the "offset" macros work with byte offset, not array indexes, so
@@ -138,7 +139,7 @@
/* size in bytes of a complete group */
#define MULTIXACT_MEMBERGROUP_SIZE \
(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
-#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (SizeOfPageContents / MULTIXACT_MEMBERGROUP_SIZE)
#define MULTIXACT_MEMBERS_PER_PAGE \
(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
@@ -366,7 +367,7 @@ static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
MultiXactOffset start, uint32 distance);
static bool SetOffsetVacuumLimit(bool is_startup);
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
-static void WriteMZeroPageXlogRec(int64 pageno, uint8 info);
+static XLogRecPtr WriteMZeroPageXlogRec(int64 pageno, uint8 info);
static void WriteMTruncateXlogRec(Oid oldestMultiDB,
MultiXactId startTruncOff,
MultiXactId endTruncOff,
@@ -884,7 +885,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
* take the trouble to generalize the slru.c error reporting code.
*/
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
*offptr = offset;
@@ -921,12 +922,12 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
}
memberptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
*memberptr = members[i].xid;
flagsptr = (uint32 *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
flagsval = *flagsptr;
flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
@@ -1348,7 +1349,7 @@ retry:
entryno = MultiXactIdToOffsetEntry(multi);
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
@@ -1381,7 +1382,7 @@ retry:
if (pageno != prev_pageno)
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
nextMXOffset = *offptr;
@@ -1424,7 +1425,7 @@ retry:
}
xactptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
if (!TransactionIdIsValid(*xactptr))
{
@@ -1435,7 +1436,7 @@ retry:
flagsoff = MXOffsetToFlagsOffset(offset);
bshift = MXOffsetToFlagsBitShift(offset);
- flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ flagsptr = (uint32 *) (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
ptr[truelength].xid = *xactptr;
ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
@@ -1834,8 +1835,8 @@ MultiXactShmemSize(void)
mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
size = SHARED_MULTIXACT_STATE_SIZE;
- size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS, 0));
- size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS));
+ size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS));
return size;
}
@@ -1851,14 +1852,14 @@ MultiXactShmemInit(void)
MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
SimpleLruInit(MultiXactOffsetCtl,
- "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0,
+ "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS,
MultiXactOffsetSLRULock, "pg_multixact/offsets",
LWTRANCHE_MULTIXACTOFFSET_BUFFER,
SYNC_HANDLER_MULTIXACT_OFFSET,
false);
SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
SimpleLruInit(MultiXactMemberCtl,
- "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0,
+ "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS,
MultiXactMemberSLRULock, "pg_multixact/members",
LWTRANCHE_MULTIXACTMEMBER_BUFFER,
SYNC_HANDLER_MULTIXACT_MEMBER,
@@ -1933,11 +1934,17 @@ static int
ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
+ page = MultiXactOffsetCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -1949,11 +1956,17 @@ static int
ZeroMultiXactMemberPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
+ page = MultiXactMemberCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -2071,10 +2084,10 @@ TrimMultiXact(void)
MultiXactOffset *offptr;
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
- MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
+ MemSet(offptr, 0, SizeOfPageContents - (entryno * sizeof(MultiXactOffset)));
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
}
@@ -2104,9 +2117,9 @@ TrimMultiXact(void)
memberoff = MXOffsetToMemberOffset(offset);
slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
xidptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
- MemSet(xidptr, 0, BLCKSZ - memberoff);
+ MemSet(xidptr, 0, SizeOfPageContents - memberoff);
/*
* Note: we don't need to zero out the flag bits in the remaining
@@ -2758,7 +2771,7 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
LWLockRelease(MultiXactOffsetSLRULock);
@@ -3192,12 +3205,12 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
* Write an xlog record reflecting the zeroing of either a MEMBERs or
* OFFSETs page (info shows which)
*/
-static void
+static XLogRecPtr
WriteMZeroPageXlogRec(int64 pageno, uint8 info)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_MULTIXACT_ID, info);
+ return XLogInsert(RM_MULTIXACT_ID, info);
}
/*
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 7a371d9034..d7f3deea7d 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -57,6 +57,7 @@
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/shmem.h"
@@ -154,13 +155,13 @@ typedef enum
SLRU_WRITE_FAILED,
SLRU_FSYNC_FAILED,
SLRU_CLOSE_FAILED,
+ SLRU_DATA_CORRUPTED,
} SlruErrorCause;
static SlruErrorCause slru_errcause;
static int slru_errno;
-static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno);
@@ -179,7 +180,7 @@ static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno);
*/
Size
-SimpleLruShmemSize(int nslots, int nlsns)
+SimpleLruShmemSize(int nslots)
{
Size sz;
@@ -192,9 +193,6 @@ SimpleLruShmemSize(int nslots, int nlsns)
sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
- if (nlsns > 0)
- sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
-
return BUFFERALIGN(sz) + BLCKSZ * nslots;
}
@@ -204,14 +202,13 @@ SimpleLruShmemSize(int nslots, int nlsns)
* ctl: address of local (unshared) control structure.
* name: name of SLRU. (This is user-visible, pick with care!)
* nslots: number of page slots to use.
- * nlsns: number of LSN groups per page (set to zero if not relevant).
* ctllock: LWLock to use to control access to the shared control structure.
* subdir: PGDATA-relative subdirectory that will contain the files.
* tranche_id: LWLock tranche ID to use for the SLRU's per-buffer LWLocks.
* sync_handler: which set of functions to use to handle sync requests
*/
void
-SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
+SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
LWLock *ctllock, const char *subdir, int tranche_id,
SyncRequestHandler sync_handler, bool long_segment_names)
{
@@ -219,7 +216,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
bool found;
shared = (SlruShared) ShmemInitStruct(name,
- SimpleLruShmemSize(nslots, nlsns),
+ SimpleLruShmemSize(nslots),
&found);
if (!IsUnderPostmaster)
@@ -236,7 +233,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
shared->ControlLock = ctllock;
shared->num_slots = nslots;
- shared->lsn_groups_per_page = nlsns;
shared->cur_lru_count = 0;
@@ -261,12 +257,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
shared->buffer_locks = (LWLockPadded *) (ptr + offset);
offset += MAXALIGN(nslots * sizeof(LWLockPadded));
- if (nlsns > 0)
- {
- shared->group_lsn = (XLogRecPtr *) (ptr + offset);
- offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
- }
-
ptr += BUFFERALIGN(offset);
for (slotno = 0; slotno < nslots; slotno++)
{
@@ -281,7 +271,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
}
/* Should fit to estimated shmem size */
- Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
+ Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots));
}
else
Assert(found);
@@ -323,11 +313,8 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
shared->page_dirty[slotno] = true;
SlruRecentlyUsed(shared, slotno);
- /* Set the buffer to zeroes */
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
-
- /* Set the LSNs for this new page to zero */
- SimpleLruZeroLSNs(ctl, slotno);
+ /* Initialize the page. */
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
/* Assume this page is now the latest active page */
shared->latest_page_number = pageno;
@@ -338,26 +325,6 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
return slotno;
}
-/*
- * Zero all the LSNs we store for this slru page.
- *
- * This should be called each time we create a new page, and each time we read
- * in a page from disk into an existing buffer. (Such an old page cannot
- * have any interesting LSNs, since we'd have flushed them before writing
- * the page in the first place.)
- *
- * This assumes that InvalidXLogRecPtr is bitwise-all-0.
- */
-static void
-SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
-{
- SlruShared shared = ctl->shared;
-
- if (shared->lsn_groups_per_page > 0)
- MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
- shared->lsn_groups_per_page * sizeof(XLogRecPtr));
-}
-
/*
* Wait for any active I/O on a page slot to finish. (This does not
* guarantee that new I/O hasn't been started before we return, though.
@@ -478,9 +445,6 @@ SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok,
/* Do the read */
ok = SlruPhysicalReadPage(ctl, pageno, slotno);
- /* Set the LSNs for this newly read-in page to zero */
- SimpleLruZeroLSNs(ctl, slotno);
-
/* Re-acquire control lock and update page state */
LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
@@ -740,7 +704,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
ereport(LOG,
(errmsg("file \"%s\" doesn't exist, reading as zeroes",
path)));
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
return true;
}
@@ -763,6 +727,13 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
return false;
}
+ if (!PageIsVerifiedExtended(shared->page_buffer[slotno], pageno, PIV_REPORT_STAT))
+ {
+ slru_errcause = SLRU_DATA_CORRUPTED;
+ slru_errno = 0;
+ return false;
+ }
+
return true;
}
@@ -789,6 +760,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
off_t offset = rpageno * BLCKSZ;
char path[MAXPGPATH];
int fd = -1;
+ Page page = shared->page_buffer[slotno];
+ XLogRecPtr lsn;
/* update the stats counter of written pages */
pgstat_count_slru_page_written(shared->slru_stats_idx);
@@ -798,41 +771,18 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
* write out data before associated WAL records. This is the same action
* performed during FlushBuffer() in the main buffer manager.
*/
- if (shared->group_lsn != NULL)
+ lsn = PageGetLSN(page);
+ if (!XLogRecPtrIsInvalid(lsn))
{
/*
- * We must determine the largest async-commit LSN for the page. This
- * is a bit tedious, but since this entire function is a slow path
- * anyway, it seems better to do this here than to maintain a per-page
- * LSN variable (which'd need an extra comparison in the
- * transaction-commit path).
+ * As noted above, elog(ERROR) is not acceptable here, so if
+ * XLogFlush were to fail, we must PANIC. This isn't much of a
+ * restriction because XLogFlush is just about all critical
+ * section anyway, but let's make sure.
*/
- XLogRecPtr max_lsn;
- int lsnindex,
- lsnoff;
-
- lsnindex = slotno * shared->lsn_groups_per_page;
- max_lsn = shared->group_lsn[lsnindex++];
- for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
- {
- XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
-
- if (max_lsn < this_lsn)
- max_lsn = this_lsn;
- }
-
- if (!XLogRecPtrIsInvalid(max_lsn))
- {
- /*
- * As noted above, elog(ERROR) is not acceptable here, so if
- * XLogFlush were to fail, we must PANIC. This isn't much of a
- * restriction because XLogFlush is just about all critical
- * section anyway, but let's make sure.
- */
- START_CRIT_SECTION();
- XLogFlush(max_lsn);
- END_CRIT_SECTION();
- }
+ START_CRIT_SECTION();
+ XLogFlush(lsn);
+ END_CRIT_SECTION();
}
/*
@@ -899,6 +849,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
}
}
+ PageSetChecksumInplace(shared->page_buffer[slotno], pageno);
+
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
@@ -1019,6 +971,13 @@ SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
errdetail("Could not close file \"%s\": %m.",
path)));
break;
+ case SLRU_DATA_CORRUPTED:
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("Invalid page from file \"%s\" at offset %d.",
+ path, offset)));
+ break;
default:
/* can't get here, we trust */
elog(ERROR, "unrecognized SimpleLru error cause: %d",
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 64673eaef6..c4a912171f 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -32,6 +32,7 @@
#include "access/subtrans.h"
#include "access/transam.h"
#include "pg_trace.h"
+#include "storage/bufpage.h"
#include "utils/snapmgr.h"
@@ -49,7 +50,7 @@
*/
/* We need four bytes per xact */
-#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId))
+#define SUBTRANS_XACTS_PER_PAGE (SizeOfPageContents / sizeof(TransactionId))
/*
* Although we return an int64 the actual value can't currently exceed
@@ -93,7 +94,7 @@ SubTransSetParent(TransactionId xid, TransactionId parent)
LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
/*
@@ -133,7 +134,7 @@ SubTransGetParent(TransactionId xid)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
parent = *ptr;
@@ -193,14 +194,14 @@ SubTransGetTopmostTransaction(TransactionId xid)
Size
SUBTRANSShmemSize(void)
{
- return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0);
+ return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS);
}
void
SUBTRANSShmemInit(void)
{
SubTransCtl->PagePrecedes = SubTransPagePrecedes;
- SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0,
+ SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS,
SubtransSLRULock, "pg_subtrans",
LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE,
false);
@@ -305,7 +306,6 @@ CheckPointSUBTRANS(void)
TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that SUBTRANS has room for a newly-allocated XID.
*
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 264f25a8f9..85b0d63cdc 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -140,6 +140,7 @@
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
@@ -162,7 +163,7 @@
* than that, so changes in that data structure won't affect user-visible
* restrictions.
*/
-#define NOTIFY_PAYLOAD_MAX_LENGTH (BLCKSZ - NAMEDATALEN - 128)
+#define NOTIFY_PAYLOAD_MAX_LENGTH (SizeOfPageContents - NAMEDATALEN - 128)
/*
* Struct representing an entry in the global notify queue
@@ -311,7 +312,7 @@ static SlruCtlData NotifyCtlData;
#define NotifyCtl (&NotifyCtlData)
#define QUEUE_PAGESIZE BLCKSZ
-
+#define QUEUE_PAGE_CAPACITY (QUEUE_PAGESIZE - MAXALIGN(SizeOfPageHeaderData))
#define QUEUE_FULL_WARN_INTERVAL 5000 /* warn at most once every 5s */
/*
@@ -492,7 +493,7 @@ AsyncShmemSize(void)
size = mul_size(MaxBackends + 1, sizeof(QueueBackendStatus));
size = add_size(size, offsetof(AsyncQueueControl, backend));
- size = add_size(size, SimpleLruShmemSize(NUM_NOTIFY_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_NOTIFY_BUFFERS));
return size;
}
@@ -541,7 +542,7 @@ AsyncShmemInit(void)
* names are used in order to avoid wraparound.
*/
NotifyCtl->PagePrecedes = asyncQueuePagePrecedes;
- SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS, 0,
+ SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS,
NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER,
SYNC_HANDLER_NONE, true);
@@ -1301,14 +1302,14 @@ asyncQueueAdvance(volatile QueuePosition *position, int entryLength)
* written or read.
*/
offset += entryLength;
- Assert(offset <= QUEUE_PAGESIZE);
+ Assert(offset <= QUEUE_PAGE_CAPACITY);
/*
* In a second step check if another entry can possibly be written to the
* page. If so, stay here, we have reached the next position. If not, then
* we need to move on to the next page.
*/
- if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGESIZE)
+ if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGE_CAPACITY)
{
pageno++;
offset = 0;
@@ -1408,7 +1409,7 @@ asyncQueueAddEntries(ListCell *nextNotify)
offset = QUEUE_POS_OFFSET(queue_head);
/* Check whether the entry really fits on the current page */
- if (offset + qe.length <= QUEUE_PAGESIZE)
+ if (offset + qe.length <= QUEUE_PAGE_CAPACITY)
{
/* OK, so advance nextNotify past this item */
nextNotify = lnext(pendingNotifies->events, nextNotify);
@@ -1420,14 +1421,14 @@ asyncQueueAddEntries(ListCell *nextNotify)
* only check dboid and since it won't match any reader's database
* OID, they will ignore this entry and move on.
*/
- qe.length = QUEUE_PAGESIZE - offset;
+ qe.length = QUEUE_PAGE_CAPACITY - offset;
qe.dboid = InvalidOid;
qe.data[0] = '\0'; /* empty channel */
qe.data[1] = '\0'; /* empty payload */
}
/* Now copy qe into the shared buffer page */
- memcpy(NotifyCtl->shared->page_buffer[slotno] + offset,
+ memcpy(PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + offset,
&qe,
qe.length);
@@ -1947,10 +1948,10 @@ asyncQueueReadAllNotifications(void)
else
{
/* fetch all the rest of the page */
- copysize = QUEUE_PAGESIZE - curoffset;
+ copysize = QUEUE_PAGE_CAPACITY - curoffset;
}
- memcpy(page_buffer.buf + curoffset,
- NotifyCtl->shared->page_buffer[slotno] + curoffset,
+ memcpy(PageGetContents(page_buffer.buf) + curoffset,
+ PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + curoffset,
copysize);
/* Release lock that we got from SimpleLruReadPage_ReadOnly() */
LWLockRelease(NotifySLRULock);
@@ -2021,7 +2022,7 @@ asyncQueueProcessPageEntries(volatile QueuePosition *current,
if (QUEUE_POS_EQUAL(thisentry, stop))
break;
- qe = (AsyncQueueEntry *) (page_buffer + QUEUE_POS_OFFSET(thisentry));
+ qe = (AsyncQueueEntry *) (PageGetContents(page_buffer) + QUEUE_POS_OFFSET(thisentry));
/*
* Advance *current over this message, possibly to the next page. As
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index ff8df7c0bc..39129bce98 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -204,6 +204,7 @@
#include "pgstat.h"
#include "port/pg_lfind.h"
#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
#include "storage/predicate.h"
#include "storage/predicate_internals.h"
#include "storage/proc.h"
@@ -322,8 +323,8 @@ static SlruCtlData SerialSlruCtlData;
#define SerialSlruCtl (&SerialSlruCtlData)
#define SERIAL_PAGESIZE BLCKSZ
-#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
-#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE)
+#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
+#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE - MAXALIGN(SizeOfPageHeaderData) / SERIAL_ENTRYSIZE)
/*
* Set maximum pages based on the number needed to track all transactions.
@@ -333,7 +334,7 @@ static SlruCtlData SerialSlruCtlData;
#define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1)
#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \
- (SerialSlruCtl->shared->page_buffer[slotno] + \
+ (PageGetContents(SerialSlruCtl->shared->page_buffer[slotno]) + \
((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE))))
#define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE)
@@ -785,10 +786,13 @@ SerialPagePrecedesLogicallyUnitTests(void)
* requires burning ~2B XIDs in single-user mode, a negligible
* possibility. Moreover, if it does happen, the consequence would be
* mild, namely a new transaction failing in SimpleLruReadPage().
+ *
+ * NOTE: After adding the page header, the defect affects two pages.
+ * We now assert correct treatment of its second to prior page.
*/
headPage = oldestPage;
targetPage = newestPage;
- Assert(SerialPagePrecedesLogically(headPage, targetPage - 1));
+ Assert(SerialPagePrecedesLogically(headPage, targetPage - 2));
#if 0
Assert(SerialPagePrecedesLogically(headPage, targetPage));
#endif
@@ -808,7 +812,7 @@ SerialInit(void)
*/
SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
SimpleLruInit(SerialSlruCtl, "Serial",
- NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
+ NUM_SERIAL_BUFFERS, SerialSLRULock, "pg_serial",
LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE,
false);
#ifdef USE_ASSERT_CHECKING
@@ -1348,7 +1352,7 @@ PredicateLockShmemSize(void)
/* Shared memory structures for SLRU tracking of old committed xids. */
size = add_size(size, sizeof(SerialControlData));
- size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS));
return size;
}
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 9a302ddc30..723a127594 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -59,6 +59,31 @@ PageInit(Page page, Size pageSize, Size specialSize)
/* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */
}
+/*
+ * PageInitSLRU
+ * Initializes the contents of an SLRU page.
+ * Note that we don't calculate an initial checksum here; that's not done
+ * until it's time to write.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
/*
* PageIsVerifiedExtended
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 6543d9ce08..cfbc239843 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -16,6 +16,7 @@
#include <dirent.h>
#include <limits.h>
+#include <stdbool.h>
#include <time.h>
#include <sys/stat.h>
#include <unistd.h>
@@ -589,12 +590,20 @@ main(int argc, char *argv[])
{
total_size = scan_directory(DataDir, "global", true);
total_size += scan_directory(DataDir, "base", true);
+ total_size += scan_directory(DataDir, "pg_commit_ts", true);
+ total_size += scan_directory(DataDir, "pg_multixact", true);
+ total_size += scan_directory(DataDir, "pg_serial", true);
total_size += scan_directory(DataDir, "pg_tblspc", true);
+ total_size += scan_directory(DataDir, "pg_xact", true);
}
(void) scan_directory(DataDir, "global", false);
(void) scan_directory(DataDir, "base", false);
+ (void) scan_directory(DataDir, "pg_commit_ts", false);
+ (void) scan_directory(DataDir, "pg_multixact", false);
+ (void) scan_directory(DataDir, "pg_serial", false);
(void) scan_directory(DataDir, "pg_tblspc", false);
+ (void) scan_directory(DataDir, "pg_xact", false);
if (showprogress)
progress_report(true);
diff --git a/src/bin/pg_resetwal/t/001_basic.pl b/src/bin/pg_resetwal/t/001_basic.pl
index 18d0882cb1..ae74828e44 100644
--- a/src/bin/pg_resetwal/t/001_basic.pl
+++ b/src/bin/pg_resetwal/t/001_basic.pl
@@ -206,7 +206,7 @@ push @cmd,
sprintf("%d,%d", hex($files[0]) == 0 ? 3 : hex($files[0]), hex($files[-1]));
@files = get_slru_files('pg_multixact/offsets');
-$mult = 32 * $blcksz / 4;
+$mult = 32 * ($blcksz - 24) / 4;
# -m argument is "new,old"
push @cmd, '-m',
sprintf("%d,%d",
@@ -214,11 +214,11 @@ push @cmd, '-m',
hex($files[0]) == 0 ? 1 : hex($files[0] * $mult));
@files = get_slru_files('pg_multixact/members');
-$mult = 32 * int($blcksz / 20) * 4;
+$mult = 32 * int(($blcksz - 24) / 20) * 4;
push @cmd, '-O', (hex($files[-1]) + 1) * $mult;
@files = get_slru_files('pg_xact');
-$mult = 32 * $blcksz * 4;
+$mult = 32 * ($blcksz - 24) * 4;
push @cmd,
'-u', (hex($files[0]) == 0 ? 3 : hex($files[0]) * $mult),
'-x', ((hex($files[-1]) + 1) * $mult);
diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile
index bde91e2beb..02438f5884 100644
--- a/src/bin/pg_upgrade/Makefile
+++ b/src/bin/pg_upgrade/Makefile
@@ -24,6 +24,7 @@ OBJS = \
pg_upgrade.o \
relfilenumber.o \
server.o \
+ slru.o \
tablespace.o \
util.o \
version.o
diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build
index 3e8a08e062..d216731864 100644
--- a/src/bin/pg_upgrade/meson.build
+++ b/src/bin/pg_upgrade/meson.build
@@ -13,6 +13,7 @@ pg_upgrade_sources = files(
'pg_upgrade.c',
'relfilenumber.c',
'server.c',
+ 'slru.c',
'tablespace.c',
'util.c',
'version.c',
diff --git a/src/bin/pg_upgrade/nls.mk b/src/bin/pg_upgrade/nls.mk
index 9e2c1386e2..f5ad9d1397 100644
--- a/src/bin/pg_upgrade/nls.mk
+++ b/src/bin/pg_upgrade/nls.mk
@@ -11,6 +11,7 @@ GETTEXT_FILES = check.c \
parallel.c \
pg_upgrade.c \
relfilenumber.c \
+ slru.c \
server.c \
tablespace.c \
util.c \
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 3960af4036..2b7d01058e 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -660,14 +660,23 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir)
static void
copy_xact_xlog_xid(void)
{
+ bool slru_header_changed = false;
+
/*
* Copy old commit logs to new data dir. pg_clog has been renamed to
* pg_xact in post-10 clusters.
*/
- copy_subdir_files(GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact",
- GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact");
+ char *xact_old_dir = GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+ char *xact_new_dir = GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+
+ if (new_cluster.controldata.cat_ver >= SLRU_PAGE_HEADER_CAT_VER &&
+ old_cluster.controldata.cat_ver < SLRU_PAGE_HEADER_CAT_VER)
+ slru_header_changed = true;
+
+ if (slru_header_changed)
+ upgrade_xact_cache(xact_old_dir, xact_new_dir);
+ else
+ copy_subdir_files(xact_old_dir, xact_new_dir);
prep_status("Setting oldest XID for new cluster");
exec_prog(UTILITY_LOG_FILE, NULL, true, true,
@@ -702,7 +711,8 @@ copy_xact_xlog_xid(void)
* server doesn't attempt to read multis older than the cutoff value.
*/
if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
- new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
+ !slru_header_changed)
{
copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
copy_subdir_files("pg_multixact/members", "pg_multixact/members");
@@ -722,7 +732,8 @@ copy_xact_xlog_xid(void)
new_cluster.pgdata);
check_ok();
}
- else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER ||
+ slru_header_changed)
{
/*
* Remove offsets/0000 file created by initdb that no longer matches
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index a710f325de..bb04224dee 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -115,6 +115,11 @@ extern char *output_files[];
*/
#define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
+/*
+ * A page header was added to each SLRU page in 17.0.
+ */
+#define SLRU_PAGE_HEADER_CAT_VER 202312091
+
/*
* large object chunk size added to pg_controldata,
* commit 5f93c37805e7485488480916b4585e098d3cc883
@@ -454,6 +459,11 @@ uint32 get_major_server_version(ClusterInfo *cluster);
void check_pghost_envvar(void);
+/* slru.c */
+
+void upgrade_xact_cache(const char *src_subdir, const char *dst_subdir);
+
+
/* util.c */
char *quote_identifier(const char *s);
diff --git a/src/bin/pg_upgrade/slru.c b/src/bin/pg_upgrade/slru.c
new file mode 100644
index 0000000000..3d23175640
--- /dev/null
+++ b/src/bin/pg_upgrade/slru.c
@@ -0,0 +1,205 @@
+/*
+ * slru.c
+ *
+ * SLRU functions
+ *
+ * Copyright (c) 2010-2023, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/slru.c
+ */
+
+#include <dirent.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "postgres_fe.h"
+#include "storage/bufpage.h"
+#include "storage/checksum.h"
+
+#include "pg_upgrade.h"
+
+
+/*
+ * Copy SLRU_PAGES_PER_SEGMENT from access/slru.h to avoid including it.
+ */
+#define SLRU_PAGES_PER_SEGMENT 32
+
+#define SEGMENT_SIZE (BLCKSZ * SLRU_PAGES_PER_SEGMENT)
+
+/*
+ * Copy PageInitSLRU from storage/bufpage.c to avoid linking to the backend.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
+
+/*
+ * Filter function for scandir(3) to select only segment files.
+ */
+static int
+segment_file_filter(const struct dirent *dirent)
+{
+ return strspn(dirent->d_name, "0123456789ABCDEF") == strlen(dirent->d_name);
+}
+
+/*
+ * Compare function for scandir(3) to sort segment files.
+ */
+static int
+segment_file_compare(const struct dirent **a, const struct dirent **b)
+{
+ long segno_a;
+ long segno_b;
+
+ segno_a = strtol((*a)->d_name, NULL, 16);
+ segno_b = strtol((*b)->d_name, NULL, 16);
+ return segno_a - segno_b;
+}
+
+static void
+upgrade_file(const char *src_dir, const char *src_file, const char *dst_dir)
+{
+ char src[MAXPGPATH];
+ char dst[MAXPGPATH];
+
+ int seg_name_len;
+ int src_segno;
+ int64 src_pageno;
+ int dst_segno;
+ int64 dst_pageno;
+ int dst_offset;
+
+ int src_fd;
+ int dst_fd;
+
+ char *src_buf;
+ ssize_t src_len;
+ ssize_t src_buf_offset;
+ PGAlignedBlock dst_block;
+ Page page = dst_block.data;
+ int len_to_copy;
+
+ seg_name_len = strlen(src_file);
+ src_segno = (int) strtol(src_file, NULL, 16);
+ src_pageno = src_segno * SLRU_PAGES_PER_SEGMENT;
+
+ dst_pageno = src_pageno * BLCKSZ / SizeOfPageContents;
+ dst_offset = src_pageno * BLCKSZ - dst_pageno * SizeOfPageContents;
+ dst_segno = dst_pageno / SLRU_PAGES_PER_SEGMENT;
+
+ snprintf(src, sizeof(src), "%s/%s", src_dir, src_file);
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+
+ src_buf = pg_malloc(SEGMENT_SIZE);
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) == -1)
+ pg_fatal("could not open file \"%s\": %s", src, strerror(errno));
+ if ((src_len = read(src_fd, src_buf, SEGMENT_SIZE)) == -1)
+ pg_fatal("could not read file \"%s\": %s", src, strerror(errno));
+
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+
+ /*
+ * Read the destination page at dst_pageno into the buffer. The page may contain
+ * data from the previous source segment. Initialize the page if the page is new.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+ if (read(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not read file \"%s\": %s", dst, strerror(errno));
+ if (PageIsNew(page))
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Rewind the file position, so the first write will overwrite the page.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+
+ src_buf_offset = 0;
+ while (src_buf_offset < src_len)
+ {
+ len_to_copy = Min(src_len - src_buf_offset, SizeOfPageContents - dst_offset);
+ memcpy(PageGetContents(page) + dst_offset, src_buf + src_buf_offset, len_to_copy);
+ src_buf_offset += len_to_copy;
+
+ if (new_cluster.controldata.data_checksum_version > 0)
+ ((PageHeader) page)->pd_checksum = pg_checksum_page(page, dst_pageno);
+ if (write(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not write file \"%s\": %s", dst, strerror(errno));
+
+ dst_pageno++;
+ dst_offset = 0;
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Switch segments if we reached the end of the current segment.
+ */
+ if (dst_pageno % SLRU_PAGES_PER_SEGMENT == 0)
+ {
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ dst_segno++;
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+ }
+ }
+
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ pg_free(src_buf);
+ close(src_fd);
+}
+
+void
+upgrade_xact_cache(const char *src_subdir, const char *dst_subdir)
+{
+ char src_dir[MAXPGPATH];
+ char dst_dir[MAXPGPATH];
+
+ struct dirent **src_segment_files;
+ int num_files;
+ int i;
+
+ snprintf(src_dir, sizeof(src_dir), "%s/%s", old_cluster.pgdata, src_subdir);
+ snprintf(dst_dir, sizeof(dst_dir), "%s/%s", new_cluster.pgdata, dst_subdir);
+
+ num_files = scandir(src_dir, &src_segment_files, segment_file_filter, segment_file_compare);
+ if (num_files == -1)
+ pg_fatal("could not scan directory \"%s\": %s", src_dir, strerror(errno));
+
+ for (i = 0; i < num_files; i++)
+ {
+ upgrade_file(src_dir, src_segment_files[i]->d_name, dst_dir);
+ free(src_segment_files[i]);
+ }
+ free(src_segment_files);
+}
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 091e2202c9..24733166b8 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -68,17 +68,6 @@ typedef struct SlruSharedData
int *page_lru_count;
LWLockPadded *buffer_locks;
- /*
- * Optional array of WAL flush LSNs associated with entries in the SLRU
- * pages. If not zero/NULL, we must flush WAL before writing pages (true
- * for pg_xact, false for multixact, pg_subtrans, pg_notify). group_lsn[]
- * has lsn_groups_per_page entries per buffer slot, each containing the
- * highest LSN known for a contiguous group of SLRU entries on that slot's
- * page.
- */
- XLogRecPtr *group_lsn;
- int lsn_groups_per_page;
-
/*----------
* We mark a page "most recently used" by setting
* page_lru_count[slotno] = ++cur_lru_count;
@@ -147,8 +136,8 @@ typedef struct SlruCtlData
typedef SlruCtlData *SlruCtl;
-extern Size SimpleLruShmemSize(int nslots, int nlsns);
-extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
+extern Size SimpleLruShmemSize(int nslots);
+extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
LWLock *ctllock, const char *subdir, int tranche_id,
SyncRequestHandler sync_handler,
bool long_segment_names);
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index c4c59bfe6f..ec823e6d1a 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202311271
+#define CATALOG_VERSION_NO 202312091
#endif
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 424ecba028..251d9523fa 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -201,6 +201,7 @@ typedef PageHeaderData *PageHeader;
* handling pages.
*/
#define PG_PAGE_LAYOUT_VERSION 4
+#define PG_SLRU_PAGE_LAYOUT_VERSION 1
#define PG_DATA_CHECKSUM_VERSION 1
/* ----------------------------------------------------------------
@@ -257,6 +258,11 @@ PageGetContents(Page page)
return (char *) page + MAXALIGN(SizeOfPageHeaderData);
}
+/*
+ * Space available for storing page contents.
+ */
+#define SizeOfPageContents (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
/* ----------------
* functions to access page size info
* ----------------
@@ -486,6 +492,7 @@ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
"BLCKSZ has to be a multiple of sizeof(size_t)");
extern void PageInit(Page page, Size pageSize, Size specialSize);
+extern void PageInitSLRU(Page page, Size pageSize, Size specialSize);
extern bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags);
extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
OffsetNumber offsetNumber, int flags);
diff --git a/src/test/modules/test_slru/test_slru.c b/src/test/modules/test_slru/test_slru.c
index d0fb9444e8..aa8a29d92e 100644
--- a/src/test/modules/test_slru/test_slru.c
+++ b/src/test/modules/test_slru/test_slru.c
@@ -17,6 +17,7 @@
#include "access/slru.h"
#include "access/transam.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
@@ -76,8 +77,8 @@ test_slru_page_write(PG_FUNCTION_ARGS)
TestSlruCtl->shared->page_status[slotno] = SLRU_PAGE_VALID;
/* write given data to the page, up to the limit of the page */
- strncpy(TestSlruCtl->shared->page_buffer[slotno], data,
- BLCKSZ - 1);
+ strncpy(PageGetContents(TestSlruCtl->shared->page_buffer[slotno]), data,
+ SizeOfPageContents - 1);
SimpleLruWritePage(TestSlruCtl, slotno);
LWLockRelease(TestSLRULock);
@@ -104,7 +105,7 @@ test_slru_page_read(PG_FUNCTION_ARGS)
LWLockAcquire(TestSLRULock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(TestSlruCtl, pageno,
write_ok, InvalidTransactionId);
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(TestSLRULock);
PG_RETURN_TEXT_P(cstring_to_text(data));
@@ -122,7 +123,7 @@ test_slru_page_readonly(PG_FUNCTION_ARGS)
pageno,
InvalidTransactionId);
Assert(LWLockHeldByMe(TestSLRULock));
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(TestSLRULock);
PG_RETURN_TEXT_P(cstring_to_text(data));
@@ -202,7 +203,7 @@ test_slru_shmem_request(void)
prev_shmem_request_hook();
/* reserve shared memory for the test SLRU */
- RequestAddinShmemSpace(SimpleLruShmemSize(NUM_TEST_BUFFERS, 0));
+ RequestAddinShmemSpace(SimpleLruShmemSize(NUM_TEST_BUFFERS));
}
static bool
@@ -238,7 +239,7 @@ test_slru_shmem_startup(void)
TestSlruCtl->PagePrecedes = test_slru_page_precedes_logically;
SimpleLruInit(TestSlruCtl, "TestSLRU",
- NUM_TEST_BUFFERS, 0, TestSLRULock, slru_dir_name,
+ NUM_TEST_BUFFERS, TestSLRULock, slru_dir_name,
test_tranche_id, SYNC_HANDLER_NONE, long_segment_names);
}
On Thu, Dec 8, 2023 at 1:36 AM Li, Yong <yoli@ebay.com> wrote:
Given so many different approaches were discussed, I have started a
wiki to record and collaborate all efforts towards SLRU
improvements. The wiki provides a concise overview of all the ideas
discussed and can serve as a portal for all historical
discussions. Currently, the wiki summarizes four recent threads
ranging from identifier format change to page header change, to moving
SLRU into the main buffer pool, to reduce lock contention on SLRU
latches. We can keep the patch related discussions in this thread and
use the wiki as a live document for larger scale collaborations.
The wiki page is
here: https://wiki.postgresql.org/wiki/SLRU_improvements
Regarding the benefits of this patch, here is a detailed explanation:
1. Checksum is added to each page, allowing us to verify if a page has
been corrupted when read from the disk.
2. The ad-hoc LSN group structure is removed from the SLRU cache
control data and is replaced by the page LSN in the page header.
This allows us to use the same WAL protocol as used by pages in the
main buffer pool: flush all redo logs up to the page LSN before
flushing the page itself. If we move SLRU caches into the main
buffer pool, this change fits naturally.
3. It leaves further optimizations open. We can continue to pursue the
goal of moving SLRU into the main buffer pool, or we can follow the
lock partition idea. This change by itself does not conflict with
either proposal.
Also, the patch is now complete and is ready for review. All check-
world tests including tap tests passed with this patch.
Hi Yong,
I agree we should break the effort for the SLRU optimization into
smaller chunks after having worked on some of the bigger patches and
facing difficulty in making progress that way.
The patch looks mostly good to me; though one thing that I thought about
differently with the upgrade portion is where we should keep the logic
of re-writing the CLOG files.
There is a precedent introduced back in Postgres v9.6 in making on disk
page format changes across different in visibility map: [1]https://github.com/postgres/postgres/commit/7087166a88fe0c04fc6636d0d6d6bea1737fc1fb
code comment:
* In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's
* visibility map included one bit per heap page; it now includes two.
* When upgrading a cluster from before that time to a current PostgreSQL
* version, we could refuse to copy visibility maps from the old cluster
* to the new cluster; the next VACUUM would recreate them, but at the
* price of scanning the entire table. So, instead, we rewrite the old
* visibility maps in the new format.
This work is being done in file.c – it seems to me the proper way to
proceed would be to continue writing on-disk upgrade logic here.
Besides that this looks good to me, would like to hear what others have to say.
Thanks,
Rishu Bagga
Amazon Web Services (AWS)
[1]: https://github.com/postgres/postgres/commit/7087166a88fe0c04fc6636d0d6d6bea1737fc1fb
This work is being done in file.c – it seems to me the proper way to
proceed would be to continue writing on-disk upgrade logic here.
Besides that this looks good to me, would like to hear what others have to say.
Thank you, Rishu for taking time to review the code. I've updated the patch
and moved the on-disk upgrade logic to pg_upgrade/file.c.
I have also added this thread to the current Commitfest and hope this patch
will be part of the 17 release.
The commitfest link:
https://commitfest.postgresql.org/46/4709/
Regards,
Yong,
Attachments:
slur_page_header_v2.patchapplication/octet-stream; name=slur_page_header_v2.patchDownload
src/backend/access/transam/clog.c | 52 +++++----
src/backend/access/transam/commit_ts.c | 26 +++--
src/backend/access/transam/multixact.c | 63 ++++++-----
src/backend/access/transam/slru.c | 113 +++++++-------------
src/backend/access/transam/subtrans.c | 12 +--
src/backend/commands/async.c | 27 ++---
src/backend/storage/lmgr/predicate.c | 16 +--
src/backend/storage/page/bufpage.c | 25 +++++
src/bin/pg_checksums/pg_checksums.c | 9 ++
src/bin/pg_resetwal/t/001_basic.pl | 6 +-
src/bin/pg_upgrade/file.c | 186 +++++++++++++++++++++++++++++++++
src/bin/pg_upgrade/pg_upgrade.c | 23 ++--
src/bin/pg_upgrade/pg_upgrade.h | 6 ++
src/include/access/slru.h | 15 +--
src/include/catalog/catversion.h | 2 +-
src/include/storage/bufpage.h | 7 ++
src/test/modules/test_slru/test_slru.c | 13 +--
17 files changed, 409 insertions(+), 192 deletions(-)
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 7dca1df61b..b7f690754b 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -41,6 +41,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/proc.h"
#include "storage/sync.h"
@@ -59,7 +60,7 @@
/* We need two bits per xact, so four xacts fit in a byte */
#define CLOG_BITS_PER_XACT 2
#define CLOG_XACTS_PER_BYTE 4
-#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+#define CLOG_XACTS_PER_PAGE (SizeOfPageContents * CLOG_XACTS_PER_BYTE)
#define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1)
@@ -77,13 +78,6 @@ TransactionIdToPage(TransactionId xid)
#define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE)
#define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE)
-/* We store the latest async LSN for each group of transactions */
-#define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */
-#define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)
-
-#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \
- ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
-
/*
* The number of subtransactions below which we consider to apply clog group
* update optimization. Testing reveals that the number higher than this can
@@ -101,7 +95,7 @@ static SlruCtlData XactCtlData;
static int ZeroCLOGPage(int64 pageno, bool writeXlog);
static bool CLOGPagePrecedes(int64 page1, int64 page2);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact,
Oid oldestXactDb);
static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
@@ -583,8 +577,9 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
char *byteptr;
char byteval;
char curval;
+ Page page = XactCtl->shared->page_buffer[slotno];
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ byteptr = PageGetContents(page) + byteno;
curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
/*
@@ -613,7 +608,7 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*byteptr = byteval;
/*
- * Update the group LSN if the transaction completion LSN is higher.
+ * Update the page LSN if the transaction completion LSN is higher.
*
* Note: lsn will be invalid when supplied during InRecovery processing,
* so we don't need to do anything special to avoid LSN updates during
@@ -622,10 +617,8 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*/
if (!XLogRecPtrIsInvalid(lsn))
{
- int lsnindex = GetLSNIndex(slotno, xid);
-
- if (XactCtl->shared->group_lsn[lsnindex] < lsn)
- XactCtl->shared->group_lsn[lsnindex] = lsn;
+ if (PageGetLSN(page) < lsn)
+ PageSetLSN(page, lsn);
}
}
@@ -651,19 +644,19 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
int byteno = TransactionIdToByte(xid);
int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
int slotno;
- int lsnindex;
+ Page page;
char *byteptr;
XidStatus status;
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ page = XactCtl->shared->page_buffer[slotno];
+ byteptr = PageGetContents(page) + byteno;
status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
- lsnindex = GetLSNIndex(slotno, xid);
- *lsn = XactCtl->shared->group_lsn[lsnindex];
+ *lsn = PageGetLSN(page);
LWLockRelease(XactSLRULock);
@@ -698,14 +691,14 @@ CLOGShmemBuffers(void)
Size
CLOGShmemSize(void)
{
- return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE);
+ return SimpleLruShmemSize(CLOGShmemBuffers());
}
void
CLOGShmemInit(void)
{
XactCtl->PagePrecedes = CLOGPagePrecedes;
- SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
+ SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(),
XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER,
SYNC_HANDLER_CLOG, false);
SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE);
@@ -747,11 +740,17 @@ static int
ZeroCLOGPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(XactCtl, pageno);
+ page = XactCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -807,12 +806,12 @@ TrimCLOG(void)
char *byteptr;
slotno = SimpleLruReadPage(XactCtl, pageno, false, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ byteptr = PageGetContents(XactCtl->shared->page_buffer[slotno]) + byteno;
/* Zero so-far-unused positions in the current byte */
*byteptr &= (1 << bshift) - 1;
/* Zero the rest of the page */
- MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
+ MemSet(byteptr + 1, 0, SizeOfPageContents - byteno - 1);
XactCtl->shared->page_dirty[slotno] = true;
}
@@ -836,7 +835,6 @@ CheckPointCLOG(void)
TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that CLOG has room for a newly-allocated XID.
*
@@ -958,12 +956,12 @@ CLOGPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
+ return XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index e6fd9b3349..367459ea04 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -31,6 +31,7 @@
#include "funcapi.h"
#include "miscadmin.h"
#include "pg_trace.h"
+#include "storage/bufpage.h"
#include "storage/shmem.h"
#include "utils/builtins.h"
#include "utils/snapmgr.h"
@@ -63,7 +64,7 @@ typedef struct CommitTimestampEntry
sizeof(RepOriginId))
#define COMMIT_TS_XACTS_PER_PAGE \
- (BLCKSZ / SizeOfCommitTimestampEntry)
+ (SizeOfPageContents / SizeOfCommitTimestampEntry)
/*
@@ -120,7 +121,7 @@ static int ZeroCommitTsPage(int64 pageno, bool writeXlog);
static bool CommitTsPagePrecedes(int64 page1, int64 page2);
static void ActivateCommitTs(void);
static void DeactivateCommitTs(void);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid);
/*
@@ -254,11 +255,12 @@ TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
CommitTimestampEntry entry;
Assert(TransactionIdIsNormal(xid));
+ Assert(xid == slotno * COMMIT_TS_XACTS_PER_PAGE + entryno);
entry.time = ts;
entry.nodeid = nodeid;
- memcpy(CommitTsCtl->shared->page_buffer[slotno] +
+ memcpy(PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
SizeOfCommitTimestampEntry * entryno,
&entry, SizeOfCommitTimestampEntry);
}
@@ -337,7 +339,7 @@ TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts,
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid);
memcpy(&entry,
- CommitTsCtl->shared->page_buffer[slotno] +
+ PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
SizeOfCommitTimestampEntry * entryno,
SizeOfCommitTimestampEntry);
@@ -515,7 +517,7 @@ CommitTsShmemBuffers(void)
Size
CommitTsShmemSize(void)
{
- return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) +
+ return SimpleLruShmemSize(CommitTsShmemBuffers()) +
sizeof(CommitTimestampShared);
}
@@ -529,7 +531,7 @@ CommitTsShmemInit(void)
bool found;
CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
- SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0,
+ SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(),
CommitTsSLRULock, "pg_commit_ts",
LWTRANCHE_COMMITTS_BUFFER,
SYNC_HANDLER_COMMIT_TS,
@@ -582,11 +584,17 @@ static int
ZeroCommitTsPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(CommitTsCtl, pageno);
+ page = CommitTsCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -946,12 +954,12 @@ CommitTsPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
+ return XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index db3423f12e..0970aa8855 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -83,6 +83,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "postmaster/autovacuum.h"
+#include "storage/bufpage.h"
#include "storage/lmgr.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
@@ -106,7 +107,7 @@
*/
/* We need four bytes per offset */
-#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
+#define MULTIXACT_OFFSETS_PER_PAGE (SizeOfPageContents / sizeof(MultiXactOffset))
#define MultiXactIdToOffsetPage(xid) \
((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
@@ -119,8 +120,8 @@
* additional flag bits for each TransactionId. To do this without getting
* into alignment issues, we store four bytes of flags, and then the
* corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
- * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
- * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
+ * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 408 groups
+ * per page. This wastes 8 bytes per page, but that's OK -- simplicity (and
* performance) trumps space efficiency here.
*
* Note that the "offset" macros work with byte offset, not array indexes, so
@@ -138,7 +139,7 @@
/* size in bytes of a complete group */
#define MULTIXACT_MEMBERGROUP_SIZE \
(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
-#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (SizeOfPageContents / MULTIXACT_MEMBERGROUP_SIZE)
#define MULTIXACT_MEMBERS_PER_PAGE \
(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
@@ -366,7 +367,7 @@ static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
MultiXactOffset start, uint32 distance);
static bool SetOffsetVacuumLimit(bool is_startup);
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
-static void WriteMZeroPageXlogRec(int64 pageno, uint8 info);
+static XLogRecPtr WriteMZeroPageXlogRec(int64 pageno, uint8 info);
static void WriteMTruncateXlogRec(Oid oldestMultiDB,
MultiXactId startTruncOff,
MultiXactId endTruncOff,
@@ -884,7 +885,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
* take the trouble to generalize the slru.c error reporting code.
*/
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
*offptr = offset;
@@ -921,12 +922,12 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
}
memberptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
*memberptr = members[i].xid;
flagsptr = (uint32 *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
flagsval = *flagsptr;
flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
@@ -1348,7 +1349,7 @@ retry:
entryno = MultiXactIdToOffsetEntry(multi);
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
@@ -1381,7 +1382,7 @@ retry:
if (pageno != prev_pageno)
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
nextMXOffset = *offptr;
@@ -1424,7 +1425,7 @@ retry:
}
xactptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
if (!TransactionIdIsValid(*xactptr))
{
@@ -1435,7 +1436,7 @@ retry:
flagsoff = MXOffsetToFlagsOffset(offset);
bshift = MXOffsetToFlagsBitShift(offset);
- flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ flagsptr = (uint32 *) (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
ptr[truelength].xid = *xactptr;
ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
@@ -1834,8 +1835,8 @@ MultiXactShmemSize(void)
mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
size = SHARED_MULTIXACT_STATE_SIZE;
- size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS, 0));
- size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS));
+ size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS));
return size;
}
@@ -1851,14 +1852,14 @@ MultiXactShmemInit(void)
MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
SimpleLruInit(MultiXactOffsetCtl,
- "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0,
+ "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS,
MultiXactOffsetSLRULock, "pg_multixact/offsets",
LWTRANCHE_MULTIXACTOFFSET_BUFFER,
SYNC_HANDLER_MULTIXACT_OFFSET,
false);
SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
SimpleLruInit(MultiXactMemberCtl,
- "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0,
+ "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS,
MultiXactMemberSLRULock, "pg_multixact/members",
LWTRANCHE_MULTIXACTMEMBER_BUFFER,
SYNC_HANDLER_MULTIXACT_MEMBER,
@@ -1933,11 +1934,17 @@ static int
ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
+ page = MultiXactOffsetCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -1949,11 +1956,17 @@ static int
ZeroMultiXactMemberPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
+ page = MultiXactMemberCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -2071,10 +2084,10 @@ TrimMultiXact(void)
MultiXactOffset *offptr;
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
- MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
+ MemSet(offptr, 0, SizeOfPageContents - (entryno * sizeof(MultiXactOffset)));
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
}
@@ -2104,9 +2117,9 @@ TrimMultiXact(void)
memberoff = MXOffsetToMemberOffset(offset);
slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
xidptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
- MemSet(xidptr, 0, BLCKSZ - memberoff);
+ MemSet(xidptr, 0, SizeOfPageContents - memberoff);
/*
* Note: we don't need to zero out the flag bits in the remaining
@@ -2758,7 +2771,7 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
LWLockRelease(MultiXactOffsetSLRULock);
@@ -3192,12 +3205,12 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
* Write an xlog record reflecting the zeroing of either a MEMBERs or
* OFFSETs page (info shows which)
*/
-static void
+static XLogRecPtr
WriteMZeroPageXlogRec(int64 pageno, uint8 info)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_MULTIXACT_ID, info);
+ return XLogInsert(RM_MULTIXACT_ID, info);
}
/*
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 7a371d9034..d7f3deea7d 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -57,6 +57,7 @@
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/shmem.h"
@@ -154,13 +155,13 @@ typedef enum
SLRU_WRITE_FAILED,
SLRU_FSYNC_FAILED,
SLRU_CLOSE_FAILED,
+ SLRU_DATA_CORRUPTED,
} SlruErrorCause;
static SlruErrorCause slru_errcause;
static int slru_errno;
-static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno);
@@ -179,7 +180,7 @@ static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno);
*/
Size
-SimpleLruShmemSize(int nslots, int nlsns)
+SimpleLruShmemSize(int nslots)
{
Size sz;
@@ -192,9 +193,6 @@ SimpleLruShmemSize(int nslots, int nlsns)
sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
- if (nlsns > 0)
- sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
-
return BUFFERALIGN(sz) + BLCKSZ * nslots;
}
@@ -204,14 +202,13 @@ SimpleLruShmemSize(int nslots, int nlsns)
* ctl: address of local (unshared) control structure.
* name: name of SLRU. (This is user-visible, pick with care!)
* nslots: number of page slots to use.
- * nlsns: number of LSN groups per page (set to zero if not relevant).
* ctllock: LWLock to use to control access to the shared control structure.
* subdir: PGDATA-relative subdirectory that will contain the files.
* tranche_id: LWLock tranche ID to use for the SLRU's per-buffer LWLocks.
* sync_handler: which set of functions to use to handle sync requests
*/
void
-SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
+SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
LWLock *ctllock, const char *subdir, int tranche_id,
SyncRequestHandler sync_handler, bool long_segment_names)
{
@@ -219,7 +216,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
bool found;
shared = (SlruShared) ShmemInitStruct(name,
- SimpleLruShmemSize(nslots, nlsns),
+ SimpleLruShmemSize(nslots),
&found);
if (!IsUnderPostmaster)
@@ -236,7 +233,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
shared->ControlLock = ctllock;
shared->num_slots = nslots;
- shared->lsn_groups_per_page = nlsns;
shared->cur_lru_count = 0;
@@ -261,12 +257,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
shared->buffer_locks = (LWLockPadded *) (ptr + offset);
offset += MAXALIGN(nslots * sizeof(LWLockPadded));
- if (nlsns > 0)
- {
- shared->group_lsn = (XLogRecPtr *) (ptr + offset);
- offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
- }
-
ptr += BUFFERALIGN(offset);
for (slotno = 0; slotno < nslots; slotno++)
{
@@ -281,7 +271,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
}
/* Should fit to estimated shmem size */
- Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
+ Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots));
}
else
Assert(found);
@@ -323,11 +313,8 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
shared->page_dirty[slotno] = true;
SlruRecentlyUsed(shared, slotno);
- /* Set the buffer to zeroes */
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
-
- /* Set the LSNs for this new page to zero */
- SimpleLruZeroLSNs(ctl, slotno);
+ /* Initialize the page. */
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
/* Assume this page is now the latest active page */
shared->latest_page_number = pageno;
@@ -338,26 +325,6 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
return slotno;
}
-/*
- * Zero all the LSNs we store for this slru page.
- *
- * This should be called each time we create a new page, and each time we read
- * in a page from disk into an existing buffer. (Such an old page cannot
- * have any interesting LSNs, since we'd have flushed them before writing
- * the page in the first place.)
- *
- * This assumes that InvalidXLogRecPtr is bitwise-all-0.
- */
-static void
-SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
-{
- SlruShared shared = ctl->shared;
-
- if (shared->lsn_groups_per_page > 0)
- MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
- shared->lsn_groups_per_page * sizeof(XLogRecPtr));
-}
-
/*
* Wait for any active I/O on a page slot to finish. (This does not
* guarantee that new I/O hasn't been started before we return, though.
@@ -478,9 +445,6 @@ SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok,
/* Do the read */
ok = SlruPhysicalReadPage(ctl, pageno, slotno);
- /* Set the LSNs for this newly read-in page to zero */
- SimpleLruZeroLSNs(ctl, slotno);
-
/* Re-acquire control lock and update page state */
LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
@@ -740,7 +704,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
ereport(LOG,
(errmsg("file \"%s\" doesn't exist, reading as zeroes",
path)));
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
return true;
}
@@ -763,6 +727,13 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
return false;
}
+ if (!PageIsVerifiedExtended(shared->page_buffer[slotno], pageno, PIV_REPORT_STAT))
+ {
+ slru_errcause = SLRU_DATA_CORRUPTED;
+ slru_errno = 0;
+ return false;
+ }
+
return true;
}
@@ -789,6 +760,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
off_t offset = rpageno * BLCKSZ;
char path[MAXPGPATH];
int fd = -1;
+ Page page = shared->page_buffer[slotno];
+ XLogRecPtr lsn;
/* update the stats counter of written pages */
pgstat_count_slru_page_written(shared->slru_stats_idx);
@@ -798,41 +771,18 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
* write out data before associated WAL records. This is the same action
* performed during FlushBuffer() in the main buffer manager.
*/
- if (shared->group_lsn != NULL)
+ lsn = PageGetLSN(page);
+ if (!XLogRecPtrIsInvalid(lsn))
{
/*
- * We must determine the largest async-commit LSN for the page. This
- * is a bit tedious, but since this entire function is a slow path
- * anyway, it seems better to do this here than to maintain a per-page
- * LSN variable (which'd need an extra comparison in the
- * transaction-commit path).
+ * As noted above, elog(ERROR) is not acceptable here, so if
+ * XLogFlush were to fail, we must PANIC. This isn't much of a
+ * restriction because XLogFlush is just about all critical
+ * section anyway, but let's make sure.
*/
- XLogRecPtr max_lsn;
- int lsnindex,
- lsnoff;
-
- lsnindex = slotno * shared->lsn_groups_per_page;
- max_lsn = shared->group_lsn[lsnindex++];
- for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
- {
- XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
-
- if (max_lsn < this_lsn)
- max_lsn = this_lsn;
- }
-
- if (!XLogRecPtrIsInvalid(max_lsn))
- {
- /*
- * As noted above, elog(ERROR) is not acceptable here, so if
- * XLogFlush were to fail, we must PANIC. This isn't much of a
- * restriction because XLogFlush is just about all critical
- * section anyway, but let's make sure.
- */
- START_CRIT_SECTION();
- XLogFlush(max_lsn);
- END_CRIT_SECTION();
- }
+ START_CRIT_SECTION();
+ XLogFlush(lsn);
+ END_CRIT_SECTION();
}
/*
@@ -899,6 +849,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
}
}
+ PageSetChecksumInplace(shared->page_buffer[slotno], pageno);
+
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
@@ -1019,6 +971,13 @@ SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
errdetail("Could not close file \"%s\": %m.",
path)));
break;
+ case SLRU_DATA_CORRUPTED:
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("Invalid page from file \"%s\" at offset %d.",
+ path, offset)));
+ break;
default:
/* can't get here, we trust */
elog(ERROR, "unrecognized SimpleLru error cause: %d",
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 1b3b3ad720..1f35ee23c6 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -32,6 +32,7 @@
#include "access/subtrans.h"
#include "access/transam.h"
#include "pg_trace.h"
+#include "storage/bufpage.h"
#include "utils/snapmgr.h"
@@ -49,7 +50,7 @@
*/
/* We need four bytes per xact */
-#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId))
+#define SUBTRANS_XACTS_PER_PAGE (SizeOfPageContents / sizeof(TransactionId))
/*
* Although we return an int64 the actual value can't currently exceed
@@ -93,7 +94,7 @@ SubTransSetParent(TransactionId xid, TransactionId parent)
LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
/*
@@ -133,7 +134,7 @@ SubTransGetParent(TransactionId xid)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
parent = *ptr;
@@ -193,14 +194,14 @@ SubTransGetTopmostTransaction(TransactionId xid)
Size
SUBTRANSShmemSize(void)
{
- return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0);
+ return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS);
}
void
SUBTRANSShmemInit(void)
{
SubTransCtl->PagePrecedes = SubTransPagePrecedes;
- SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0,
+ SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS,
SubtransSLRULock, "pg_subtrans",
LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE,
false);
@@ -305,7 +306,6 @@ CheckPointSUBTRANS(void)
TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that SUBTRANS has room for a newly-allocated XID.
*
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 264f25a8f9..85b0d63cdc 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -140,6 +140,7 @@
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
@@ -162,7 +163,7 @@
* than that, so changes in that data structure won't affect user-visible
* restrictions.
*/
-#define NOTIFY_PAYLOAD_MAX_LENGTH (BLCKSZ - NAMEDATALEN - 128)
+#define NOTIFY_PAYLOAD_MAX_LENGTH (SizeOfPageContents - NAMEDATALEN - 128)
/*
* Struct representing an entry in the global notify queue
@@ -311,7 +312,7 @@ static SlruCtlData NotifyCtlData;
#define NotifyCtl (&NotifyCtlData)
#define QUEUE_PAGESIZE BLCKSZ
-
+#define QUEUE_PAGE_CAPACITY (QUEUE_PAGESIZE - MAXALIGN(SizeOfPageHeaderData))
#define QUEUE_FULL_WARN_INTERVAL 5000 /* warn at most once every 5s */
/*
@@ -492,7 +493,7 @@ AsyncShmemSize(void)
size = mul_size(MaxBackends + 1, sizeof(QueueBackendStatus));
size = add_size(size, offsetof(AsyncQueueControl, backend));
- size = add_size(size, SimpleLruShmemSize(NUM_NOTIFY_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_NOTIFY_BUFFERS));
return size;
}
@@ -541,7 +542,7 @@ AsyncShmemInit(void)
* names are used in order to avoid wraparound.
*/
NotifyCtl->PagePrecedes = asyncQueuePagePrecedes;
- SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS, 0,
+ SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS,
NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER,
SYNC_HANDLER_NONE, true);
@@ -1301,14 +1302,14 @@ asyncQueueAdvance(volatile QueuePosition *position, int entryLength)
* written or read.
*/
offset += entryLength;
- Assert(offset <= QUEUE_PAGESIZE);
+ Assert(offset <= QUEUE_PAGE_CAPACITY);
/*
* In a second step check if another entry can possibly be written to the
* page. If so, stay here, we have reached the next position. If not, then
* we need to move on to the next page.
*/
- if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGESIZE)
+ if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGE_CAPACITY)
{
pageno++;
offset = 0;
@@ -1408,7 +1409,7 @@ asyncQueueAddEntries(ListCell *nextNotify)
offset = QUEUE_POS_OFFSET(queue_head);
/* Check whether the entry really fits on the current page */
- if (offset + qe.length <= QUEUE_PAGESIZE)
+ if (offset + qe.length <= QUEUE_PAGE_CAPACITY)
{
/* OK, so advance nextNotify past this item */
nextNotify = lnext(pendingNotifies->events, nextNotify);
@@ -1420,14 +1421,14 @@ asyncQueueAddEntries(ListCell *nextNotify)
* only check dboid and since it won't match any reader's database
* OID, they will ignore this entry and move on.
*/
- qe.length = QUEUE_PAGESIZE - offset;
+ qe.length = QUEUE_PAGE_CAPACITY - offset;
qe.dboid = InvalidOid;
qe.data[0] = '\0'; /* empty channel */
qe.data[1] = '\0'; /* empty payload */
}
/* Now copy qe into the shared buffer page */
- memcpy(NotifyCtl->shared->page_buffer[slotno] + offset,
+ memcpy(PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + offset,
&qe,
qe.length);
@@ -1947,10 +1948,10 @@ asyncQueueReadAllNotifications(void)
else
{
/* fetch all the rest of the page */
- copysize = QUEUE_PAGESIZE - curoffset;
+ copysize = QUEUE_PAGE_CAPACITY - curoffset;
}
- memcpy(page_buffer.buf + curoffset,
- NotifyCtl->shared->page_buffer[slotno] + curoffset,
+ memcpy(PageGetContents(page_buffer.buf) + curoffset,
+ PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + curoffset,
copysize);
/* Release lock that we got from SimpleLruReadPage_ReadOnly() */
LWLockRelease(NotifySLRULock);
@@ -2021,7 +2022,7 @@ asyncQueueProcessPageEntries(volatile QueuePosition *current,
if (QUEUE_POS_EQUAL(thisentry, stop))
break;
- qe = (AsyncQueueEntry *) (page_buffer + QUEUE_POS_OFFSET(thisentry));
+ qe = (AsyncQueueEntry *) (PageGetContents(page_buffer) + QUEUE_POS_OFFSET(thisentry));
/*
* Advance *current over this message, possibly to the next page. As
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index 1129b8e4f2..9fd945ad38 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -204,6 +204,7 @@
#include "pgstat.h"
#include "port/pg_lfind.h"
#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
#include "storage/predicate.h"
#include "storage/predicate_internals.h"
#include "storage/proc.h"
@@ -322,8 +323,8 @@ static SlruCtlData SerialSlruCtlData;
#define SerialSlruCtl (&SerialSlruCtlData)
#define SERIAL_PAGESIZE BLCKSZ
-#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
-#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE)
+#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
+#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE - MAXALIGN(SizeOfPageHeaderData) / SERIAL_ENTRYSIZE)
/*
* Set maximum pages based on the number needed to track all transactions.
@@ -333,7 +334,7 @@ static SlruCtlData SerialSlruCtlData;
#define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1)
#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \
- (SerialSlruCtl->shared->page_buffer[slotno] + \
+ (PageGetContents(SerialSlruCtl->shared->page_buffer[slotno]) + \
((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE))))
#define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE)
@@ -785,10 +786,13 @@ SerialPagePrecedesLogicallyUnitTests(void)
* requires burning ~2B XIDs in single-user mode, a negligible
* possibility. Moreover, if it does happen, the consequence would be
* mild, namely a new transaction failing in SimpleLruReadPage().
+ *
+ * NOTE: After adding the page header, the defect affects two pages.
+ * We now assert correct treatment of its second to prior page.
*/
headPage = oldestPage;
targetPage = newestPage;
- Assert(SerialPagePrecedesLogically(headPage, targetPage - 1));
+ Assert(SerialPagePrecedesLogically(headPage, targetPage - 2));
#if 0
Assert(SerialPagePrecedesLogically(headPage, targetPage));
#endif
@@ -808,7 +812,7 @@ SerialInit(void)
*/
SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
SimpleLruInit(SerialSlruCtl, "Serial",
- NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
+ NUM_SERIAL_BUFFERS, SerialSLRULock, "pg_serial",
LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE,
false);
#ifdef USE_ASSERT_CHECKING
@@ -1348,7 +1352,7 @@ PredicateLockShmemSize(void)
/* Shared memory structures for SLRU tracking of old committed xids. */
size = add_size(size, sizeof(SerialControlData));
- size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS));
return size;
}
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 9a302ddc30..723a127594 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -59,6 +59,31 @@ PageInit(Page page, Size pageSize, Size specialSize)
/* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */
}
+/*
+ * PageInitSLRU
+ * Initializes the contents of an SLRU page.
+ * Note that we don't calculate an initial checksum here; that's not done
+ * until it's time to write.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
/*
* PageIsVerifiedExtended
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 19c083be17..0f43533811 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -16,6 +16,7 @@
#include <dirent.h>
#include <limits.h>
+#include <stdbool.h>
#include <time.h>
#include <sys/stat.h>
#include <unistd.h>
@@ -589,12 +590,20 @@ main(int argc, char *argv[])
{
total_size = scan_directory(DataDir, "global", true);
total_size += scan_directory(DataDir, "base", true);
+ total_size += scan_directory(DataDir, "pg_commit_ts", true);
+ total_size += scan_directory(DataDir, "pg_multixact", true);
+ total_size += scan_directory(DataDir, "pg_serial", true);
total_size += scan_directory(DataDir, "pg_tblspc", true);
+ total_size += scan_directory(DataDir, "pg_xact", true);
}
(void) scan_directory(DataDir, "global", false);
(void) scan_directory(DataDir, "base", false);
+ (void) scan_directory(DataDir, "pg_commit_ts", false);
+ (void) scan_directory(DataDir, "pg_multixact", false);
+ (void) scan_directory(DataDir, "pg_serial", false);
(void) scan_directory(DataDir, "pg_tblspc", false);
+ (void) scan_directory(DataDir, "pg_xact", false);
if (showprogress)
progress_report(true);
diff --git a/src/bin/pg_resetwal/t/001_basic.pl b/src/bin/pg_resetwal/t/001_basic.pl
index 18d0882cb1..ae74828e44 100644
--- a/src/bin/pg_resetwal/t/001_basic.pl
+++ b/src/bin/pg_resetwal/t/001_basic.pl
@@ -206,7 +206,7 @@ push @cmd,
sprintf("%d,%d", hex($files[0]) == 0 ? 3 : hex($files[0]), hex($files[-1]));
@files = get_slru_files('pg_multixact/offsets');
-$mult = 32 * $blcksz / 4;
+$mult = 32 * ($blcksz - 24) / 4;
# -m argument is "new,old"
push @cmd, '-m',
sprintf("%d,%d",
@@ -214,11 +214,11 @@ push @cmd, '-m',
hex($files[0]) == 0 ? 1 : hex($files[0] * $mult));
@files = get_slru_files('pg_multixact/members');
-$mult = 32 * int($blcksz / 20) * 4;
+$mult = 32 * int(($blcksz - 24) / 20) * 4;
push @cmd, '-O', (hex($files[-1]) + 1) * $mult;
@files = get_slru_files('pg_xact');
-$mult = 32 * $blcksz * 4;
+$mult = 32 * ($blcksz - 24) * 4;
push @cmd,
'-u', (hex($files[0]) == 0 ? 3 : hex($files[0]) * $mult),
'-x', ((hex($files[-1]) + 1) * $mult);
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index d173602882..322c2b3ce4 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -9,6 +9,7 @@
#include "postgres_fe.h"
+#include <dirent.h>
#include <sys/stat.h>
#include <fcntl.h>
#ifdef HAVE_COPYFILE_H
@@ -375,3 +376,188 @@ check_hard_link(void)
unlink(new_link_file);
}
+
+
+/*
+ * Copy SLRU_PAGES_PER_SEGMENT from access/slru.h to avoid including it.
+ */
+#define SLRU_PAGES_PER_SEGMENT 32
+
+#define SEGMENT_SIZE (BLCKSZ * SLRU_PAGES_PER_SEGMENT)
+
+/*
+ * Copy PageInitSLRU from storage/bufpage.c to avoid linking to the backend.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
+
+/*
+ * Filter function for scandir(3) to select only segment files.
+ */
+static int
+segment_file_filter(const struct dirent *dirent)
+{
+ return strspn(dirent->d_name, "0123456789ABCDEF") == strlen(dirent->d_name);
+}
+
+/*
+ * Compare function for scandir(3) to sort segment files.
+ */
+static int
+segment_file_compare(const struct dirent **a, const struct dirent **b)
+{
+ long segno_a;
+ long segno_b;
+
+ segno_a = strtol((*a)->d_name, NULL, 16);
+ segno_b = strtol((*b)->d_name, NULL, 16);
+ return segno_a - segno_b;
+}
+
+static void
+upgrade_file(const char *src_dir, const char *src_file, const char *dst_dir)
+{
+ char src[MAXPGPATH];
+ char dst[MAXPGPATH];
+
+ int seg_name_len;
+ int src_segno;
+ int64 src_pageno;
+ int dst_segno;
+ int64 dst_pageno;
+ int dst_offset;
+
+ int src_fd;
+ int dst_fd;
+
+ char *src_buf;
+ ssize_t src_len;
+ ssize_t src_buf_offset;
+ PGAlignedBlock dst_block;
+ Page page = dst_block.data;
+ int len_to_copy;
+
+ seg_name_len = strlen(src_file);
+ src_segno = (int) strtol(src_file, NULL, 16);
+ src_pageno = src_segno * SLRU_PAGES_PER_SEGMENT;
+
+ dst_pageno = src_pageno * BLCKSZ / SizeOfPageContents;
+ dst_offset = src_pageno * BLCKSZ - dst_pageno * SizeOfPageContents;
+ dst_segno = dst_pageno / SLRU_PAGES_PER_SEGMENT;
+
+ snprintf(src, sizeof(src), "%s/%s", src_dir, src_file);
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+
+ src_buf = pg_malloc(SEGMENT_SIZE);
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) == -1)
+ pg_fatal("could not open file \"%s\": %s", src, strerror(errno));
+ if ((src_len = read(src_fd, src_buf, SEGMENT_SIZE)) == -1)
+ pg_fatal("could not read file \"%s\": %s", src, strerror(errno));
+
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+
+ /*
+ * Read the destination page at dst_pageno into the buffer. The page may contain
+ * data from the previous source segment. Initialize the page if the page is new.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+ if (read(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not read file \"%s\": %s", dst, strerror(errno));
+ if (PageIsNew(page))
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Rewind the file position, so the first write will overwrite the page.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+
+ src_buf_offset = 0;
+ while (src_buf_offset < src_len)
+ {
+ len_to_copy = Min(src_len - src_buf_offset, SizeOfPageContents - dst_offset);
+ memcpy(PageGetContents(page) + dst_offset, src_buf + src_buf_offset, len_to_copy);
+ src_buf_offset += len_to_copy;
+
+ if (new_cluster.controldata.data_checksum_version > 0)
+ ((PageHeader) page)->pd_checksum = pg_checksum_page(page, dst_pageno);
+ if (write(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not write file \"%s\": %s", dst, strerror(errno));
+
+ dst_pageno++;
+ dst_offset = 0;
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Switch segments if we reached the end of the current segment.
+ */
+ if (dst_pageno % SLRU_PAGES_PER_SEGMENT == 0)
+ {
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ dst_segno++;
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+ }
+ }
+
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ pg_free(src_buf);
+ close(src_fd);
+}
+
+void
+upgrade_xact_cache(const char *src_subdir, const char *dst_subdir)
+{
+ char src_dir[MAXPGPATH];
+ char dst_dir[MAXPGPATH];
+
+ struct dirent **src_segment_files;
+ int num_files;
+ int i;
+
+ snprintf(src_dir, sizeof(src_dir), "%s/%s", old_cluster.pgdata, src_subdir);
+ snprintf(dst_dir, sizeof(dst_dir), "%s/%s", new_cluster.pgdata, dst_subdir);
+
+ num_files = scandir(src_dir, &src_segment_files, segment_file_filter, segment_file_compare);
+ if (num_files == -1)
+ pg_fatal("could not scan directory \"%s\": %s", src_dir, strerror(errno));
+
+ for (i = 0; i < num_files; i++)
+ {
+ upgrade_file(src_dir, src_segment_files[i]->d_name, dst_dir);
+ free(src_segment_files[i]);
+ }
+ free(src_segment_files);
+}
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 3960af4036..2b7d01058e 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -660,14 +660,23 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir)
static void
copy_xact_xlog_xid(void)
{
+ bool slru_header_changed = false;
+
/*
* Copy old commit logs to new data dir. pg_clog has been renamed to
* pg_xact in post-10 clusters.
*/
- copy_subdir_files(GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact",
- GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact");
+ char *xact_old_dir = GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+ char *xact_new_dir = GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+
+ if (new_cluster.controldata.cat_ver >= SLRU_PAGE_HEADER_CAT_VER &&
+ old_cluster.controldata.cat_ver < SLRU_PAGE_HEADER_CAT_VER)
+ slru_header_changed = true;
+
+ if (slru_header_changed)
+ upgrade_xact_cache(xact_old_dir, xact_new_dir);
+ else
+ copy_subdir_files(xact_old_dir, xact_new_dir);
prep_status("Setting oldest XID for new cluster");
exec_prog(UTILITY_LOG_FILE, NULL, true, true,
@@ -702,7 +711,8 @@ copy_xact_xlog_xid(void)
* server doesn't attempt to read multis older than the cutoff value.
*/
if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
- new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
+ !slru_header_changed)
{
copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
copy_subdir_files("pg_multixact/members", "pg_multixact/members");
@@ -722,7 +732,8 @@ copy_xact_xlog_xid(void)
new_cluster.pgdata);
check_ok();
}
- else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER ||
+ slru_header_changed)
{
/*
* Remove offsets/0000 file created by initdb that no longer matches
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index a710f325de..22186ea712 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -115,6 +115,11 @@ extern char *output_files[];
*/
#define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
+/*
+ * A page header was added to each SLRU page in 17.0.
+ */
+#define SLRU_PAGE_HEADER_CAT_VER 202312091
+
/*
* large object chunk size added to pg_controldata,
* commit 5f93c37805e7485488480916b4585e098d3cc883
@@ -405,6 +410,7 @@ void rewriteVisibilityMap(const char *fromfile, const char *tofile,
const char *schemaName, const char *relName);
void check_file_clone(void);
void check_hard_link(void);
+void upgrade_xact_cache(const char *src_subdir, const char *dst_subdir);
/* fopen_priv() is no longer different from fopen() */
#define fopen_priv(path, mode) fopen(path, mode)
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 091e2202c9..24733166b8 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -68,17 +68,6 @@ typedef struct SlruSharedData
int *page_lru_count;
LWLockPadded *buffer_locks;
- /*
- * Optional array of WAL flush LSNs associated with entries in the SLRU
- * pages. If not zero/NULL, we must flush WAL before writing pages (true
- * for pg_xact, false for multixact, pg_subtrans, pg_notify). group_lsn[]
- * has lsn_groups_per_page entries per buffer slot, each containing the
- * highest LSN known for a contiguous group of SLRU entries on that slot's
- * page.
- */
- XLogRecPtr *group_lsn;
- int lsn_groups_per_page;
-
/*----------
* We mark a page "most recently used" by setting
* page_lru_count[slotno] = ++cur_lru_count;
@@ -147,8 +136,8 @@ typedef struct SlruCtlData
typedef SlruCtlData *SlruCtl;
-extern Size SimpleLruShmemSize(int nslots, int nlsns);
-extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
+extern Size SimpleLruShmemSize(int nslots);
+extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
LWLock *ctllock, const char *subdir, int tranche_id,
SyncRequestHandler sync_handler,
bool long_segment_names);
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index a06a8f0b23..af5aaded56 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202312071
+#define CATALOG_VERSION_NO 202312191
#endif
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 424ecba028..251d9523fa 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -201,6 +201,7 @@ typedef PageHeaderData *PageHeader;
* handling pages.
*/
#define PG_PAGE_LAYOUT_VERSION 4
+#define PG_SLRU_PAGE_LAYOUT_VERSION 1
#define PG_DATA_CHECKSUM_VERSION 1
/* ----------------------------------------------------------------
@@ -257,6 +258,11 @@ PageGetContents(Page page)
return (char *) page + MAXALIGN(SizeOfPageHeaderData);
}
+/*
+ * Space available for storing page contents.
+ */
+#define SizeOfPageContents (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
/* ----------------
* functions to access page size info
* ----------------
@@ -486,6 +492,7 @@ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
"BLCKSZ has to be a multiple of sizeof(size_t)");
extern void PageInit(Page page, Size pageSize, Size specialSize);
+extern void PageInitSLRU(Page page, Size pageSize, Size specialSize);
extern bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags);
extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
OffsetNumber offsetNumber, int flags);
diff --git a/src/test/modules/test_slru/test_slru.c b/src/test/modules/test_slru/test_slru.c
index d0fb9444e8..aa8a29d92e 100644
--- a/src/test/modules/test_slru/test_slru.c
+++ b/src/test/modules/test_slru/test_slru.c
@@ -17,6 +17,7 @@
#include "access/slru.h"
#include "access/transam.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
@@ -76,8 +77,8 @@ test_slru_page_write(PG_FUNCTION_ARGS)
TestSlruCtl->shared->page_status[slotno] = SLRU_PAGE_VALID;
/* write given data to the page, up to the limit of the page */
- strncpy(TestSlruCtl->shared->page_buffer[slotno], data,
- BLCKSZ - 1);
+ strncpy(PageGetContents(TestSlruCtl->shared->page_buffer[slotno]), data,
+ SizeOfPageContents - 1);
SimpleLruWritePage(TestSlruCtl, slotno);
LWLockRelease(TestSLRULock);
@@ -104,7 +105,7 @@ test_slru_page_read(PG_FUNCTION_ARGS)
LWLockAcquire(TestSLRULock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(TestSlruCtl, pageno,
write_ok, InvalidTransactionId);
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(TestSLRULock);
PG_RETURN_TEXT_P(cstring_to_text(data));
@@ -122,7 +123,7 @@ test_slru_page_readonly(PG_FUNCTION_ARGS)
pageno,
InvalidTransactionId);
Assert(LWLockHeldByMe(TestSLRULock));
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(TestSLRULock);
PG_RETURN_TEXT_P(cstring_to_text(data));
@@ -202,7 +203,7 @@ test_slru_shmem_request(void)
prev_shmem_request_hook();
/* reserve shared memory for the test SLRU */
- RequestAddinShmemSpace(SimpleLruShmemSize(NUM_TEST_BUFFERS, 0));
+ RequestAddinShmemSpace(SimpleLruShmemSize(NUM_TEST_BUFFERS));
}
static bool
@@ -238,7 +239,7 @@ test_slru_shmem_startup(void)
TestSlruCtl->PagePrecedes = test_slru_page_precedes_logically;
SimpleLruInit(TestSlruCtl, "TestSLRU",
- NUM_TEST_BUFFERS, 0, TestSLRULock, slru_dir_name,
+ NUM_TEST_BUFFERS, TestSLRULock, slru_dir_name,
test_tranche_id, SYNC_HANDLER_NONE, long_segment_names);
}
Hi,
I have also added this thread to the current Commitfest and hope this patch
will be part of the 17 release.The commitfest link:
https://commitfest.postgresql.org/46/4709/
Thanks for the updated patch.
cfbot seems to have some complaints regarding compiler warnings and
also building the patch on Windows:
--
Best regards,
Aleksander Alekseev
On Jan 2, 2024, at 19:35, Aleksander Alekseev <aleksander@timescale.com> wrote:
Thanks for the updated patch.
cfbot seems to have some complaints regarding compiler warnings and
also building the patch on Windows:
Thanks for the information. Here is the updated patch.
Regards,
Yong
Attachments:
slru_page_header_v3.patchapplication/octet-stream; name=slru_page_header_v3.patchDownload
.gitignore | 3 +
src/backend/access/transam/clog.c | 52 +++++-----
src/backend/access/transam/commit_ts.c | 26 +++--
src/backend/access/transam/multixact.c | 63 +++++++-----
src/backend/access/transam/slru.c | 113 +++++++---------------
src/backend/access/transam/subtrans.c | 12 +--
src/backend/commands/async.c | 27 +++---
src/backend/storage/lmgr/predicate.c | 16 +--
src/backend/storage/page/bufpage.c | 25 +++++
src/bin/pg_checksums/pg_checksums.c | 9 ++
src/bin/pg_resetwal/t/001_basic.pl | 6 +-
src/bin/pg_upgrade/file.c | 172 +++++++++++++++++++++++++++++++++
src/bin/pg_upgrade/pg_upgrade.c | 23 +++--
src/bin/pg_upgrade/pg_upgrade.h | 6 ++
src/include/access/slru.h | 15 +--
src/include/catalog/catversion.h | 2 +-
src/include/storage/bufpage.h | 7 ++
src/test/modules/test_slru/test_slru.c | 13 +--
18 files changed, 398 insertions(+), 192 deletions(-)
diff --git a/.gitignore b/.gitignore
index 4e911395fe..5b8d23ce9f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,6 +31,8 @@ win32ver.rc
*.exe
lib*dll.def
lib*.pc
+*.patch
+compile_commands.json
# Local excludes in root directory
/GNUmakefile
@@ -43,3 +45,4 @@ lib*.pc
/Release/
/tmp_install/
/portlock/
+/.cache
\ No newline at end of file
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 7dca1df61b..b7f690754b 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -41,6 +41,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/proc.h"
#include "storage/sync.h"
@@ -59,7 +60,7 @@
/* We need two bits per xact, so four xacts fit in a byte */
#define CLOG_BITS_PER_XACT 2
#define CLOG_XACTS_PER_BYTE 4
-#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+#define CLOG_XACTS_PER_PAGE (SizeOfPageContents * CLOG_XACTS_PER_BYTE)
#define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1)
@@ -77,13 +78,6 @@ TransactionIdToPage(TransactionId xid)
#define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE)
#define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE)
-/* We store the latest async LSN for each group of transactions */
-#define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */
-#define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)
-
-#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \
- ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
-
/*
* The number of subtransactions below which we consider to apply clog group
* update optimization. Testing reveals that the number higher than this can
@@ -101,7 +95,7 @@ static SlruCtlData XactCtlData;
static int ZeroCLOGPage(int64 pageno, bool writeXlog);
static bool CLOGPagePrecedes(int64 page1, int64 page2);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact,
Oid oldestXactDb);
static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
@@ -583,8 +577,9 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
char *byteptr;
char byteval;
char curval;
+ Page page = XactCtl->shared->page_buffer[slotno];
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ byteptr = PageGetContents(page) + byteno;
curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
/*
@@ -613,7 +608,7 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*byteptr = byteval;
/*
- * Update the group LSN if the transaction completion LSN is higher.
+ * Update the page LSN if the transaction completion LSN is higher.
*
* Note: lsn will be invalid when supplied during InRecovery processing,
* so we don't need to do anything special to avoid LSN updates during
@@ -622,10 +617,8 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*/
if (!XLogRecPtrIsInvalid(lsn))
{
- int lsnindex = GetLSNIndex(slotno, xid);
-
- if (XactCtl->shared->group_lsn[lsnindex] < lsn)
- XactCtl->shared->group_lsn[lsnindex] = lsn;
+ if (PageGetLSN(page) < lsn)
+ PageSetLSN(page, lsn);
}
}
@@ -651,19 +644,19 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
int byteno = TransactionIdToByte(xid);
int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
int slotno;
- int lsnindex;
+ Page page;
char *byteptr;
XidStatus status;
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ page = XactCtl->shared->page_buffer[slotno];
+ byteptr = PageGetContents(page) + byteno;
status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
- lsnindex = GetLSNIndex(slotno, xid);
- *lsn = XactCtl->shared->group_lsn[lsnindex];
+ *lsn = PageGetLSN(page);
LWLockRelease(XactSLRULock);
@@ -698,14 +691,14 @@ CLOGShmemBuffers(void)
Size
CLOGShmemSize(void)
{
- return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE);
+ return SimpleLruShmemSize(CLOGShmemBuffers());
}
void
CLOGShmemInit(void)
{
XactCtl->PagePrecedes = CLOGPagePrecedes;
- SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
+ SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(),
XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER,
SYNC_HANDLER_CLOG, false);
SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE);
@@ -747,11 +740,17 @@ static int
ZeroCLOGPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(XactCtl, pageno);
+ page = XactCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -807,12 +806,12 @@ TrimCLOG(void)
char *byteptr;
slotno = SimpleLruReadPage(XactCtl, pageno, false, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ byteptr = PageGetContents(XactCtl->shared->page_buffer[slotno]) + byteno;
/* Zero so-far-unused positions in the current byte */
*byteptr &= (1 << bshift) - 1;
/* Zero the rest of the page */
- MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
+ MemSet(byteptr + 1, 0, SizeOfPageContents - byteno - 1);
XactCtl->shared->page_dirty[slotno] = true;
}
@@ -836,7 +835,6 @@ CheckPointCLOG(void)
TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that CLOG has room for a newly-allocated XID.
*
@@ -958,12 +956,12 @@ CLOGPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
+ return XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index e6fd9b3349..367459ea04 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -31,6 +31,7 @@
#include "funcapi.h"
#include "miscadmin.h"
#include "pg_trace.h"
+#include "storage/bufpage.h"
#include "storage/shmem.h"
#include "utils/builtins.h"
#include "utils/snapmgr.h"
@@ -63,7 +64,7 @@ typedef struct CommitTimestampEntry
sizeof(RepOriginId))
#define COMMIT_TS_XACTS_PER_PAGE \
- (BLCKSZ / SizeOfCommitTimestampEntry)
+ (SizeOfPageContents / SizeOfCommitTimestampEntry)
/*
@@ -120,7 +121,7 @@ static int ZeroCommitTsPage(int64 pageno, bool writeXlog);
static bool CommitTsPagePrecedes(int64 page1, int64 page2);
static void ActivateCommitTs(void);
static void DeactivateCommitTs(void);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid);
/*
@@ -254,11 +255,12 @@ TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
CommitTimestampEntry entry;
Assert(TransactionIdIsNormal(xid));
+ Assert(xid == slotno * COMMIT_TS_XACTS_PER_PAGE + entryno);
entry.time = ts;
entry.nodeid = nodeid;
- memcpy(CommitTsCtl->shared->page_buffer[slotno] +
+ memcpy(PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
SizeOfCommitTimestampEntry * entryno,
&entry, SizeOfCommitTimestampEntry);
}
@@ -337,7 +339,7 @@ TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts,
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid);
memcpy(&entry,
- CommitTsCtl->shared->page_buffer[slotno] +
+ PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
SizeOfCommitTimestampEntry * entryno,
SizeOfCommitTimestampEntry);
@@ -515,7 +517,7 @@ CommitTsShmemBuffers(void)
Size
CommitTsShmemSize(void)
{
- return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) +
+ return SimpleLruShmemSize(CommitTsShmemBuffers()) +
sizeof(CommitTimestampShared);
}
@@ -529,7 +531,7 @@ CommitTsShmemInit(void)
bool found;
CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
- SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0,
+ SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(),
CommitTsSLRULock, "pg_commit_ts",
LWTRANCHE_COMMITTS_BUFFER,
SYNC_HANDLER_COMMIT_TS,
@@ -582,11 +584,17 @@ static int
ZeroCommitTsPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(CommitTsCtl, pageno);
+ page = CommitTsCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -946,12 +954,12 @@ CommitTsPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
+ return XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index db3423f12e..0970aa8855 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -83,6 +83,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "postmaster/autovacuum.h"
+#include "storage/bufpage.h"
#include "storage/lmgr.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
@@ -106,7 +107,7 @@
*/
/* We need four bytes per offset */
-#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
+#define MULTIXACT_OFFSETS_PER_PAGE (SizeOfPageContents / sizeof(MultiXactOffset))
#define MultiXactIdToOffsetPage(xid) \
((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
@@ -119,8 +120,8 @@
* additional flag bits for each TransactionId. To do this without getting
* into alignment issues, we store four bytes of flags, and then the
* corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
- * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
- * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
+ * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 408 groups
+ * per page. This wastes 8 bytes per page, but that's OK -- simplicity (and
* performance) trumps space efficiency here.
*
* Note that the "offset" macros work with byte offset, not array indexes, so
@@ -138,7 +139,7 @@
/* size in bytes of a complete group */
#define MULTIXACT_MEMBERGROUP_SIZE \
(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
-#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (SizeOfPageContents / MULTIXACT_MEMBERGROUP_SIZE)
#define MULTIXACT_MEMBERS_PER_PAGE \
(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
@@ -366,7 +367,7 @@ static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
MultiXactOffset start, uint32 distance);
static bool SetOffsetVacuumLimit(bool is_startup);
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
-static void WriteMZeroPageXlogRec(int64 pageno, uint8 info);
+static XLogRecPtr WriteMZeroPageXlogRec(int64 pageno, uint8 info);
static void WriteMTruncateXlogRec(Oid oldestMultiDB,
MultiXactId startTruncOff,
MultiXactId endTruncOff,
@@ -884,7 +885,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
* take the trouble to generalize the slru.c error reporting code.
*/
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
*offptr = offset;
@@ -921,12 +922,12 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
}
memberptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
*memberptr = members[i].xid;
flagsptr = (uint32 *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
flagsval = *flagsptr;
flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
@@ -1348,7 +1349,7 @@ retry:
entryno = MultiXactIdToOffsetEntry(multi);
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
@@ -1381,7 +1382,7 @@ retry:
if (pageno != prev_pageno)
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
nextMXOffset = *offptr;
@@ -1424,7 +1425,7 @@ retry:
}
xactptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
if (!TransactionIdIsValid(*xactptr))
{
@@ -1435,7 +1436,7 @@ retry:
flagsoff = MXOffsetToFlagsOffset(offset);
bshift = MXOffsetToFlagsBitShift(offset);
- flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ flagsptr = (uint32 *) (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
ptr[truelength].xid = *xactptr;
ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
@@ -1834,8 +1835,8 @@ MultiXactShmemSize(void)
mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
size = SHARED_MULTIXACT_STATE_SIZE;
- size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS, 0));
- size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS));
+ size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS));
return size;
}
@@ -1851,14 +1852,14 @@ MultiXactShmemInit(void)
MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
SimpleLruInit(MultiXactOffsetCtl,
- "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0,
+ "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS,
MultiXactOffsetSLRULock, "pg_multixact/offsets",
LWTRANCHE_MULTIXACTOFFSET_BUFFER,
SYNC_HANDLER_MULTIXACT_OFFSET,
false);
SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
SimpleLruInit(MultiXactMemberCtl,
- "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0,
+ "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS,
MultiXactMemberSLRULock, "pg_multixact/members",
LWTRANCHE_MULTIXACTMEMBER_BUFFER,
SYNC_HANDLER_MULTIXACT_MEMBER,
@@ -1933,11 +1934,17 @@ static int
ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
+ page = MultiXactOffsetCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -1949,11 +1956,17 @@ static int
ZeroMultiXactMemberPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
+ page = MultiXactMemberCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -2071,10 +2084,10 @@ TrimMultiXact(void)
MultiXactOffset *offptr;
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
- MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
+ MemSet(offptr, 0, SizeOfPageContents - (entryno * sizeof(MultiXactOffset)));
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
}
@@ -2104,9 +2117,9 @@ TrimMultiXact(void)
memberoff = MXOffsetToMemberOffset(offset);
slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
xidptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
- MemSet(xidptr, 0, BLCKSZ - memberoff);
+ MemSet(xidptr, 0, SizeOfPageContents - memberoff);
/*
* Note: we don't need to zero out the flag bits in the remaining
@@ -2758,7 +2771,7 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
LWLockRelease(MultiXactOffsetSLRULock);
@@ -3192,12 +3205,12 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
* Write an xlog record reflecting the zeroing of either a MEMBERs or
* OFFSETs page (info shows which)
*/
-static void
+static XLogRecPtr
WriteMZeroPageXlogRec(int64 pageno, uint8 info)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_MULTIXACT_ID, info);
+ return XLogInsert(RM_MULTIXACT_ID, info);
}
/*
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 7a371d9034..d7f3deea7d 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -57,6 +57,7 @@
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/shmem.h"
@@ -154,13 +155,13 @@ typedef enum
SLRU_WRITE_FAILED,
SLRU_FSYNC_FAILED,
SLRU_CLOSE_FAILED,
+ SLRU_DATA_CORRUPTED,
} SlruErrorCause;
static SlruErrorCause slru_errcause;
static int slru_errno;
-static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno);
@@ -179,7 +180,7 @@ static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno);
*/
Size
-SimpleLruShmemSize(int nslots, int nlsns)
+SimpleLruShmemSize(int nslots)
{
Size sz;
@@ -192,9 +193,6 @@ SimpleLruShmemSize(int nslots, int nlsns)
sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
- if (nlsns > 0)
- sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
-
return BUFFERALIGN(sz) + BLCKSZ * nslots;
}
@@ -204,14 +202,13 @@ SimpleLruShmemSize(int nslots, int nlsns)
* ctl: address of local (unshared) control structure.
* name: name of SLRU. (This is user-visible, pick with care!)
* nslots: number of page slots to use.
- * nlsns: number of LSN groups per page (set to zero if not relevant).
* ctllock: LWLock to use to control access to the shared control structure.
* subdir: PGDATA-relative subdirectory that will contain the files.
* tranche_id: LWLock tranche ID to use for the SLRU's per-buffer LWLocks.
* sync_handler: which set of functions to use to handle sync requests
*/
void
-SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
+SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
LWLock *ctllock, const char *subdir, int tranche_id,
SyncRequestHandler sync_handler, bool long_segment_names)
{
@@ -219,7 +216,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
bool found;
shared = (SlruShared) ShmemInitStruct(name,
- SimpleLruShmemSize(nslots, nlsns),
+ SimpleLruShmemSize(nslots),
&found);
if (!IsUnderPostmaster)
@@ -236,7 +233,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
shared->ControlLock = ctllock;
shared->num_slots = nslots;
- shared->lsn_groups_per_page = nlsns;
shared->cur_lru_count = 0;
@@ -261,12 +257,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
shared->buffer_locks = (LWLockPadded *) (ptr + offset);
offset += MAXALIGN(nslots * sizeof(LWLockPadded));
- if (nlsns > 0)
- {
- shared->group_lsn = (XLogRecPtr *) (ptr + offset);
- offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
- }
-
ptr += BUFFERALIGN(offset);
for (slotno = 0; slotno < nslots; slotno++)
{
@@ -281,7 +271,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
}
/* Should fit to estimated shmem size */
- Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
+ Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots));
}
else
Assert(found);
@@ -323,11 +313,8 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
shared->page_dirty[slotno] = true;
SlruRecentlyUsed(shared, slotno);
- /* Set the buffer to zeroes */
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
-
- /* Set the LSNs for this new page to zero */
- SimpleLruZeroLSNs(ctl, slotno);
+ /* Initialize the page. */
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
/* Assume this page is now the latest active page */
shared->latest_page_number = pageno;
@@ -338,26 +325,6 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
return slotno;
}
-/*
- * Zero all the LSNs we store for this slru page.
- *
- * This should be called each time we create a new page, and each time we read
- * in a page from disk into an existing buffer. (Such an old page cannot
- * have any interesting LSNs, since we'd have flushed them before writing
- * the page in the first place.)
- *
- * This assumes that InvalidXLogRecPtr is bitwise-all-0.
- */
-static void
-SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
-{
- SlruShared shared = ctl->shared;
-
- if (shared->lsn_groups_per_page > 0)
- MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
- shared->lsn_groups_per_page * sizeof(XLogRecPtr));
-}
-
/*
* Wait for any active I/O on a page slot to finish. (This does not
* guarantee that new I/O hasn't been started before we return, though.
@@ -478,9 +445,6 @@ SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok,
/* Do the read */
ok = SlruPhysicalReadPage(ctl, pageno, slotno);
- /* Set the LSNs for this newly read-in page to zero */
- SimpleLruZeroLSNs(ctl, slotno);
-
/* Re-acquire control lock and update page state */
LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
@@ -740,7 +704,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
ereport(LOG,
(errmsg("file \"%s\" doesn't exist, reading as zeroes",
path)));
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
return true;
}
@@ -763,6 +727,13 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
return false;
}
+ if (!PageIsVerifiedExtended(shared->page_buffer[slotno], pageno, PIV_REPORT_STAT))
+ {
+ slru_errcause = SLRU_DATA_CORRUPTED;
+ slru_errno = 0;
+ return false;
+ }
+
return true;
}
@@ -789,6 +760,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
off_t offset = rpageno * BLCKSZ;
char path[MAXPGPATH];
int fd = -1;
+ Page page = shared->page_buffer[slotno];
+ XLogRecPtr lsn;
/* update the stats counter of written pages */
pgstat_count_slru_page_written(shared->slru_stats_idx);
@@ -798,41 +771,18 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
* write out data before associated WAL records. This is the same action
* performed during FlushBuffer() in the main buffer manager.
*/
- if (shared->group_lsn != NULL)
+ lsn = PageGetLSN(page);
+ if (!XLogRecPtrIsInvalid(lsn))
{
/*
- * We must determine the largest async-commit LSN for the page. This
- * is a bit tedious, but since this entire function is a slow path
- * anyway, it seems better to do this here than to maintain a per-page
- * LSN variable (which'd need an extra comparison in the
- * transaction-commit path).
+ * As noted above, elog(ERROR) is not acceptable here, so if
+ * XLogFlush were to fail, we must PANIC. This isn't much of a
+ * restriction because XLogFlush is just about all critical
+ * section anyway, but let's make sure.
*/
- XLogRecPtr max_lsn;
- int lsnindex,
- lsnoff;
-
- lsnindex = slotno * shared->lsn_groups_per_page;
- max_lsn = shared->group_lsn[lsnindex++];
- for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
- {
- XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
-
- if (max_lsn < this_lsn)
- max_lsn = this_lsn;
- }
-
- if (!XLogRecPtrIsInvalid(max_lsn))
- {
- /*
- * As noted above, elog(ERROR) is not acceptable here, so if
- * XLogFlush were to fail, we must PANIC. This isn't much of a
- * restriction because XLogFlush is just about all critical
- * section anyway, but let's make sure.
- */
- START_CRIT_SECTION();
- XLogFlush(max_lsn);
- END_CRIT_SECTION();
- }
+ START_CRIT_SECTION();
+ XLogFlush(lsn);
+ END_CRIT_SECTION();
}
/*
@@ -899,6 +849,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
}
}
+ PageSetChecksumInplace(shared->page_buffer[slotno], pageno);
+
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
@@ -1019,6 +971,13 @@ SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
errdetail("Could not close file \"%s\": %m.",
path)));
break;
+ case SLRU_DATA_CORRUPTED:
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("Invalid page from file \"%s\" at offset %d.",
+ path, offset)));
+ break;
default:
/* can't get here, we trust */
elog(ERROR, "unrecognized SimpleLru error cause: %d",
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 1b3b3ad720..1f35ee23c6 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -32,6 +32,7 @@
#include "access/subtrans.h"
#include "access/transam.h"
#include "pg_trace.h"
+#include "storage/bufpage.h"
#include "utils/snapmgr.h"
@@ -49,7 +50,7 @@
*/
/* We need four bytes per xact */
-#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId))
+#define SUBTRANS_XACTS_PER_PAGE (SizeOfPageContents / sizeof(TransactionId))
/*
* Although we return an int64 the actual value can't currently exceed
@@ -93,7 +94,7 @@ SubTransSetParent(TransactionId xid, TransactionId parent)
LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
/*
@@ -133,7 +134,7 @@ SubTransGetParent(TransactionId xid)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
parent = *ptr;
@@ -193,14 +194,14 @@ SubTransGetTopmostTransaction(TransactionId xid)
Size
SUBTRANSShmemSize(void)
{
- return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0);
+ return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS);
}
void
SUBTRANSShmemInit(void)
{
SubTransCtl->PagePrecedes = SubTransPagePrecedes;
- SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0,
+ SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS,
SubtransSLRULock, "pg_subtrans",
LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE,
false);
@@ -305,7 +306,6 @@ CheckPointSUBTRANS(void)
TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that SUBTRANS has room for a newly-allocated XID.
*
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 264f25a8f9..85b0d63cdc 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -140,6 +140,7 @@
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
@@ -162,7 +163,7 @@
* than that, so changes in that data structure won't affect user-visible
* restrictions.
*/
-#define NOTIFY_PAYLOAD_MAX_LENGTH (BLCKSZ - NAMEDATALEN - 128)
+#define NOTIFY_PAYLOAD_MAX_LENGTH (SizeOfPageContents - NAMEDATALEN - 128)
/*
* Struct representing an entry in the global notify queue
@@ -311,7 +312,7 @@ static SlruCtlData NotifyCtlData;
#define NotifyCtl (&NotifyCtlData)
#define QUEUE_PAGESIZE BLCKSZ
-
+#define QUEUE_PAGE_CAPACITY (QUEUE_PAGESIZE - MAXALIGN(SizeOfPageHeaderData))
#define QUEUE_FULL_WARN_INTERVAL 5000 /* warn at most once every 5s */
/*
@@ -492,7 +493,7 @@ AsyncShmemSize(void)
size = mul_size(MaxBackends + 1, sizeof(QueueBackendStatus));
size = add_size(size, offsetof(AsyncQueueControl, backend));
- size = add_size(size, SimpleLruShmemSize(NUM_NOTIFY_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_NOTIFY_BUFFERS));
return size;
}
@@ -541,7 +542,7 @@ AsyncShmemInit(void)
* names are used in order to avoid wraparound.
*/
NotifyCtl->PagePrecedes = asyncQueuePagePrecedes;
- SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS, 0,
+ SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS,
NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER,
SYNC_HANDLER_NONE, true);
@@ -1301,14 +1302,14 @@ asyncQueueAdvance(volatile QueuePosition *position, int entryLength)
* written or read.
*/
offset += entryLength;
- Assert(offset <= QUEUE_PAGESIZE);
+ Assert(offset <= QUEUE_PAGE_CAPACITY);
/*
* In a second step check if another entry can possibly be written to the
* page. If so, stay here, we have reached the next position. If not, then
* we need to move on to the next page.
*/
- if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGESIZE)
+ if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGE_CAPACITY)
{
pageno++;
offset = 0;
@@ -1408,7 +1409,7 @@ asyncQueueAddEntries(ListCell *nextNotify)
offset = QUEUE_POS_OFFSET(queue_head);
/* Check whether the entry really fits on the current page */
- if (offset + qe.length <= QUEUE_PAGESIZE)
+ if (offset + qe.length <= QUEUE_PAGE_CAPACITY)
{
/* OK, so advance nextNotify past this item */
nextNotify = lnext(pendingNotifies->events, nextNotify);
@@ -1420,14 +1421,14 @@ asyncQueueAddEntries(ListCell *nextNotify)
* only check dboid and since it won't match any reader's database
* OID, they will ignore this entry and move on.
*/
- qe.length = QUEUE_PAGESIZE - offset;
+ qe.length = QUEUE_PAGE_CAPACITY - offset;
qe.dboid = InvalidOid;
qe.data[0] = '\0'; /* empty channel */
qe.data[1] = '\0'; /* empty payload */
}
/* Now copy qe into the shared buffer page */
- memcpy(NotifyCtl->shared->page_buffer[slotno] + offset,
+ memcpy(PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + offset,
&qe,
qe.length);
@@ -1947,10 +1948,10 @@ asyncQueueReadAllNotifications(void)
else
{
/* fetch all the rest of the page */
- copysize = QUEUE_PAGESIZE - curoffset;
+ copysize = QUEUE_PAGE_CAPACITY - curoffset;
}
- memcpy(page_buffer.buf + curoffset,
- NotifyCtl->shared->page_buffer[slotno] + curoffset,
+ memcpy(PageGetContents(page_buffer.buf) + curoffset,
+ PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + curoffset,
copysize);
/* Release lock that we got from SimpleLruReadPage_ReadOnly() */
LWLockRelease(NotifySLRULock);
@@ -2021,7 +2022,7 @@ asyncQueueProcessPageEntries(volatile QueuePosition *current,
if (QUEUE_POS_EQUAL(thisentry, stop))
break;
- qe = (AsyncQueueEntry *) (page_buffer + QUEUE_POS_OFFSET(thisentry));
+ qe = (AsyncQueueEntry *) (PageGetContents(page_buffer) + QUEUE_POS_OFFSET(thisentry));
/*
* Advance *current over this message, possibly to the next page. As
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index 1129b8e4f2..9fd945ad38 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -204,6 +204,7 @@
#include "pgstat.h"
#include "port/pg_lfind.h"
#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
#include "storage/predicate.h"
#include "storage/predicate_internals.h"
#include "storage/proc.h"
@@ -322,8 +323,8 @@ static SlruCtlData SerialSlruCtlData;
#define SerialSlruCtl (&SerialSlruCtlData)
#define SERIAL_PAGESIZE BLCKSZ
-#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
-#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE)
+#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
+#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE - MAXALIGN(SizeOfPageHeaderData) / SERIAL_ENTRYSIZE)
/*
* Set maximum pages based on the number needed to track all transactions.
@@ -333,7 +334,7 @@ static SlruCtlData SerialSlruCtlData;
#define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1)
#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \
- (SerialSlruCtl->shared->page_buffer[slotno] + \
+ (PageGetContents(SerialSlruCtl->shared->page_buffer[slotno]) + \
((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE))))
#define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE)
@@ -785,10 +786,13 @@ SerialPagePrecedesLogicallyUnitTests(void)
* requires burning ~2B XIDs in single-user mode, a negligible
* possibility. Moreover, if it does happen, the consequence would be
* mild, namely a new transaction failing in SimpleLruReadPage().
+ *
+ * NOTE: After adding the page header, the defect affects two pages.
+ * We now assert correct treatment of its second to prior page.
*/
headPage = oldestPage;
targetPage = newestPage;
- Assert(SerialPagePrecedesLogically(headPage, targetPage - 1));
+ Assert(SerialPagePrecedesLogically(headPage, targetPage - 2));
#if 0
Assert(SerialPagePrecedesLogically(headPage, targetPage));
#endif
@@ -808,7 +812,7 @@ SerialInit(void)
*/
SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
SimpleLruInit(SerialSlruCtl, "Serial",
- NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
+ NUM_SERIAL_BUFFERS, SerialSLRULock, "pg_serial",
LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE,
false);
#ifdef USE_ASSERT_CHECKING
@@ -1348,7 +1352,7 @@ PredicateLockShmemSize(void)
/* Shared memory structures for SLRU tracking of old committed xids. */
size = add_size(size, sizeof(SerialControlData));
- size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS));
return size;
}
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 9a302ddc30..723a127594 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -59,6 +59,31 @@ PageInit(Page page, Size pageSize, Size specialSize)
/* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */
}
+/*
+ * PageInitSLRU
+ * Initializes the contents of an SLRU page.
+ * Note that we don't calculate an initial checksum here; that's not done
+ * until it's time to write.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
/*
* PageIsVerifiedExtended
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 19c083be17..0f43533811 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -16,6 +16,7 @@
#include <dirent.h>
#include <limits.h>
+#include <stdbool.h>
#include <time.h>
#include <sys/stat.h>
#include <unistd.h>
@@ -589,12 +590,20 @@ main(int argc, char *argv[])
{
total_size = scan_directory(DataDir, "global", true);
total_size += scan_directory(DataDir, "base", true);
+ total_size += scan_directory(DataDir, "pg_commit_ts", true);
+ total_size += scan_directory(DataDir, "pg_multixact", true);
+ total_size += scan_directory(DataDir, "pg_serial", true);
total_size += scan_directory(DataDir, "pg_tblspc", true);
+ total_size += scan_directory(DataDir, "pg_xact", true);
}
(void) scan_directory(DataDir, "global", false);
(void) scan_directory(DataDir, "base", false);
+ (void) scan_directory(DataDir, "pg_commit_ts", false);
+ (void) scan_directory(DataDir, "pg_multixact", false);
+ (void) scan_directory(DataDir, "pg_serial", false);
(void) scan_directory(DataDir, "pg_tblspc", false);
+ (void) scan_directory(DataDir, "pg_xact", false);
if (showprogress)
progress_report(true);
diff --git a/src/bin/pg_resetwal/t/001_basic.pl b/src/bin/pg_resetwal/t/001_basic.pl
index dcb5fa846e..914afbda29 100644
--- a/src/bin/pg_resetwal/t/001_basic.pl
+++ b/src/bin/pg_resetwal/t/001_basic.pl
@@ -206,7 +206,7 @@ push @cmd,
sprintf("%d,%d", hex($files[0]) == 0 ? 3 : hex($files[0]), hex($files[-1]));
@files = get_slru_files('pg_multixact/offsets');
-$mult = 32 * $blcksz / 4;
+$mult = 32 * ($blcksz - 24) / 4;
# -m argument is "new,old"
push @cmd, '-m',
sprintf("%d,%d",
@@ -214,11 +214,11 @@ push @cmd, '-m',
hex($files[0]) == 0 ? 1 : hex($files[0] * $mult));
@files = get_slru_files('pg_multixact/members');
-$mult = 32 * int($blcksz / 20) * 4;
+$mult = 32 * int(($blcksz - 24) / 20) * 4;
push @cmd, '-O', (hex($files[-1]) + 1) * $mult;
@files = get_slru_files('pg_xact');
-$mult = 32 * $blcksz * 4;
+$mult = 32 * ($blcksz - 24) * 4;
push @cmd,
'-u', (hex($files[0]) == 0 ? 3 : hex($files[0]) * $mult),
'-x', ((hex($files[-1]) + 1) * $mult);
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index d173602882..c6be340736 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -9,6 +9,7 @@
#include "postgres_fe.h"
+#include <dirent.h>
#include <sys/stat.h>
#include <fcntl.h>
#ifdef HAVE_COPYFILE_H
@@ -375,3 +376,174 @@ check_hard_link(void)
unlink(new_link_file);
}
+
+
+/*
+ * Copy SLRU_PAGES_PER_SEGMENT from access/slru.h to avoid including it.
+ */
+#define SLRU_PAGES_PER_SEGMENT 32
+
+#define SEGMENT_SIZE (BLCKSZ * SLRU_PAGES_PER_SEGMENT)
+
+/*
+ * Copy PageInitSLRU from storage/bufpage.c to avoid linking to the backend.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
+
+/*
+ * Filter function for scandir(3) to select only segment files.
+ */
+static int
+segment_file_filter(const struct dirent *dirent)
+{
+ return strspn(dirent->d_name, "0123456789ABCDEF") == strlen(dirent->d_name);
+}
+
+static void
+upgrade_file(const char *src_dir, const char *src_file, const char *dst_dir)
+{
+ char src[MAXPGPATH];
+ char dst[MAXPGPATH];
+
+ int seg_name_len;
+ int src_segno;
+ int64 src_pageno;
+ int dst_segno;
+ int64 dst_pageno;
+ int dst_offset;
+
+ int src_fd;
+ int dst_fd;
+
+ char *src_buf;
+ ssize_t src_len;
+ ssize_t src_buf_offset;
+ PGAlignedBlock dst_block;
+ Page page = dst_block.data;
+ int len_to_copy;
+
+ seg_name_len = strlen(src_file);
+ src_segno = (int) strtol(src_file, NULL, 16);
+ src_pageno = src_segno * SLRU_PAGES_PER_SEGMENT;
+
+ dst_pageno = src_pageno * BLCKSZ / SizeOfPageContents;
+ dst_offset = src_pageno * BLCKSZ - dst_pageno * SizeOfPageContents;
+ dst_segno = dst_pageno / SLRU_PAGES_PER_SEGMENT;
+
+ snprintf(src, sizeof(src), "%s/%s", src_dir, src_file);
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+
+ src_buf = pg_malloc(SEGMENT_SIZE);
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) == -1)
+ pg_fatal("could not open file \"%s\": %s", src, strerror(errno));
+ if ((src_len = read(src_fd, src_buf, SEGMENT_SIZE)) == -1)
+ pg_fatal("could not read file \"%s\": %s", src, strerror(errno));
+
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+
+ /*
+ * Read the destination page at dst_pageno into the buffer. The page may contain
+ * data from the previous source segment. Initialize the page if the page is new.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+ if (read(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not read file \"%s\": %s", dst, strerror(errno));
+ if (PageIsNew(page))
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Rewind the file position, so the first write will overwrite the page.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+
+ src_buf_offset = 0;
+ while (src_buf_offset < src_len)
+ {
+ len_to_copy = Min(src_len - src_buf_offset, SizeOfPageContents - dst_offset);
+ memcpy(PageGetContents(page) + dst_offset, src_buf + src_buf_offset, len_to_copy);
+ src_buf_offset += len_to_copy;
+
+ if (new_cluster.controldata.data_checksum_version > 0)
+ ((PageHeader) page)->pd_checksum = pg_checksum_page(page, dst_pageno);
+ if (write(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not write file \"%s\": %s", dst, strerror(errno));
+
+ dst_pageno++;
+ dst_offset = 0;
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Switch segments if we reached the end of the current segment.
+ */
+ if (dst_pageno % SLRU_PAGES_PER_SEGMENT == 0)
+ {
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ dst_segno++;
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+ }
+ }
+
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ pg_free(src_buf);
+ close(src_fd);
+}
+
+void
+upgrade_xact_cache(const char *src_subdir, const char *dst_subdir)
+{
+ char src_dir[MAXPGPATH];
+ char dst_dir[MAXPGPATH];
+
+ DIR *src_dirp;
+ struct dirent *src_dirent;
+
+ snprintf(src_dir, sizeof(src_dir), "%s/%s", old_cluster.pgdata, src_subdir);
+ snprintf(dst_dir, sizeof(dst_dir), "%s/%s", new_cluster.pgdata, dst_subdir);
+
+ if ((src_dirp = opendir(src_dir)) == NULL)
+ pg_fatal("could not open directory \"%s\": %s", src_dir, strerror(errno));
+
+ while (errno = 0, (src_dirent = readdir(src_dirp)) != NULL)
+ {
+ if (segment_file_filter(src_dirent))
+ upgrade_file(src_dir, src_dirent->d_name, dst_dir);
+ }
+
+ if (closedir(src_dirp) != 0)
+ pg_fatal("could not close directory \"%s\": %s", src_dir, strerror(errno));
+}
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 3960af4036..2b7d01058e 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -660,14 +660,23 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir)
static void
copy_xact_xlog_xid(void)
{
+ bool slru_header_changed = false;
+
/*
* Copy old commit logs to new data dir. pg_clog has been renamed to
* pg_xact in post-10 clusters.
*/
- copy_subdir_files(GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact",
- GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact");
+ char *xact_old_dir = GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+ char *xact_new_dir = GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+
+ if (new_cluster.controldata.cat_ver >= SLRU_PAGE_HEADER_CAT_VER &&
+ old_cluster.controldata.cat_ver < SLRU_PAGE_HEADER_CAT_VER)
+ slru_header_changed = true;
+
+ if (slru_header_changed)
+ upgrade_xact_cache(xact_old_dir, xact_new_dir);
+ else
+ copy_subdir_files(xact_old_dir, xact_new_dir);
prep_status("Setting oldest XID for new cluster");
exec_prog(UTILITY_LOG_FILE, NULL, true, true,
@@ -702,7 +711,8 @@ copy_xact_xlog_xid(void)
* server doesn't attempt to read multis older than the cutoff value.
*/
if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
- new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
+ !slru_header_changed)
{
copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
copy_subdir_files("pg_multixact/members", "pg_multixact/members");
@@ -722,7 +732,8 @@ copy_xact_xlog_xid(void)
new_cluster.pgdata);
check_ok();
}
- else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER ||
+ slru_header_changed)
{
/*
* Remove offsets/0000 file created by initdb that no longer matches
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index d63f13fffc..c955e903a5 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -115,6 +115,11 @@ extern char *output_files[];
*/
#define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
+/*
+ * A page header was added to each SLRU page in 17.0.
+ */
+#define SLRU_PAGE_HEADER_CAT_VER 202312091
+
/*
* large object chunk size added to pg_controldata,
* commit 5f93c37805e7485488480916b4585e098d3cc883
@@ -406,6 +411,7 @@ void rewriteVisibilityMap(const char *fromfile, const char *tofile,
const char *schemaName, const char *relName);
void check_file_clone(void);
void check_hard_link(void);
+void upgrade_xact_cache(const char *src_subdir, const char *dst_subdir);
/* fopen_priv() is no longer different from fopen() */
#define fopen_priv(path, mode) fopen(path, mode)
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 091e2202c9..24733166b8 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -68,17 +68,6 @@ typedef struct SlruSharedData
int *page_lru_count;
LWLockPadded *buffer_locks;
- /*
- * Optional array of WAL flush LSNs associated with entries in the SLRU
- * pages. If not zero/NULL, we must flush WAL before writing pages (true
- * for pg_xact, false for multixact, pg_subtrans, pg_notify). group_lsn[]
- * has lsn_groups_per_page entries per buffer slot, each containing the
- * highest LSN known for a contiguous group of SLRU entries on that slot's
- * page.
- */
- XLogRecPtr *group_lsn;
- int lsn_groups_per_page;
-
/*----------
* We mark a page "most recently used" by setting
* page_lru_count[slotno] = ++cur_lru_count;
@@ -147,8 +136,8 @@ typedef struct SlruCtlData
typedef SlruCtlData *SlruCtl;
-extern Size SimpleLruShmemSize(int nslots, int nlsns);
-extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
+extern Size SimpleLruShmemSize(int nslots);
+extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
LWLock *ctllock, const char *subdir, int tranche_id,
SyncRequestHandler sync_handler,
bool long_segment_names);
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index b3fdbd9066..be4b8e1c7e 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202401021
+#define CATALOG_VERSION_NO 202401031
#endif
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 424ecba028..251d9523fa 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -201,6 +201,7 @@ typedef PageHeaderData *PageHeader;
* handling pages.
*/
#define PG_PAGE_LAYOUT_VERSION 4
+#define PG_SLRU_PAGE_LAYOUT_VERSION 1
#define PG_DATA_CHECKSUM_VERSION 1
/* ----------------------------------------------------------------
@@ -257,6 +258,11 @@ PageGetContents(Page page)
return (char *) page + MAXALIGN(SizeOfPageHeaderData);
}
+/*
+ * Space available for storing page contents.
+ */
+#define SizeOfPageContents (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
/* ----------------
* functions to access page size info
* ----------------
@@ -486,6 +492,7 @@ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
"BLCKSZ has to be a multiple of sizeof(size_t)");
extern void PageInit(Page page, Size pageSize, Size specialSize);
+extern void PageInitSLRU(Page page, Size pageSize, Size specialSize);
extern bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags);
extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
OffsetNumber offsetNumber, int flags);
diff --git a/src/test/modules/test_slru/test_slru.c b/src/test/modules/test_slru/test_slru.c
index d0fb9444e8..aa8a29d92e 100644
--- a/src/test/modules/test_slru/test_slru.c
+++ b/src/test/modules/test_slru/test_slru.c
@@ -17,6 +17,7 @@
#include "access/slru.h"
#include "access/transam.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
@@ -76,8 +77,8 @@ test_slru_page_write(PG_FUNCTION_ARGS)
TestSlruCtl->shared->page_status[slotno] = SLRU_PAGE_VALID;
/* write given data to the page, up to the limit of the page */
- strncpy(TestSlruCtl->shared->page_buffer[slotno], data,
- BLCKSZ - 1);
+ strncpy(PageGetContents(TestSlruCtl->shared->page_buffer[slotno]), data,
+ SizeOfPageContents - 1);
SimpleLruWritePage(TestSlruCtl, slotno);
LWLockRelease(TestSLRULock);
@@ -104,7 +105,7 @@ test_slru_page_read(PG_FUNCTION_ARGS)
LWLockAcquire(TestSLRULock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(TestSlruCtl, pageno,
write_ok, InvalidTransactionId);
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(TestSLRULock);
PG_RETURN_TEXT_P(cstring_to_text(data));
@@ -122,7 +123,7 @@ test_slru_page_readonly(PG_FUNCTION_ARGS)
pageno,
InvalidTransactionId);
Assert(LWLockHeldByMe(TestSLRULock));
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(TestSLRULock);
PG_RETURN_TEXT_P(cstring_to_text(data));
@@ -202,7 +203,7 @@ test_slru_shmem_request(void)
prev_shmem_request_hook();
/* reserve shared memory for the test SLRU */
- RequestAddinShmemSpace(SimpleLruShmemSize(NUM_TEST_BUFFERS, 0));
+ RequestAddinShmemSpace(SimpleLruShmemSize(NUM_TEST_BUFFERS));
}
static bool
@@ -238,7 +239,7 @@ test_slru_shmem_startup(void)
TestSlruCtl->PagePrecedes = test_slru_page_precedes_logically;
SimpleLruInit(TestSlruCtl, "TestSLRU",
- NUM_TEST_BUFFERS, 0, TestSLRULock, slru_dir_name,
+ NUM_TEST_BUFFERS, TestSLRULock, slru_dir_name,
test_tranche_id, SYNC_HANDLER_NONE, long_segment_names);
}
Rebase the patch against the latest HEAD.
Regards,
Yong
Attachments:
slru_page_header_v4.patchapplication/octet-stream; name=slru_page_header_v4.patchDownload
src/backend/access/transam/clog.c | 52 +++++-----
src/backend/access/transam/commit_ts.c | 26 +++--
src/backend/access/transam/multixact.c | 63 +++++++-----
src/backend/access/transam/slru.c | 113 +++++++---------------
src/backend/access/transam/subtrans.c | 12 +--
src/backend/commands/async.c | 27 +++---
src/backend/storage/lmgr/predicate.c | 16 +--
src/backend/storage/page/bufpage.c | 25 +++++
src/bin/pg_checksums/pg_checksums.c | 9 ++
src/bin/pg_resetwal/t/001_basic.pl | 6 +-
src/bin/pg_upgrade/file.c | 172 +++++++++++++++++++++++++++++++++
src/bin/pg_upgrade/pg_upgrade.c | 23 +++--
src/bin/pg_upgrade/pg_upgrade.h | 6 ++
src/include/access/slru.h | 15 +--
src/include/catalog/catversion.h | 2 +-
src/include/storage/bufpage.h | 7 ++
src/test/modules/test_slru/test_slru.c | 13 +--
17 files changed, 395 insertions(+), 192 deletions(-)
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index f6e7da7ffc..ebfc6e1ef8 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -41,6 +41,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/proc.h"
#include "storage/sync.h"
@@ -59,7 +60,7 @@
/* We need two bits per xact, so four xacts fit in a byte */
#define CLOG_BITS_PER_XACT 2
#define CLOG_XACTS_PER_BYTE 4
-#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+#define CLOG_XACTS_PER_PAGE (SizeOfPageContents * CLOG_XACTS_PER_BYTE)
#define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1)
@@ -77,13 +78,6 @@ TransactionIdToPage(TransactionId xid)
#define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE)
#define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE)
-/* We store the latest async LSN for each group of transactions */
-#define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */
-#define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)
-
-#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \
- ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
-
/*
* The number of subtransactions below which we consider to apply clog group
* update optimization. Testing reveals that the number higher than this can
@@ -101,7 +95,7 @@ static SlruCtlData XactCtlData;
static int ZeroCLOGPage(int64 pageno, bool writeXlog);
static bool CLOGPagePrecedes(int64 page1, int64 page2);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact,
Oid oldestXactDb);
static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
@@ -583,8 +577,9 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
char *byteptr;
char byteval;
char curval;
+ Page page = XactCtl->shared->page_buffer[slotno];
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ byteptr = PageGetContents(page) + byteno;
curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
/*
@@ -613,7 +608,7 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*byteptr = byteval;
/*
- * Update the group LSN if the transaction completion LSN is higher.
+ * Update the page LSN if the transaction completion LSN is higher.
*
* Note: lsn will be invalid when supplied during InRecovery processing,
* so we don't need to do anything special to avoid LSN updates during
@@ -622,10 +617,8 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*/
if (!XLogRecPtrIsInvalid(lsn))
{
- int lsnindex = GetLSNIndex(slotno, xid);
-
- if (XactCtl->shared->group_lsn[lsnindex] < lsn)
- XactCtl->shared->group_lsn[lsnindex] = lsn;
+ if (PageGetLSN(page) < lsn)
+ PageSetLSN(page, lsn);
}
}
@@ -651,19 +644,19 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
int byteno = TransactionIdToByte(xid);
int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
int slotno;
- int lsnindex;
+ Page page;
char *byteptr;
XidStatus status;
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ page = XactCtl->shared->page_buffer[slotno];
+ byteptr = PageGetContents(page) + byteno;
status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
- lsnindex = GetLSNIndex(slotno, xid);
- *lsn = XactCtl->shared->group_lsn[lsnindex];
+ *lsn = PageGetLSN(page);
LWLockRelease(XactSLRULock);
@@ -698,14 +691,14 @@ CLOGShmemBuffers(void)
Size
CLOGShmemSize(void)
{
- return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE);
+ return SimpleLruShmemSize(CLOGShmemBuffers());
}
void
CLOGShmemInit(void)
{
XactCtl->PagePrecedes = CLOGPagePrecedes;
- SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
+ SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(),
XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER,
SYNC_HANDLER_CLOG, false);
SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE);
@@ -747,11 +740,17 @@ static int
ZeroCLOGPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(XactCtl, pageno);
+ page = XactCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -807,12 +806,12 @@ TrimCLOG(void)
char *byteptr;
slotno = SimpleLruReadPage(XactCtl, pageno, false, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ byteptr = PageGetContents(XactCtl->shared->page_buffer[slotno]) + byteno;
/* Zero so-far-unused positions in the current byte */
*byteptr &= (1 << bshift) - 1;
/* Zero the rest of the page */
- MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
+ MemSet(byteptr + 1, 0, SizeOfPageContents - byteno - 1);
XactCtl->shared->page_dirty[slotno] = true;
}
@@ -836,7 +835,6 @@ CheckPointCLOG(void)
TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that CLOG has room for a newly-allocated XID.
*
@@ -958,12 +956,12 @@ CLOGPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
+ return XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index 61b82385f3..cc9388f5d2 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -31,6 +31,7 @@
#include "funcapi.h"
#include "miscadmin.h"
#include "pg_trace.h"
+#include "storage/bufpage.h"
#include "storage/shmem.h"
#include "utils/builtins.h"
#include "utils/snapmgr.h"
@@ -63,7 +64,7 @@ typedef struct CommitTimestampEntry
sizeof(RepOriginId))
#define COMMIT_TS_XACTS_PER_PAGE \
- (BLCKSZ / SizeOfCommitTimestampEntry)
+ (SizeOfPageContents / SizeOfCommitTimestampEntry)
/*
@@ -120,7 +121,7 @@ static int ZeroCommitTsPage(int64 pageno, bool writeXlog);
static bool CommitTsPagePrecedes(int64 page1, int64 page2);
static void ActivateCommitTs(void);
static void DeactivateCommitTs(void);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid);
/*
@@ -254,11 +255,12 @@ TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
CommitTimestampEntry entry;
Assert(TransactionIdIsNormal(xid));
+ Assert(xid == slotno * COMMIT_TS_XACTS_PER_PAGE + entryno);
entry.time = ts;
entry.nodeid = nodeid;
- memcpy(CommitTsCtl->shared->page_buffer[slotno] +
+ memcpy(PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
SizeOfCommitTimestampEntry * entryno,
&entry, SizeOfCommitTimestampEntry);
}
@@ -337,7 +339,7 @@ TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts,
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid);
memcpy(&entry,
- CommitTsCtl->shared->page_buffer[slotno] +
+ PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
SizeOfCommitTimestampEntry * entryno,
SizeOfCommitTimestampEntry);
@@ -515,7 +517,7 @@ CommitTsShmemBuffers(void)
Size
CommitTsShmemSize(void)
{
- return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) +
+ return SimpleLruShmemSize(CommitTsShmemBuffers()) +
sizeof(CommitTimestampShared);
}
@@ -529,7 +531,7 @@ CommitTsShmemInit(void)
bool found;
CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
- SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0,
+ SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(),
CommitTsSLRULock, "pg_commit_ts",
LWTRANCHE_COMMITTS_BUFFER,
SYNC_HANDLER_COMMIT_TS,
@@ -582,11 +584,17 @@ static int
ZeroCommitTsPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(CommitTsCtl, pageno);
+ page = CommitTsCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -946,12 +954,12 @@ CommitTsPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
+ return XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 59523be901..af6563c889 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -83,6 +83,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "postmaster/autovacuum.h"
+#include "storage/bufpage.h"
#include "storage/lmgr.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
@@ -106,7 +107,7 @@
*/
/* We need four bytes per offset */
-#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
+#define MULTIXACT_OFFSETS_PER_PAGE (SizeOfPageContents / sizeof(MultiXactOffset))
#define MultiXactIdToOffsetPage(xid) \
((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
@@ -119,8 +120,8 @@
* additional flag bits for each TransactionId. To do this without getting
* into alignment issues, we store four bytes of flags, and then the
* corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
- * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
- * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
+ * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 408 groups
+ * per page. This wastes 8 bytes per page, but that's OK -- simplicity (and
* performance) trumps space efficiency here.
*
* Note that the "offset" macros work with byte offset, not array indexes, so
@@ -138,7 +139,7 @@
/* size in bytes of a complete group */
#define MULTIXACT_MEMBERGROUP_SIZE \
(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
-#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (SizeOfPageContents / MULTIXACT_MEMBERGROUP_SIZE)
#define MULTIXACT_MEMBERS_PER_PAGE \
(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
@@ -366,7 +367,7 @@ static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
MultiXactOffset start, uint32 distance);
static bool SetOffsetVacuumLimit(bool is_startup);
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
-static void WriteMZeroPageXlogRec(int64 pageno, uint8 info);
+static XLogRecPtr WriteMZeroPageXlogRec(int64 pageno, uint8 info);
static void WriteMTruncateXlogRec(Oid oldestMultiDB,
MultiXactId startTruncOff,
MultiXactId endTruncOff,
@@ -884,7 +885,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
* take the trouble to generalize the slru.c error reporting code.
*/
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
*offptr = offset;
@@ -921,12 +922,12 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
}
memberptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
*memberptr = members[i].xid;
flagsptr = (uint32 *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
flagsval = *flagsptr;
flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
@@ -1348,7 +1349,7 @@ retry:
entryno = MultiXactIdToOffsetEntry(multi);
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
@@ -1381,7 +1382,7 @@ retry:
if (pageno != prev_pageno)
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
nextMXOffset = *offptr;
@@ -1424,7 +1425,7 @@ retry:
}
xactptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
if (!TransactionIdIsValid(*xactptr))
{
@@ -1435,7 +1436,7 @@ retry:
flagsoff = MXOffsetToFlagsOffset(offset);
bshift = MXOffsetToFlagsBitShift(offset);
- flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ flagsptr = (uint32 *) (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
ptr[truelength].xid = *xactptr;
ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
@@ -1834,8 +1835,8 @@ MultiXactShmemSize(void)
mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
size = SHARED_MULTIXACT_STATE_SIZE;
- size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS, 0));
- size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS));
+ size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS));
return size;
}
@@ -1851,14 +1852,14 @@ MultiXactShmemInit(void)
MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
SimpleLruInit(MultiXactOffsetCtl,
- "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0,
+ "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS,
MultiXactOffsetSLRULock, "pg_multixact/offsets",
LWTRANCHE_MULTIXACTOFFSET_BUFFER,
SYNC_HANDLER_MULTIXACT_OFFSET,
false);
SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
SimpleLruInit(MultiXactMemberCtl,
- "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0,
+ "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS,
MultiXactMemberSLRULock, "pg_multixact/members",
LWTRANCHE_MULTIXACTMEMBER_BUFFER,
SYNC_HANDLER_MULTIXACT_MEMBER,
@@ -1933,11 +1934,17 @@ static int
ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
+ page = MultiXactOffsetCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -1949,11 +1956,17 @@ static int
ZeroMultiXactMemberPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
+ page = MultiXactMemberCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -2071,10 +2084,10 @@ TrimMultiXact(void)
MultiXactOffset *offptr;
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
- MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
+ MemSet(offptr, 0, SizeOfPageContents - (entryno * sizeof(MultiXactOffset)));
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
}
@@ -2104,9 +2117,9 @@ TrimMultiXact(void)
memberoff = MXOffsetToMemberOffset(offset);
slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
xidptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
- MemSet(xidptr, 0, BLCKSZ - memberoff);
+ MemSet(xidptr, 0, SizeOfPageContents - memberoff);
/*
* Note: we don't need to zero out the flag bits in the remaining
@@ -2758,7 +2771,7 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
LWLockRelease(MultiXactOffsetSLRULock);
@@ -3192,12 +3205,12 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
* Write an xlog record reflecting the zeroing of either a MEMBERs or
* OFFSETs page (info shows which)
*/
-static void
+static XLogRecPtr
WriteMZeroPageXlogRec(int64 pageno, uint8 info)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_MULTIXACT_ID, info);
+ return XLogInsert(RM_MULTIXACT_ID, info);
}
/*
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 9ac4790f16..c42a44fcaa 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -57,6 +57,7 @@
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/shmem.h"
@@ -154,13 +155,13 @@ typedef enum
SLRU_WRITE_FAILED,
SLRU_FSYNC_FAILED,
SLRU_CLOSE_FAILED,
+ SLRU_DATA_CORRUPTED,
} SlruErrorCause;
static SlruErrorCause slru_errcause;
static int slru_errno;
-static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno);
@@ -179,7 +180,7 @@ static void SlruInternalDeleteSegment(SlruCtl ctl, int64 segno);
*/
Size
-SimpleLruShmemSize(int nslots, int nlsns)
+SimpleLruShmemSize(int nslots)
{
Size sz;
@@ -192,9 +193,6 @@ SimpleLruShmemSize(int nslots, int nlsns)
sz += MAXALIGN(nslots * sizeof(int)); /* page_lru_count[] */
sz += MAXALIGN(nslots * sizeof(LWLockPadded)); /* buffer_locks[] */
- if (nlsns > 0)
- sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
-
return BUFFERALIGN(sz) + BLCKSZ * nslots;
}
@@ -204,14 +202,13 @@ SimpleLruShmemSize(int nslots, int nlsns)
* ctl: address of local (unshared) control structure.
* name: name of SLRU. (This is user-visible, pick with care!)
* nslots: number of page slots to use.
- * nlsns: number of LSN groups per page (set to zero if not relevant).
* ctllock: LWLock to use to control access to the shared control structure.
* subdir: PGDATA-relative subdirectory that will contain the files.
* tranche_id: LWLock tranche ID to use for the SLRU's per-buffer LWLocks.
* sync_handler: which set of functions to use to handle sync requests
*/
void
-SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
+SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
LWLock *ctllock, const char *subdir, int tranche_id,
SyncRequestHandler sync_handler, bool long_segment_names)
{
@@ -219,7 +216,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
bool found;
shared = (SlruShared) ShmemInitStruct(name,
- SimpleLruShmemSize(nslots, nlsns),
+ SimpleLruShmemSize(nslots),
&found);
if (!IsUnderPostmaster)
@@ -236,7 +233,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
shared->ControlLock = ctllock;
shared->num_slots = nslots;
- shared->lsn_groups_per_page = nlsns;
shared->cur_lru_count = 0;
@@ -261,12 +257,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
shared->buffer_locks = (LWLockPadded *) (ptr + offset);
offset += MAXALIGN(nslots * sizeof(LWLockPadded));
- if (nlsns > 0)
- {
- shared->group_lsn = (XLogRecPtr *) (ptr + offset);
- offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
- }
-
ptr += BUFFERALIGN(offset);
for (slotno = 0; slotno < nslots; slotno++)
{
@@ -281,7 +271,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
}
/* Should fit to estimated shmem size */
- Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
+ Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots));
}
else
Assert(found);
@@ -323,11 +313,8 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
shared->page_dirty[slotno] = true;
SlruRecentlyUsed(shared, slotno);
- /* Set the buffer to zeroes */
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
-
- /* Set the LSNs for this new page to zero */
- SimpleLruZeroLSNs(ctl, slotno);
+ /* Initialize the page. */
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
/* Assume this page is now the latest active page */
shared->latest_page_number = pageno;
@@ -338,26 +325,6 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
return slotno;
}
-/*
- * Zero all the LSNs we store for this slru page.
- *
- * This should be called each time we create a new page, and each time we read
- * in a page from disk into an existing buffer. (Such an old page cannot
- * have any interesting LSNs, since we'd have flushed them before writing
- * the page in the first place.)
- *
- * This assumes that InvalidXLogRecPtr is bitwise-all-0.
- */
-static void
-SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
-{
- SlruShared shared = ctl->shared;
-
- if (shared->lsn_groups_per_page > 0)
- MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
- shared->lsn_groups_per_page * sizeof(XLogRecPtr));
-}
-
/*
* Wait for any active I/O on a page slot to finish. (This does not
* guarantee that new I/O hasn't been started before we return, though.
@@ -478,9 +445,6 @@ SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok,
/* Do the read */
ok = SlruPhysicalReadPage(ctl, pageno, slotno);
- /* Set the LSNs for this newly read-in page to zero */
- SimpleLruZeroLSNs(ctl, slotno);
-
/* Re-acquire control lock and update page state */
LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
@@ -740,7 +704,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
ereport(LOG,
(errmsg("file \"%s\" doesn't exist, reading as zeroes",
path)));
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
return true;
}
@@ -763,6 +727,13 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
return false;
}
+ if (!PageIsVerifiedExtended(shared->page_buffer[slotno], pageno, PIV_REPORT_STAT))
+ {
+ slru_errcause = SLRU_DATA_CORRUPTED;
+ slru_errno = 0;
+ return false;
+ }
+
return true;
}
@@ -789,6 +760,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
off_t offset = rpageno * BLCKSZ;
char path[MAXPGPATH];
int fd = -1;
+ Page page = shared->page_buffer[slotno];
+ XLogRecPtr lsn;
/* update the stats counter of written pages */
pgstat_count_slru_page_written(shared->slru_stats_idx);
@@ -798,41 +771,18 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
* write out data before associated WAL records. This is the same action
* performed during FlushBuffer() in the main buffer manager.
*/
- if (shared->group_lsn != NULL)
+ lsn = PageGetLSN(page);
+ if (!XLogRecPtrIsInvalid(lsn))
{
/*
- * We must determine the largest async-commit LSN for the page. This
- * is a bit tedious, but since this entire function is a slow path
- * anyway, it seems better to do this here than to maintain a per-page
- * LSN variable (which'd need an extra comparison in the
- * transaction-commit path).
+ * As noted above, elog(ERROR) is not acceptable here, so if
+ * XLogFlush were to fail, we must PANIC. This isn't much of a
+ * restriction because XLogFlush is just about all critical
+ * section anyway, but let's make sure.
*/
- XLogRecPtr max_lsn;
- int lsnindex,
- lsnoff;
-
- lsnindex = slotno * shared->lsn_groups_per_page;
- max_lsn = shared->group_lsn[lsnindex++];
- for (lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
- {
- XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
-
- if (max_lsn < this_lsn)
- max_lsn = this_lsn;
- }
-
- if (!XLogRecPtrIsInvalid(max_lsn))
- {
- /*
- * As noted above, elog(ERROR) is not acceptable here, so if
- * XLogFlush were to fail, we must PANIC. This isn't much of a
- * restriction because XLogFlush is just about all critical
- * section anyway, but let's make sure.
- */
- START_CRIT_SECTION();
- XLogFlush(max_lsn);
- END_CRIT_SECTION();
- }
+ START_CRIT_SECTION();
+ XLogFlush(lsn);
+ END_CRIT_SECTION();
}
/*
@@ -899,6 +849,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
}
}
+ PageSetChecksumInplace(shared->page_buffer[slotno], pageno);
+
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
@@ -1019,6 +971,13 @@ SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
errdetail("Could not close file \"%s\": %m.",
path)));
break;
+ case SLRU_DATA_CORRUPTED:
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("Invalid page from file \"%s\" at offset %d.",
+ path, offset)));
+ break;
default:
/* can't get here, we trust */
elog(ERROR, "unrecognized SimpleLru error cause: %d",
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index b2ed82ac56..a18b357fe2 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -32,6 +32,7 @@
#include "access/subtrans.h"
#include "access/transam.h"
#include "pg_trace.h"
+#include "storage/bufpage.h"
#include "utils/snapmgr.h"
@@ -49,7 +50,7 @@
*/
/* We need four bytes per xact */
-#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId))
+#define SUBTRANS_XACTS_PER_PAGE (SizeOfPageContents / sizeof(TransactionId))
/*
* Although we return an int64 the actual value can't currently exceed
@@ -93,7 +94,7 @@ SubTransSetParent(TransactionId xid, TransactionId parent)
LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
/*
@@ -133,7 +134,7 @@ SubTransGetParent(TransactionId xid)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
parent = *ptr;
@@ -193,14 +194,14 @@ SubTransGetTopmostTransaction(TransactionId xid)
Size
SUBTRANSShmemSize(void)
{
- return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0);
+ return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS);
}
void
SUBTRANSShmemInit(void)
{
SubTransCtl->PagePrecedes = SubTransPagePrecedes;
- SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0,
+ SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS,
SubtransSLRULock, "pg_subtrans",
LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE,
false);
@@ -305,7 +306,6 @@ CheckPointSUBTRANS(void)
TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that SUBTRANS has room for a newly-allocated XID.
*
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 8b24b22293..d3b8d54903 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -140,6 +140,7 @@
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
@@ -162,7 +163,7 @@
* than that, so changes in that data structure won't affect user-visible
* restrictions.
*/
-#define NOTIFY_PAYLOAD_MAX_LENGTH (BLCKSZ - NAMEDATALEN - 128)
+#define NOTIFY_PAYLOAD_MAX_LENGTH (SizeOfPageContents - NAMEDATALEN - 128)
/*
* Struct representing an entry in the global notify queue
@@ -311,7 +312,7 @@ static SlruCtlData NotifyCtlData;
#define NotifyCtl (&NotifyCtlData)
#define QUEUE_PAGESIZE BLCKSZ
-
+#define QUEUE_PAGE_CAPACITY (QUEUE_PAGESIZE - MAXALIGN(SizeOfPageHeaderData))
#define QUEUE_FULL_WARN_INTERVAL 5000 /* warn at most once every 5s */
/*
@@ -492,7 +493,7 @@ AsyncShmemSize(void)
size = mul_size(MaxBackends + 1, sizeof(QueueBackendStatus));
size = add_size(size, offsetof(AsyncQueueControl, backend));
- size = add_size(size, SimpleLruShmemSize(NUM_NOTIFY_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_NOTIFY_BUFFERS));
return size;
}
@@ -541,7 +542,7 @@ AsyncShmemInit(void)
* names are used in order to avoid wraparound.
*/
NotifyCtl->PagePrecedes = asyncQueuePagePrecedes;
- SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS, 0,
+ SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS,
NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER,
SYNC_HANDLER_NONE, true);
@@ -1301,14 +1302,14 @@ asyncQueueAdvance(volatile QueuePosition *position, int entryLength)
* written or read.
*/
offset += entryLength;
- Assert(offset <= QUEUE_PAGESIZE);
+ Assert(offset <= QUEUE_PAGE_CAPACITY);
/*
* In a second step check if another entry can possibly be written to the
* page. If so, stay here, we have reached the next position. If not, then
* we need to move on to the next page.
*/
- if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGESIZE)
+ if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGE_CAPACITY)
{
pageno++;
offset = 0;
@@ -1408,7 +1409,7 @@ asyncQueueAddEntries(ListCell *nextNotify)
offset = QUEUE_POS_OFFSET(queue_head);
/* Check whether the entry really fits on the current page */
- if (offset + qe.length <= QUEUE_PAGESIZE)
+ if (offset + qe.length <= QUEUE_PAGE_CAPACITY)
{
/* OK, so advance nextNotify past this item */
nextNotify = lnext(pendingNotifies->events, nextNotify);
@@ -1420,14 +1421,14 @@ asyncQueueAddEntries(ListCell *nextNotify)
* only check dboid and since it won't match any reader's database
* OID, they will ignore this entry and move on.
*/
- qe.length = QUEUE_PAGESIZE - offset;
+ qe.length = QUEUE_PAGE_CAPACITY - offset;
qe.dboid = InvalidOid;
qe.data[0] = '\0'; /* empty channel */
qe.data[1] = '\0'; /* empty payload */
}
/* Now copy qe into the shared buffer page */
- memcpy(NotifyCtl->shared->page_buffer[slotno] + offset,
+ memcpy(PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + offset,
&qe,
qe.length);
@@ -1947,10 +1948,10 @@ asyncQueueReadAllNotifications(void)
else
{
/* fetch all the rest of the page */
- copysize = QUEUE_PAGESIZE - curoffset;
+ copysize = QUEUE_PAGE_CAPACITY - curoffset;
}
- memcpy(page_buffer.buf + curoffset,
- NotifyCtl->shared->page_buffer[slotno] + curoffset,
+ memcpy(PageGetContents(page_buffer.buf) + curoffset,
+ PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + curoffset,
copysize);
/* Release lock that we got from SimpleLruReadPage_ReadOnly() */
LWLockRelease(NotifySLRULock);
@@ -2021,7 +2022,7 @@ asyncQueueProcessPageEntries(volatile QueuePosition *current,
if (QUEUE_POS_EQUAL(thisentry, stop))
break;
- qe = (AsyncQueueEntry *) (page_buffer + QUEUE_POS_OFFSET(thisentry));
+ qe = (AsyncQueueEntry *) (PageGetContents(page_buffer) + QUEUE_POS_OFFSET(thisentry));
/*
* Advance *current over this message, possibly to the next page. As
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index ee5ea1175c..ba9ecbe524 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -204,6 +204,7 @@
#include "pgstat.h"
#include "port/pg_lfind.h"
#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
#include "storage/predicate.h"
#include "storage/predicate_internals.h"
#include "storage/proc.h"
@@ -322,8 +323,8 @@ static SlruCtlData SerialSlruCtlData;
#define SerialSlruCtl (&SerialSlruCtlData)
#define SERIAL_PAGESIZE BLCKSZ
-#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
-#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE)
+#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
+#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE - MAXALIGN(SizeOfPageHeaderData) / SERIAL_ENTRYSIZE)
/*
* Set maximum pages based on the number needed to track all transactions.
@@ -333,7 +334,7 @@ static SlruCtlData SerialSlruCtlData;
#define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1)
#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \
- (SerialSlruCtl->shared->page_buffer[slotno] + \
+ (PageGetContents(SerialSlruCtl->shared->page_buffer[slotno]) + \
((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE))))
#define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE)
@@ -785,10 +786,13 @@ SerialPagePrecedesLogicallyUnitTests(void)
* requires burning ~2B XIDs in single-user mode, a negligible
* possibility. Moreover, if it does happen, the consequence would be
* mild, namely a new transaction failing in SimpleLruReadPage().
+ *
+ * NOTE: After adding the page header, the defect affects two pages.
+ * We now assert correct treatment of its second to prior page.
*/
headPage = oldestPage;
targetPage = newestPage;
- Assert(SerialPagePrecedesLogically(headPage, targetPage - 1));
+ Assert(SerialPagePrecedesLogically(headPage, targetPage - 2));
#if 0
Assert(SerialPagePrecedesLogically(headPage, targetPage));
#endif
@@ -808,7 +812,7 @@ SerialInit(void)
*/
SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
SimpleLruInit(SerialSlruCtl, "Serial",
- NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
+ NUM_SERIAL_BUFFERS, SerialSLRULock, "pg_serial",
LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE,
false);
#ifdef USE_ASSERT_CHECKING
@@ -1348,7 +1352,7 @@ PredicateLockShmemSize(void)
/* Shared memory structures for SLRU tracking of old committed xids. */
size = add_size(size, sizeof(SerialControlData));
- size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS, 0));
+ size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS));
return size;
}
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index be6f1f62d2..e8193d7f56 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -59,6 +59,31 @@ PageInit(Page page, Size pageSize, Size specialSize)
/* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */
}
+/*
+ * PageInitSLRU
+ * Initializes the contents of an SLRU page.
+ * Note that we don't calculate an initial checksum here; that's not done
+ * until it's time to write.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
/*
* PageIsVerifiedExtended
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 6fc1326418..7e6596b8fd 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -16,6 +16,7 @@
#include <dirent.h>
#include <limits.h>
+#include <stdbool.h>
#include <time.h>
#include <sys/stat.h>
#include <unistd.h>
@@ -589,12 +590,20 @@ main(int argc, char *argv[])
{
total_size = scan_directory(DataDir, "global", true);
total_size += scan_directory(DataDir, "base", true);
+ total_size += scan_directory(DataDir, "pg_commit_ts", true);
+ total_size += scan_directory(DataDir, "pg_multixact", true);
+ total_size += scan_directory(DataDir, "pg_serial", true);
total_size += scan_directory(DataDir, "pg_tblspc", true);
+ total_size += scan_directory(DataDir, "pg_xact", true);
}
(void) scan_directory(DataDir, "global", false);
(void) scan_directory(DataDir, "base", false);
+ (void) scan_directory(DataDir, "pg_commit_ts", false);
+ (void) scan_directory(DataDir, "pg_multixact", false);
+ (void) scan_directory(DataDir, "pg_serial", false);
(void) scan_directory(DataDir, "pg_tblspc", false);
+ (void) scan_directory(DataDir, "pg_xact", false);
if (showprogress)
progress_report(true);
diff --git a/src/bin/pg_resetwal/t/001_basic.pl b/src/bin/pg_resetwal/t/001_basic.pl
index 9829e48106..7b9e034e19 100644
--- a/src/bin/pg_resetwal/t/001_basic.pl
+++ b/src/bin/pg_resetwal/t/001_basic.pl
@@ -206,7 +206,7 @@ push @cmd,
sprintf("%d,%d", hex($files[0]) == 0 ? 3 : hex($files[0]), hex($files[-1]));
@files = get_slru_files('pg_multixact/offsets');
-$mult = 32 * $blcksz / 4;
+$mult = 32 * ($blcksz - 24) / 4;
# -m argument is "new,old"
push @cmd, '-m',
sprintf("%d,%d",
@@ -214,11 +214,11 @@ push @cmd, '-m',
hex($files[0]) == 0 ? 1 : hex($files[0] * $mult));
@files = get_slru_files('pg_multixact/members');
-$mult = 32 * int($blcksz / 20) * 4;
+$mult = 32 * int(($blcksz - 24) / 20) * 4;
push @cmd, '-O', (hex($files[-1]) + 1) * $mult;
@files = get_slru_files('pg_xact');
-$mult = 32 * $blcksz * 4;
+$mult = 32 * ($blcksz - 24) * 4;
push @cmd,
'-u', (hex($files[0]) == 0 ? 3 : hex($files[0]) * $mult),
'-x', ((hex($files[-1]) + 1) * $mult);
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index 4850a682cb..64184cecdb 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -9,6 +9,7 @@
#include "postgres_fe.h"
+#include <dirent.h>
#include <sys/stat.h>
#include <fcntl.h>
#ifdef HAVE_COPYFILE_H
@@ -375,3 +376,174 @@ check_hard_link(void)
unlink(new_link_file);
}
+
+
+/*
+ * Copy SLRU_PAGES_PER_SEGMENT from access/slru.h to avoid including it.
+ */
+#define SLRU_PAGES_PER_SEGMENT 32
+
+#define SEGMENT_SIZE (BLCKSZ * SLRU_PAGES_PER_SEGMENT)
+
+/*
+ * Copy PageInitSLRU from storage/bufpage.c to avoid linking to the backend.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
+
+/*
+ * Filter function for scandir(3) to select only segment files.
+ */
+static int
+segment_file_filter(const struct dirent *dirent)
+{
+ return strspn(dirent->d_name, "0123456789ABCDEF") == strlen(dirent->d_name);
+}
+
+static void
+upgrade_file(const char *src_dir, const char *src_file, const char *dst_dir)
+{
+ char src[MAXPGPATH];
+ char dst[MAXPGPATH];
+
+ int seg_name_len;
+ int src_segno;
+ int64 src_pageno;
+ int dst_segno;
+ int64 dst_pageno;
+ int dst_offset;
+
+ int src_fd;
+ int dst_fd;
+
+ char *src_buf;
+ ssize_t src_len;
+ ssize_t src_buf_offset;
+ PGAlignedBlock dst_block;
+ Page page = dst_block.data;
+ int len_to_copy;
+
+ seg_name_len = strlen(src_file);
+ src_segno = (int) strtol(src_file, NULL, 16);
+ src_pageno = src_segno * SLRU_PAGES_PER_SEGMENT;
+
+ dst_pageno = src_pageno * BLCKSZ / SizeOfPageContents;
+ dst_offset = src_pageno * BLCKSZ - dst_pageno * SizeOfPageContents;
+ dst_segno = dst_pageno / SLRU_PAGES_PER_SEGMENT;
+
+ snprintf(src, sizeof(src), "%s/%s", src_dir, src_file);
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+
+ src_buf = pg_malloc(SEGMENT_SIZE);
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) == -1)
+ pg_fatal("could not open file \"%s\": %s", src, strerror(errno));
+ if ((src_len = read(src_fd, src_buf, SEGMENT_SIZE)) == -1)
+ pg_fatal("could not read file \"%s\": %s", src, strerror(errno));
+
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+
+ /*
+ * Read the destination page at dst_pageno into the buffer. The page may contain
+ * data from the previous source segment. Initialize the page if the page is new.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+ if (read(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not read file \"%s\": %s", dst, strerror(errno));
+ if (PageIsNew(page))
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Rewind the file position, so the first write will overwrite the page.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+
+ src_buf_offset = 0;
+ while (src_buf_offset < src_len)
+ {
+ len_to_copy = Min(src_len - src_buf_offset, SizeOfPageContents - dst_offset);
+ memcpy(PageGetContents(page) + dst_offset, src_buf + src_buf_offset, len_to_copy);
+ src_buf_offset += len_to_copy;
+
+ if (new_cluster.controldata.data_checksum_version > 0)
+ ((PageHeader) page)->pd_checksum = pg_checksum_page(page, dst_pageno);
+ if (write(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not write file \"%s\": %s", dst, strerror(errno));
+
+ dst_pageno++;
+ dst_offset = 0;
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Switch segments if we reached the end of the current segment.
+ */
+ if (dst_pageno % SLRU_PAGES_PER_SEGMENT == 0)
+ {
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ dst_segno++;
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+ }
+ }
+
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ pg_free(src_buf);
+ close(src_fd);
+}
+
+void
+upgrade_xact_cache(const char *src_subdir, const char *dst_subdir)
+{
+ char src_dir[MAXPGPATH];
+ char dst_dir[MAXPGPATH];
+
+ DIR *src_dirp;
+ struct dirent *src_dirent;
+
+ snprintf(src_dir, sizeof(src_dir), "%s/%s", old_cluster.pgdata, src_subdir);
+ snprintf(dst_dir, sizeof(dst_dir), "%s/%s", new_cluster.pgdata, dst_subdir);
+
+ if ((src_dirp = opendir(src_dir)) == NULL)
+ pg_fatal("could not open directory \"%s\": %s", src_dir, strerror(errno));
+
+ while (errno = 0, (src_dirent = readdir(src_dirp)) != NULL)
+ {
+ if (segment_file_filter(src_dirent))
+ upgrade_file(src_dir, src_dirent->d_name, dst_dir);
+ }
+
+ if (closedir(src_dirp) != 0)
+ pg_fatal("could not close directory \"%s\": %s", src_dir, strerror(errno));
+}
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 14a36f0503..f630c5b04a 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -660,14 +660,23 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir)
static void
copy_xact_xlog_xid(void)
{
+ bool slru_header_changed = false;
+
/*
* Copy old commit logs to new data dir. pg_clog has been renamed to
* pg_xact in post-10 clusters.
*/
- copy_subdir_files(GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact",
- GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact");
+ char *xact_old_dir = GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+ char *xact_new_dir = GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+
+ if (new_cluster.controldata.cat_ver >= SLRU_PAGE_HEADER_CAT_VER &&
+ old_cluster.controldata.cat_ver < SLRU_PAGE_HEADER_CAT_VER)
+ slru_header_changed = true;
+
+ if (slru_header_changed)
+ upgrade_xact_cache(xact_old_dir, xact_new_dir);
+ else
+ copy_subdir_files(xact_old_dir, xact_new_dir);
prep_status("Setting oldest XID for new cluster");
exec_prog(UTILITY_LOG_FILE, NULL, true, true,
@@ -702,7 +711,8 @@ copy_xact_xlog_xid(void)
* server doesn't attempt to read multis older than the cutoff value.
*/
if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
- new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
+ !slru_header_changed)
{
copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
copy_subdir_files("pg_multixact/members", "pg_multixact/members");
@@ -722,7 +732,8 @@ copy_xact_xlog_xid(void)
new_cluster.pgdata);
check_ok();
}
- else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER ||
+ slru_header_changed)
{
/*
* Remove offsets/0000 file created by initdb that no longer matches
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index a1d08c3dab..edec949378 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -115,6 +115,11 @@ extern char *output_files[];
*/
#define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
+/*
+ * A page header was added to each SLRU page in 17.0.
+ */
+#define SLRU_PAGE_HEADER_CAT_VER 202312091
+
/*
* large object chunk size added to pg_controldata,
* commit 5f93c37805e7485488480916b4585e098d3cc883
@@ -406,6 +411,7 @@ void rewriteVisibilityMap(const char *fromfile, const char *tofile,
const char *schemaName, const char *relName);
void check_file_clone(void);
void check_hard_link(void);
+void upgrade_xact_cache(const char *src_subdir, const char *dst_subdir);
/* fopen_priv() is no longer different from fopen() */
#define fopen_priv(path, mode) fopen(path, mode)
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index b05f6bc71d..4684389fda 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -68,17 +68,6 @@ typedef struct SlruSharedData
int *page_lru_count;
LWLockPadded *buffer_locks;
- /*
- * Optional array of WAL flush LSNs associated with entries in the SLRU
- * pages. If not zero/NULL, we must flush WAL before writing pages (true
- * for pg_xact, false for multixact, pg_subtrans, pg_notify). group_lsn[]
- * has lsn_groups_per_page entries per buffer slot, each containing the
- * highest LSN known for a contiguous group of SLRU entries on that slot's
- * page.
- */
- XLogRecPtr *group_lsn;
- int lsn_groups_per_page;
-
/*----------
* We mark a page "most recently used" by setting
* page_lru_count[slotno] = ++cur_lru_count;
@@ -147,8 +136,8 @@ typedef struct SlruCtlData
typedef SlruCtlData *SlruCtl;
-extern Size SimpleLruShmemSize(int nslots, int nlsns);
-extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
+extern Size SimpleLruShmemSize(int nslots);
+extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
LWLock *ctllock, const char *subdir, int tranche_id,
SyncRequestHandler sync_handler,
bool long_segment_names);
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 460d80ac97..a06f2f0776 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202401131
+#define CATALOG_VERSION_NO 202401161
#endif
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index d0df02d39c..2dc83451a6 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -201,6 +201,7 @@ typedef PageHeaderData *PageHeader;
* handling pages.
*/
#define PG_PAGE_LAYOUT_VERSION 4
+#define PG_SLRU_PAGE_LAYOUT_VERSION 1
#define PG_DATA_CHECKSUM_VERSION 1
/* ----------------------------------------------------------------
@@ -257,6 +258,11 @@ PageGetContents(Page page)
return (char *) page + MAXALIGN(SizeOfPageHeaderData);
}
+/*
+ * Space available for storing page contents.
+ */
+#define SizeOfPageContents (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
/* ----------------
* functions to access page size info
* ----------------
@@ -486,6 +492,7 @@ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
"BLCKSZ has to be a multiple of sizeof(size_t)");
extern void PageInit(Page page, Size pageSize, Size specialSize);
+extern void PageInitSLRU(Page page, Size pageSize, Size specialSize);
extern bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags);
extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
OffsetNumber offsetNumber, int flags);
diff --git a/src/test/modules/test_slru/test_slru.c b/src/test/modules/test_slru/test_slru.c
index 4b31f331ca..4e6ca07b6a 100644
--- a/src/test/modules/test_slru/test_slru.c
+++ b/src/test/modules/test_slru/test_slru.c
@@ -17,6 +17,7 @@
#include "access/slru.h"
#include "access/transam.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
@@ -76,8 +77,8 @@ test_slru_page_write(PG_FUNCTION_ARGS)
TestSlruCtl->shared->page_status[slotno] = SLRU_PAGE_VALID;
/* write given data to the page, up to the limit of the page */
- strncpy(TestSlruCtl->shared->page_buffer[slotno], data,
- BLCKSZ - 1);
+ strncpy(PageGetContents(TestSlruCtl->shared->page_buffer[slotno]), data,
+ SizeOfPageContents - 1);
SimpleLruWritePage(TestSlruCtl, slotno);
LWLockRelease(TestSLRULock);
@@ -104,7 +105,7 @@ test_slru_page_read(PG_FUNCTION_ARGS)
LWLockAcquire(TestSLRULock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(TestSlruCtl, pageno,
write_ok, InvalidTransactionId);
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(TestSLRULock);
PG_RETURN_TEXT_P(cstring_to_text(data));
@@ -122,7 +123,7 @@ test_slru_page_readonly(PG_FUNCTION_ARGS)
pageno,
InvalidTransactionId);
Assert(LWLockHeldByMe(TestSLRULock));
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(TestSLRULock);
PG_RETURN_TEXT_P(cstring_to_text(data));
@@ -202,7 +203,7 @@ test_slru_shmem_request(void)
prev_shmem_request_hook();
/* reserve shared memory for the test SLRU */
- RequestAddinShmemSpace(SimpleLruShmemSize(NUM_TEST_BUFFERS, 0));
+ RequestAddinShmemSpace(SimpleLruShmemSize(NUM_TEST_BUFFERS));
}
static bool
@@ -238,7 +239,7 @@ test_slru_shmem_startup(void)
TestSlruCtl->PagePrecedes = test_slru_page_precedes_logically;
SimpleLruInit(TestSlruCtl, "TestSLRU",
- NUM_TEST_BUFFERS, 0, TestSLRULock, slru_dir_name,
+ NUM_TEST_BUFFERS, TestSLRULock, slru_dir_name,
test_tranche_id, SYNC_HANDLER_NONE, long_segment_names);
}
Rebase the patch against the latest HEAD.
Regards,
Yong
Attachments:
slru_page_header_v5.patchapplication/octet-stream; name=slru_page_header_v5.patchDownload
src/backend/access/transam/clog.c | 52 +++++-----
src/backend/access/transam/commit_ts.c | 26 +++--
src/backend/access/transam/multixact.c | 63 +++++++-----
src/backend/access/transam/slru.c | 112 +++++++--------------
src/backend/access/transam/subtrans.c | 12 +--
src/backend/commands/async.c | 27 +++---
src/backend/storage/lmgr/predicate.c | 16 +--
src/backend/storage/page/bufpage.c | 25 +++++
src/bin/pg_checksums/pg_checksums.c | 9 ++
src/bin/pg_resetwal/t/001_basic.pl | 6 +-
src/bin/pg_upgrade/file.c | 172 +++++++++++++++++++++++++++++++++
src/bin/pg_upgrade/pg_upgrade.c | 23 +++--
src/bin/pg_upgrade/pg_upgrade.h | 6 ++
src/include/access/slru.h | 15 +--
src/include/catalog/catversion.h | 2 +-
src/include/storage/bufpage.h | 7 ++
src/test/modules/test_slru/test_slru.c | 13 +--
17 files changed, 395 insertions(+), 191 deletions(-)
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 44c253246b..4ed4c82f05 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -42,6 +42,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/proc.h"
#include "storage/sync.h"
#include "utils/guc_hooks.h"
@@ -61,7 +62,7 @@
/* We need two bits per xact, so four xacts fit in a byte */
#define CLOG_BITS_PER_XACT 2
#define CLOG_XACTS_PER_BYTE 4
-#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+#define CLOG_XACTS_PER_PAGE (SizeOfPageContents * CLOG_XACTS_PER_BYTE)
#define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1)
/*
@@ -88,13 +89,6 @@ TransactionIdToPage(TransactionId xid)
#define TransactionIdToByte(xid) (TransactionIdToPgIndex(xid) / CLOG_XACTS_PER_BYTE)
#define TransactionIdToBIndex(xid) ((xid) % (TransactionId) CLOG_XACTS_PER_BYTE)
-/* We store the latest async LSN for each group of transactions */
-#define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */
-#define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)
-
-#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \
- ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
-
/*
* The number of subtransactions below which we consider to apply clog group
* update optimization. Testing reveals that the number higher than this can
@@ -112,7 +106,7 @@ static SlruCtlData XactCtlData;
static int ZeroCLOGPage(int64 pageno, bool writeXlog);
static bool CLOGPagePrecedes(int64 page1, int64 page2);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact,
Oid oldestXactDb);
static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
@@ -670,8 +664,9 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(XactCtl,
XactCtl->shared->page_number[slotno]),
LW_EXCLUSIVE));
+ Page page = XactCtl->shared->page_buffer[slotno];
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ byteptr = PageGetContents(page) + byteno;
curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
/*
@@ -700,7 +695,7 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*byteptr = byteval;
/*
- * Update the group LSN if the transaction completion LSN is higher.
+ * Update the page LSN if the transaction completion LSN is higher.
*
* Note: lsn will be invalid when supplied during InRecovery processing,
* so we don't need to do anything special to avoid LSN updates during
@@ -709,10 +704,8 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*/
if (!XLogRecPtrIsInvalid(lsn))
{
- int lsnindex = GetLSNIndex(slotno, xid);
-
- if (XactCtl->shared->group_lsn[lsnindex] < lsn)
- XactCtl->shared->group_lsn[lsnindex] = lsn;
+ if (PageGetLSN(page) < lsn)
+ PageSetLSN(page, lsn);
}
}
@@ -738,19 +731,19 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
int byteno = TransactionIdToByte(xid);
int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
int slotno;
- int lsnindex;
+ Page page;
char *byteptr;
XidStatus status;
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ page = XactCtl->shared->page_buffer[slotno];
+ byteptr = PageGetContents(page) + byteno;
status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
- lsnindex = GetLSNIndex(slotno, xid);
- *lsn = XactCtl->shared->group_lsn[lsnindex];
+ *lsn = PageGetLSN(page);
LWLockRelease(SimpleLruGetBankLock(XactCtl, pageno));
@@ -780,7 +773,7 @@ CLOGShmemBuffers(void)
Size
CLOGShmemSize(void)
{
- return SimpleLruShmemSize(CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE);
+ return SimpleLruShmemSize(CLOGShmemBuffers());
}
void
@@ -808,7 +801,7 @@ CLOGShmemInit(void)
Assert(transaction_buffers != 0);
XactCtl->PagePrecedes = CLOGPagePrecedes;
- SimpleLruInit(XactCtl, "transaction", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
+ SimpleLruInit(XactCtl, "transaction", CLOGShmemBuffers(),
"pg_xact", LWTRANCHE_XACT_BUFFER,
LWTRANCHE_XACT_SLRU, SYNC_HANDLER_CLOG, false);
SlruPagePrecedesUnitTests(XactCtl, CLOG_XACTS_PER_PAGE);
@@ -860,11 +853,17 @@ static int
ZeroCLOGPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(XactCtl, pageno);
+ page = XactCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -917,12 +916,12 @@ TrimCLOG(void)
char *byteptr;
slotno = SimpleLruReadPage(XactCtl, pageno, false, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ byteptr = PageGetContents(XactCtl->shared->page_buffer[slotno]) + byteno;
/* Zero so-far-unused positions in the current byte */
*byteptr &= (1 << bshift) - 1;
/* Zero the rest of the page */
- MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
+ MemSet(byteptr + 1, 0, SizeOfPageContents - byteno - 1);
XactCtl->shared->page_dirty[slotno] = true;
}
@@ -946,7 +945,6 @@ CheckPointCLOG(void)
TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that CLOG has room for a newly-allocated XID.
*
@@ -1070,12 +1068,12 @@ CLOGPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
+ return XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index f221494687..dce06d6cf6 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -29,6 +29,7 @@
#include "access/xlogutils.h"
#include "funcapi.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/shmem.h"
#include "utils/fmgrprotos.h"
#include "utils/guc_hooks.h"
@@ -61,7 +62,7 @@ typedef struct CommitTimestampEntry
sizeof(RepOriginId))
#define COMMIT_TS_XACTS_PER_PAGE \
- (BLCKSZ / SizeOfCommitTimestampEntry)
+ (SizeOfPageContents / SizeOfCommitTimestampEntry)
/*
@@ -118,7 +119,7 @@ static int ZeroCommitTsPage(int64 pageno, bool writeXlog);
static bool CommitTsPagePrecedes(int64 page1, int64 page2);
static void ActivateCommitTs(void);
static void DeactivateCommitTs(void);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid);
/*
@@ -253,11 +254,12 @@ TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
CommitTimestampEntry entry;
Assert(TransactionIdIsNormal(xid));
+ Assert(xid == slotno * COMMIT_TS_XACTS_PER_PAGE + entryno);
entry.time = ts;
entry.nodeid = nodeid;
- memcpy(CommitTsCtl->shared->page_buffer[slotno] +
+ memcpy(PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
SizeOfCommitTimestampEntry * entryno,
&entry, SizeOfCommitTimestampEntry);
}
@@ -336,7 +338,7 @@ TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts,
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid);
memcpy(&entry,
- CommitTsCtl->shared->page_buffer[slotno] +
+ PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
SizeOfCommitTimestampEntry * entryno,
SizeOfCommitTimestampEntry);
@@ -518,7 +520,7 @@ CommitTsShmemBuffers(void)
Size
CommitTsShmemSize(void)
{
- return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) +
+ return SimpleLruShmemSize(CommitTsShmemBuffers()) +
sizeof(CommitTimestampShared);
}
@@ -553,7 +555,7 @@ CommitTsShmemInit(void)
Assert(commit_timestamp_buffers != 0);
CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
- SimpleLruInit(CommitTsCtl, "commit_timestamp", CommitTsShmemBuffers(), 0,
+ SimpleLruInit(CommitTsCtl, "commit_timestamp", CommitTsShmemBuffers(),
"pg_commit_ts", LWTRANCHE_COMMITTS_BUFFER,
LWTRANCHE_COMMITTS_SLRU,
SYNC_HANDLER_COMMIT_TS,
@@ -615,11 +617,17 @@ static int
ZeroCommitTsPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(CommitTsCtl, pageno);
+ page = CommitTsCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -985,12 +993,12 @@ CommitTsPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
+ return XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 83b578dced..8d934c7bb3 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -83,6 +83,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "postmaster/autovacuum.h"
+#include "storage/bufpage.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "storage/procarray.h"
@@ -105,7 +106,7 @@
*/
/* We need four bytes per offset */
-#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
+#define MULTIXACT_OFFSETS_PER_PAGE (SizeOfPageContents / sizeof(MultiXactOffset))
#define MultiXactIdToOffsetPage(xid) \
((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
@@ -118,8 +119,8 @@
* additional flag bits for each TransactionId. To do this without getting
* into alignment issues, we store four bytes of flags, and then the
* corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
- * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
- * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
+ * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 408 groups
+ * per page. This wastes 8 bytes per page, but that's OK -- simplicity (and
* performance) trumps space efficiency here.
*
* Note that the "offset" macros work with byte offset, not array indexes, so
@@ -137,7 +138,7 @@
/* size in bytes of a complete group */
#define MULTIXACT_MEMBERGROUP_SIZE \
(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
-#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (SizeOfPageContents / MULTIXACT_MEMBERGROUP_SIZE)
#define MULTIXACT_MEMBERS_PER_PAGE \
(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
@@ -364,7 +365,7 @@ static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
MultiXactOffset start, uint32 distance);
static bool SetOffsetVacuumLimit(bool is_startup);
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
-static void WriteMZeroPageXlogRec(int64 pageno, uint8 info);
+static XLogRecPtr WriteMZeroPageXlogRec(int64 pageno, uint8 info);
static void WriteMTruncateXlogRec(Oid oldestMultiDB,
MultiXactId startTruncOff,
MultiXactId endTruncOff,
@@ -885,7 +886,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
* take the trouble to generalize the slru.c error reporting code.
*/
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
*offptr = offset;
@@ -934,12 +935,12 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
}
memberptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
*memberptr = members[i].xid;
flagsptr = (uint32 *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
flagsval = *flagsptr;
flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
@@ -1364,7 +1365,7 @@ retry:
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
@@ -1413,7 +1414,7 @@ retry:
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
}
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
nextMXOffset = *offptr;
@@ -1470,7 +1471,7 @@ retry:
}
xactptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
if (!TransactionIdIsValid(*xactptr))
{
@@ -1481,7 +1482,7 @@ retry:
flagsoff = MXOffsetToFlagsOffset(offset);
bshift = MXOffsetToFlagsBitShift(offset);
- flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ flagsptr = (uint32 *) (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
ptr[truelength].xid = *xactptr;
ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
@@ -1880,8 +1881,8 @@ MultiXactShmemSize(void)
mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
size = SHARED_MULTIXACT_STATE_SIZE;
- size = add_size(size, SimpleLruShmemSize(multixact_offset_buffers, 0));
- size = add_size(size, SimpleLruShmemSize(multixact_member_buffers, 0));
+ size = add_size(size, SimpleLruShmemSize(multixact_offset_buffers));
+ size = add_size(size, SimpleLruShmemSize(multixact_member_buffers));
return size;
}
@@ -1897,14 +1898,14 @@ MultiXactShmemInit(void)
MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
SimpleLruInit(MultiXactOffsetCtl,
- "multixact_offset", multixact_offset_buffers, 0,
+ "multixact_offset", multixact_offset_buffers,
"pg_multixact/offsets", LWTRANCHE_MULTIXACTOFFSET_BUFFER,
LWTRANCHE_MULTIXACTOFFSET_SLRU,
SYNC_HANDLER_MULTIXACT_OFFSET,
false);
SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
SimpleLruInit(MultiXactMemberCtl,
- "multixact_member", multixact_member_buffers, 0,
+ "multixact_member", multixact_member_buffers,
"pg_multixact/members", LWTRANCHE_MULTIXACTMEMBER_BUFFER,
LWTRANCHE_MULTIXACTMEMBER_SLRU,
SYNC_HANDLER_MULTIXACT_MEMBER,
@@ -1999,11 +2000,17 @@ static int
ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
+ page = MultiXactOffsetCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -2015,11 +2022,17 @@ static int
ZeroMultiXactMemberPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
+ page = MultiXactMemberCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -2143,10 +2156,10 @@ TrimMultiXact(void)
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
- MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
+ MemSet(offptr, 0, SizeOfPageContents - (entryno * sizeof(MultiXactOffset)));
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
LWLockRelease(lock);
@@ -2177,9 +2190,9 @@ TrimMultiXact(void)
memberoff = MXOffsetToMemberOffset(offset);
slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
xidptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
- MemSet(xidptr, 0, BLCKSZ - memberoff);
+ MemSet(xidptr, 0, SizeOfPageContents - memberoff);
/*
* Note: we don't need to zero out the flag bits in the remaining
@@ -2834,7 +2847,7 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
LWLockRelease(SimpleLruGetBankLock(MultiXactOffsetCtl, pageno));
@@ -3268,12 +3281,12 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
* Write an xlog record reflecting the zeroing of either a MEMBERs or
* OFFSETs page (info shows which)
*/
-static void
+static XLogRecPtr
WriteMZeroPageXlogRec(int64 pageno, uint8 info)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_MULTIXACT_ID, info);
+ return XLogInsert(RM_MULTIXACT_ID, info);
}
/*
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index f99ec38a4a..aded4d9708 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -68,6 +68,7 @@
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/shmem.h"
#include "utils/guc_hooks.h"
@@ -155,13 +156,13 @@ typedef enum
SLRU_WRITE_FAILED,
SLRU_FSYNC_FAILED,
SLRU_CLOSE_FAILED,
+ SLRU_DATA_CORRUPTED,
} SlruErrorCause;
static SlruErrorCause slru_errcause;
static int slru_errno;
-static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
static bool SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno);
@@ -181,7 +182,7 @@ static inline void SlruRecentlyUsed(SlruShared shared, int slotno);
*/
Size
-SimpleLruShmemSize(int nslots, int nlsns)
+SimpleLruShmemSize(int nslots)
{
int nbanks = nslots / SLRU_BANK_SIZE;
Size sz;
@@ -200,9 +201,6 @@ SimpleLruShmemSize(int nslots, int nlsns)
sz += MAXALIGN(nbanks * sizeof(LWLockPadded)); /* bank_locks[] */
sz += MAXALIGN(nbanks * sizeof(int)); /* bank_cur_lru_count[] */
- if (nlsns > 0)
- sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr)); /* group_lsn[] */
-
return BUFFERALIGN(sz) + BLCKSZ * nslots;
}
@@ -227,7 +225,6 @@ SimpleLruAutotuneBuffers(int divisor, int max)
* ctl: address of local (unshared) control structure.
* name: name of SLRU. (This is user-visible, pick with care!)
* nslots: number of page slots to use.
- * nlsns: number of LSN groups per page (set to zero if not relevant).
* ctllock: LWLock to use to control access to the shared control structure.
* subdir: PGDATA-relative subdirectory that will contain the files.
* buffer_tranche_id: tranche ID to use for the SLRU's per-buffer LWLocks.
@@ -235,7 +232,7 @@ SimpleLruAutotuneBuffers(int divisor, int max)
* sync_handler: which set of functions to use to handle sync requests
*/
void
-SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
+SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
const char *subdir, int buffer_tranche_id, int bank_tranche_id,
SyncRequestHandler sync_handler, bool long_segment_names)
{
@@ -246,7 +243,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
Assert(nslots <= SLRU_MAX_ALLOWED_BUFFERS);
shared = (SlruShared) ShmemInitStruct(name,
- SimpleLruShmemSize(nslots, nlsns),
+ SimpleLruShmemSize(nslots),
&found);
if (!IsUnderPostmaster)
@@ -260,7 +257,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
memset(shared, 0, sizeof(SlruSharedData));
shared->num_slots = nslots;
- shared->lsn_groups_per_page = nlsns;
pg_atomic_init_u64(&shared->latest_page_number, 0);
@@ -287,12 +283,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
shared->bank_cur_lru_count = (int *) (ptr + offset);
offset += MAXALIGN(nbanks * sizeof(int));
- if (nlsns > 0)
- {
- shared->group_lsn = (XLogRecPtr *) (ptr + offset);
- offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
- }
-
ptr += BUFFERALIGN(offset);
for (int slotno = 0; slotno < nslots; slotno++)
{
@@ -314,7 +304,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
}
/* Should fit to estimated shmem size */
- Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
+ Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots));
}
else
{
@@ -378,11 +368,8 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
shared->page_dirty[slotno] = true;
SlruRecentlyUsed(shared, slotno);
- /* Set the buffer to zeroes */
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
-
- /* Set the LSNs for this new page to zero */
- SimpleLruZeroLSNs(ctl, slotno);
+ /* Initialize the page. */
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
/*
* Assume this page is now the latest active page.
@@ -400,26 +387,6 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
return slotno;
}
-/*
- * Zero all the LSNs we store for this slru page.
- *
- * This should be called each time we create a new page, and each time we read
- * in a page from disk into an existing buffer. (Such an old page cannot
- * have any interesting LSNs, since we'd have flushed them before writing
- * the page in the first place.)
- *
- * This assumes that InvalidXLogRecPtr is bitwise-all-0.
- */
-static void
-SimpleLruZeroLSNs(SlruCtl ctl, int slotno)
-{
- SlruShared shared = ctl->shared;
-
- if (shared->lsn_groups_per_page > 0)
- MemSet(&shared->group_lsn[slotno * shared->lsn_groups_per_page], 0,
- shared->lsn_groups_per_page * sizeof(XLogRecPtr));
-}
-
/*
* Wait for any active I/O on a page slot to finish. (This does not
* guarantee that new I/O hasn't been started before we return, though.
@@ -546,9 +513,6 @@ SimpleLruReadPage(SlruCtl ctl, int64 pageno, bool write_ok,
/* Do the read */
ok = SlruPhysicalReadPage(ctl, pageno, slotno);
- /* Set the LSNs for this newly read-in page to zero */
- SimpleLruZeroLSNs(ctl, slotno);
-
/* Re-acquire bank control lock and update page state */
LWLockAcquire(banklock, LW_EXCLUSIVE);
@@ -815,7 +779,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
ereport(LOG,
(errmsg("file \"%s\" doesn't exist, reading as zeroes",
path)));
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
return true;
}
@@ -838,6 +802,13 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
return false;
}
+ if (!PageIsVerifiedExtended(shared->page_buffer[slotno], pageno, PIV_REPORT_STAT))
+ {
+ slru_errcause = SLRU_DATA_CORRUPTED;
+ slru_errno = 0;
+ return false;
+ }
+
return true;
}
@@ -864,6 +835,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
off_t offset = rpageno * BLCKSZ;
char path[MAXPGPATH];
int fd = -1;
+ Page page = shared->page_buffer[slotno];
+ XLogRecPtr lsn;
/* update the stats counter of written pages */
pgstat_count_slru_page_written(shared->slru_stats_idx);
@@ -873,40 +846,18 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
* write out data before associated WAL records. This is the same action
* performed during FlushBuffer() in the main buffer manager.
*/
- if (shared->group_lsn != NULL)
+ lsn = PageGetLSN(page);
+ if (!XLogRecPtrIsInvalid(lsn))
{
/*
- * We must determine the largest async-commit LSN for the page. This
- * is a bit tedious, but since this entire function is a slow path
- * anyway, it seems better to do this here than to maintain a per-page
- * LSN variable (which'd need an extra comparison in the
- * transaction-commit path).
+ * As noted above, elog(ERROR) is not acceptable here, so if
+ * XLogFlush were to fail, we must PANIC. This isn't much of a
+ * restriction because XLogFlush is just about all critical
+ * section anyway, but let's make sure.
*/
- XLogRecPtr max_lsn;
- int lsnindex;
-
- lsnindex = slotno * shared->lsn_groups_per_page;
- max_lsn = shared->group_lsn[lsnindex++];
- for (int lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
- {
- XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
-
- if (max_lsn < this_lsn)
- max_lsn = this_lsn;
- }
-
- if (!XLogRecPtrIsInvalid(max_lsn))
- {
- /*
- * As noted above, elog(ERROR) is not acceptable here, so if
- * XLogFlush were to fail, we must PANIC. This isn't much of a
- * restriction because XLogFlush is just about all critical
- * section anyway, but let's make sure.
- */
- START_CRIT_SECTION();
- XLogFlush(max_lsn);
- END_CRIT_SECTION();
- }
+ START_CRIT_SECTION();
+ XLogFlush(lsn);
+ END_CRIT_SECTION();
}
/*
@@ -971,6 +922,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
}
}
+ PageSetChecksumInplace(shared->page_buffer[slotno], pageno);
+
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
@@ -1091,6 +1044,13 @@ SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
errdetail("Could not close file \"%s\": %m.",
path)));
break;
+ case SLRU_DATA_CORRUPTED:
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("Invalid page from file \"%s\" at offset %d.",
+ path, offset)));
+ break;
default:
/* can't get here, we trust */
elog(ERROR, "unrecognized SimpleLru error cause: %d",
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 50bb1d8cfc..5549cac3a6 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -34,6 +34,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "utils/guc_hooks.h"
+#include "storage/bufpage.h"
#include "utils/snapmgr.h"
@@ -51,7 +52,7 @@
*/
/* We need four bytes per xact */
-#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId))
+#define SUBTRANS_XACTS_PER_PAGE (SizeOfPageContents / sizeof(TransactionId))
/*
* Although we return an int64 the actual value can't currently exceed
@@ -97,7 +98,7 @@ SubTransSetParent(TransactionId xid, TransactionId parent)
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
/*
@@ -137,7 +138,7 @@ SubTransGetParent(TransactionId xid)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
parent = *ptr;
@@ -213,7 +214,7 @@ SUBTRANSShmemBuffers(void)
Size
SUBTRANSShmemSize(void)
{
- return SimpleLruShmemSize(SUBTRANSShmemBuffers(), 0);
+ return SimpleLruShmemSize(SUBTRANSShmemBuffers());
}
void
@@ -241,7 +242,7 @@ SUBTRANSShmemInit(void)
Assert(subtransaction_buffers != 0);
SubTransCtl->PagePrecedes = SubTransPagePrecedes;
- SimpleLruInit(SubTransCtl, "subtransaction", SUBTRANSShmemBuffers(), 0,
+ SimpleLruInit(SubTransCtl, "subtransaction", SUBTRANSShmemBuffers(),
"pg_subtrans", LWTRANCHE_SUBTRANS_BUFFER,
LWTRANCHE_SUBTRANS_SLRU, SYNC_HANDLER_NONE, false);
SlruPagePrecedesUnitTests(SubTransCtl, SUBTRANS_XACTS_PER_PAGE);
@@ -366,7 +367,6 @@ CheckPointSUBTRANS(void)
TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that SUBTRANS has room for a newly-allocated XID.
*
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index d0891e3f0e..a400c54c03 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -140,6 +140,7 @@
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/procsignal.h"
@@ -160,7 +161,7 @@
* than that, so changes in that data structure won't affect user-visible
* restrictions.
*/
-#define NOTIFY_PAYLOAD_MAX_LENGTH (BLCKSZ - NAMEDATALEN - 128)
+#define NOTIFY_PAYLOAD_MAX_LENGTH (SizeOfPageContents - NAMEDATALEN - 128)
/*
* Struct representing an entry in the global notify queue
@@ -309,7 +310,7 @@ static SlruCtlData NotifyCtlData;
#define NotifyCtl (&NotifyCtlData)
#define QUEUE_PAGESIZE BLCKSZ
-
+#define QUEUE_PAGE_CAPACITY (QUEUE_PAGESIZE - MAXALIGN(SizeOfPageHeaderData))
#define QUEUE_FULL_WARN_INTERVAL 5000 /* warn at most once every 5s */
/*
@@ -490,7 +491,7 @@ AsyncShmemSize(void)
size = mul_size(MaxBackends, sizeof(QueueBackendStatus));
size = add_size(size, offsetof(AsyncQueueControl, backend));
- size = add_size(size, SimpleLruShmemSize(notify_buffers, 0));
+ size = add_size(size, SimpleLruShmemSize(notify_buffers));
return size;
}
@@ -535,7 +536,7 @@ AsyncShmemInit(void)
* names are used in order to avoid wraparound.
*/
NotifyCtl->PagePrecedes = asyncQueuePagePrecedes;
- SimpleLruInit(NotifyCtl, "notify", notify_buffers, 0,
+ SimpleLruInit(NotifyCtl, "notify", notify_buffers,
"pg_notify", LWTRANCHE_NOTIFY_BUFFER, LWTRANCHE_NOTIFY_SLRU,
SYNC_HANDLER_NONE, true);
@@ -1295,14 +1296,14 @@ asyncQueueAdvance(volatile QueuePosition *position, int entryLength)
* written or read.
*/
offset += entryLength;
- Assert(offset <= QUEUE_PAGESIZE);
+ Assert(offset <= QUEUE_PAGE_CAPACITY);
/*
* In a second step check if another entry can possibly be written to the
* page. If so, stay here, we have reached the next position. If not, then
* we need to move on to the next page.
*/
- if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGESIZE)
+ if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGE_CAPACITY)
{
pageno++;
offset = 0;
@@ -1405,7 +1406,7 @@ asyncQueueAddEntries(ListCell *nextNotify)
offset = QUEUE_POS_OFFSET(queue_head);
/* Check whether the entry really fits on the current page */
- if (offset + qe.length <= QUEUE_PAGESIZE)
+ if (offset + qe.length <= QUEUE_PAGE_CAPACITY)
{
/* OK, so advance nextNotify past this item */
nextNotify = lnext(pendingNotifies->events, nextNotify);
@@ -1417,14 +1418,14 @@ asyncQueueAddEntries(ListCell *nextNotify)
* only check dboid and since it won't match any reader's database
* OID, they will ignore this entry and move on.
*/
- qe.length = QUEUE_PAGESIZE - offset;
+ qe.length = QUEUE_PAGE_CAPACITY - offset;
qe.dboid = InvalidOid;
qe.data[0] = '\0'; /* empty channel */
qe.data[1] = '\0'; /* empty payload */
}
/* Now copy qe into the shared buffer page */
- memcpy(NotifyCtl->shared->page_buffer[slotno] + offset,
+ memcpy(PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + offset,
&qe,
qe.length);
@@ -1955,10 +1956,10 @@ asyncQueueReadAllNotifications(void)
else
{
/* fetch all the rest of the page */
- copysize = QUEUE_PAGESIZE - curoffset;
+ copysize = QUEUE_PAGE_CAPACITY - curoffset;
}
- memcpy(page_buffer.buf + curoffset,
- NotifyCtl->shared->page_buffer[slotno] + curoffset,
+ memcpy(PageGetContents(page_buffer.buf) + curoffset,
+ PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + curoffset,
copysize);
/* Release lock that we got from SimpleLruReadPage_ReadOnly() */
LWLockRelease(SimpleLruGetBankLock(NotifyCtl, curpage));
@@ -2029,7 +2030,7 @@ asyncQueueProcessPageEntries(volatile QueuePosition *current,
if (QUEUE_POS_EQUAL(thisentry, stop))
break;
- qe = (AsyncQueueEntry *) (page_buffer + QUEUE_POS_OFFSET(thisentry));
+ qe = (AsyncQueueEntry *) (PageGetContents(page_buffer) + QUEUE_POS_OFFSET(thisentry));
/*
* Advance *current over this message, possibly to the next page. As
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index 3f378c0099..c26ae8ddd2 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -207,6 +207,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "port/pg_lfind.h"
+#include "storage/bufpage.h"
#include "storage/predicate.h"
#include "storage/predicate_internals.h"
#include "storage/proc.h"
@@ -326,8 +327,8 @@ static SlruCtlData SerialSlruCtlData;
#define SerialSlruCtl (&SerialSlruCtlData)
#define SERIAL_PAGESIZE BLCKSZ
-#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
-#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE)
+#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
+#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE - MAXALIGN(SizeOfPageHeaderData) / SERIAL_ENTRYSIZE)
/*
* Set maximum pages based on the number needed to track all transactions.
@@ -337,7 +338,7 @@ static SlruCtlData SerialSlruCtlData;
#define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1)
#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \
- (SerialSlruCtl->shared->page_buffer[slotno] + \
+ (PageGetContents(SerialSlruCtl->shared->page_buffer[slotno]) + \
((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE))))
#define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE)
@@ -789,10 +790,13 @@ SerialPagePrecedesLogicallyUnitTests(void)
* requires burning ~2B XIDs in single-user mode, a negligible
* possibility. Moreover, if it does happen, the consequence would be
* mild, namely a new transaction failing in SimpleLruReadPage().
+ *
+ * NOTE: After adding the page header, the defect affects two pages.
+ * We now assert correct treatment of its second to prior page.
*/
headPage = oldestPage;
targetPage = newestPage;
- Assert(SerialPagePrecedesLogically(headPage, targetPage - 1));
+ Assert(SerialPagePrecedesLogically(headPage, targetPage - 2));
#if 0
Assert(SerialPagePrecedesLogically(headPage, targetPage));
#endif
@@ -812,7 +816,7 @@ SerialInit(void)
*/
SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
SimpleLruInit(SerialSlruCtl, "serializable",
- serializable_buffers, 0, "pg_serial",
+ serializable_buffers, "pg_serial",
LWTRANCHE_SERIAL_BUFFER, LWTRANCHE_SERIAL_SLRU,
SYNC_HANDLER_NONE, false);
#ifdef USE_ASSERT_CHECKING
@@ -1377,7 +1381,7 @@ PredicateLockShmemSize(void)
/* Shared memory structures for SLRU tracking of old committed xids. */
size = add_size(size, sizeof(SerialControlData));
- size = add_size(size, SimpleLruShmemSize(serializable_buffers, 0));
+ size = add_size(size, SimpleLruShmemSize(serializable_buffers));
return size;
}
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index be6f1f62d2..e8193d7f56 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -59,6 +59,31 @@ PageInit(Page page, Size pageSize, Size specialSize)
/* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */
}
+/*
+ * PageInitSLRU
+ * Initializes the contents of an SLRU page.
+ * Note that we don't calculate an initial checksum here; that's not done
+ * until it's time to write.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
/*
* PageIsVerifiedExtended
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 9e6fd435f6..06f14f1d2d 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -16,6 +16,7 @@
#include <dirent.h>
#include <limits.h>
+#include <stdbool.h>
#include <time.h>
#include <sys/stat.h>
#include <unistd.h>
@@ -593,12 +594,20 @@ main(int argc, char *argv[])
{
total_size = scan_directory(DataDir, "global", true);
total_size += scan_directory(DataDir, "base", true);
+ total_size += scan_directory(DataDir, "pg_commit_ts", true);
+ total_size += scan_directory(DataDir, "pg_multixact", true);
+ total_size += scan_directory(DataDir, "pg_serial", true);
total_size += scan_directory(DataDir, "pg_tblspc", true);
+ total_size += scan_directory(DataDir, "pg_xact", true);
}
(void) scan_directory(DataDir, "global", false);
(void) scan_directory(DataDir, "base", false);
+ (void) scan_directory(DataDir, "pg_commit_ts", false);
+ (void) scan_directory(DataDir, "pg_multixact", false);
+ (void) scan_directory(DataDir, "pg_serial", false);
(void) scan_directory(DataDir, "pg_tblspc", false);
+ (void) scan_directory(DataDir, "pg_xact", false);
if (showprogress)
progress_report(true);
diff --git a/src/bin/pg_resetwal/t/001_basic.pl b/src/bin/pg_resetwal/t/001_basic.pl
index 9829e48106..7b9e034e19 100644
--- a/src/bin/pg_resetwal/t/001_basic.pl
+++ b/src/bin/pg_resetwal/t/001_basic.pl
@@ -206,7 +206,7 @@ push @cmd,
sprintf("%d,%d", hex($files[0]) == 0 ? 3 : hex($files[0]), hex($files[-1]));
@files = get_slru_files('pg_multixact/offsets');
-$mult = 32 * $blcksz / 4;
+$mult = 32 * ($blcksz - 24) / 4;
# -m argument is "new,old"
push @cmd, '-m',
sprintf("%d,%d",
@@ -214,11 +214,11 @@ push @cmd, '-m',
hex($files[0]) == 0 ? 1 : hex($files[0] * $mult));
@files = get_slru_files('pg_multixact/members');
-$mult = 32 * int($blcksz / 20) * 4;
+$mult = 32 * int(($blcksz - 24) / 20) * 4;
push @cmd, '-O', (hex($files[-1]) + 1) * $mult;
@files = get_slru_files('pg_xact');
-$mult = 32 * $blcksz * 4;
+$mult = 32 * ($blcksz - 24) * 4;
push @cmd,
'-u', (hex($files[0]) == 0 ? 3 : hex($files[0]) * $mult),
'-x', ((hex($files[-1]) + 1) * $mult);
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index beba376f2e..6978410f17 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -9,6 +9,7 @@
#include "postgres_fe.h"
+#include <dirent.h>
#include <sys/stat.h>
#include <limits.h>
#include <fcntl.h>
@@ -453,3 +454,174 @@ check_hard_link(void)
unlink(new_link_file);
}
+
+
+/*
+ * Copy SLRU_PAGES_PER_SEGMENT from access/slru.h to avoid including it.
+ */
+#define SLRU_PAGES_PER_SEGMENT 32
+
+#define SEGMENT_SIZE (BLCKSZ * SLRU_PAGES_PER_SEGMENT)
+
+/*
+ * Copy PageInitSLRU from storage/bufpage.c to avoid linking to the backend.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
+
+/*
+ * Filter function for scandir(3) to select only segment files.
+ */
+static int
+segment_file_filter(const struct dirent *dirent)
+{
+ return strspn(dirent->d_name, "0123456789ABCDEF") == strlen(dirent->d_name);
+}
+
+static void
+upgrade_file(const char *src_dir, const char *src_file, const char *dst_dir)
+{
+ char src[MAXPGPATH];
+ char dst[MAXPGPATH];
+
+ int seg_name_len;
+ int src_segno;
+ int64 src_pageno;
+ int dst_segno;
+ int64 dst_pageno;
+ int dst_offset;
+
+ int src_fd;
+ int dst_fd;
+
+ char *src_buf;
+ ssize_t src_len;
+ ssize_t src_buf_offset;
+ PGAlignedBlock dst_block;
+ Page page = dst_block.data;
+ int len_to_copy;
+
+ seg_name_len = strlen(src_file);
+ src_segno = (int) strtol(src_file, NULL, 16);
+ src_pageno = src_segno * SLRU_PAGES_PER_SEGMENT;
+
+ dst_pageno = src_pageno * BLCKSZ / SizeOfPageContents;
+ dst_offset = src_pageno * BLCKSZ - dst_pageno * SizeOfPageContents;
+ dst_segno = dst_pageno / SLRU_PAGES_PER_SEGMENT;
+
+ snprintf(src, sizeof(src), "%s/%s", src_dir, src_file);
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+
+ src_buf = pg_malloc(SEGMENT_SIZE);
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) == -1)
+ pg_fatal("could not open file \"%s\": %s", src, strerror(errno));
+ if ((src_len = read(src_fd, src_buf, SEGMENT_SIZE)) == -1)
+ pg_fatal("could not read file \"%s\": %s", src, strerror(errno));
+
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+
+ /*
+ * Read the destination page at dst_pageno into the buffer. The page may contain
+ * data from the previous source segment. Initialize the page if the page is new.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+ if (read(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not read file \"%s\": %s", dst, strerror(errno));
+ if (PageIsNew(page))
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Rewind the file position, so the first write will overwrite the page.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+
+ src_buf_offset = 0;
+ while (src_buf_offset < src_len)
+ {
+ len_to_copy = Min(src_len - src_buf_offset, SizeOfPageContents - dst_offset);
+ memcpy(PageGetContents(page) + dst_offset, src_buf + src_buf_offset, len_to_copy);
+ src_buf_offset += len_to_copy;
+
+ if (new_cluster.controldata.data_checksum_version > 0)
+ ((PageHeader) page)->pd_checksum = pg_checksum_page(page, dst_pageno);
+ if (write(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not write file \"%s\": %s", dst, strerror(errno));
+
+ dst_pageno++;
+ dst_offset = 0;
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Switch segments if we reached the end of the current segment.
+ */
+ if (dst_pageno % SLRU_PAGES_PER_SEGMENT == 0)
+ {
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ dst_segno++;
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+ }
+ }
+
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ pg_free(src_buf);
+ close(src_fd);
+}
+
+void
+upgrade_xact_cache(const char *src_subdir, const char *dst_subdir)
+{
+ char src_dir[MAXPGPATH];
+ char dst_dir[MAXPGPATH];
+
+ DIR *src_dirp;
+ struct dirent *src_dirent;
+
+ snprintf(src_dir, sizeof(src_dir), "%s/%s", old_cluster.pgdata, src_subdir);
+ snprintf(dst_dir, sizeof(dst_dir), "%s/%s", new_cluster.pgdata, dst_subdir);
+
+ if ((src_dirp = opendir(src_dir)) == NULL)
+ pg_fatal("could not open directory \"%s\": %s", src_dir, strerror(errno));
+
+ while (errno = 0, (src_dirent = readdir(src_dirp)) != NULL)
+ {
+ if (segment_file_filter(src_dirent))
+ upgrade_file(src_dir, src_dirent->d_name, dst_dir);
+ }
+
+ if (closedir(src_dirp) != 0)
+ pg_fatal("could not close directory \"%s\": %s", src_dir, strerror(errno));
+}
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 10c94a6c1f..ed4a0f5c43 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -660,14 +660,23 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir)
static void
copy_xact_xlog_xid(void)
{
+ bool slru_header_changed = false;
+
/*
* Copy old commit logs to new data dir. pg_clog has been renamed to
* pg_xact in post-10 clusters.
*/
- copy_subdir_files(GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact",
- GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact");
+ char *xact_old_dir = GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+ char *xact_new_dir = GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+
+ if (new_cluster.controldata.cat_ver >= SLRU_PAGE_HEADER_CAT_VER &&
+ old_cluster.controldata.cat_ver < SLRU_PAGE_HEADER_CAT_VER)
+ slru_header_changed = true;
+
+ if (slru_header_changed)
+ upgrade_xact_cache(xact_old_dir, xact_new_dir);
+ else
+ copy_subdir_files(xact_old_dir, xact_new_dir);
prep_status("Setting oldest XID for new cluster");
exec_prog(UTILITY_LOG_FILE, NULL, true, true,
@@ -702,7 +711,8 @@ copy_xact_xlog_xid(void)
* server doesn't attempt to read multis older than the cutoff value.
*/
if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
- new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
+ !slru_header_changed)
{
copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
copy_subdir_files("pg_multixact/members", "pg_multixact/members");
@@ -722,7 +732,8 @@ copy_xact_xlog_xid(void)
new_cluster.pgdata);
check_ok();
}
- else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER ||
+ slru_header_changed)
{
/*
* Remove offsets/0000 file created by initdb that no longer matches
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 857d715049..dc81020b62 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -115,6 +115,11 @@ extern char *output_files[];
*/
#define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
+/*
+ * A page header was added to each SLRU page in 17.0.
+ */
+#define SLRU_PAGE_HEADER_CAT_VER 202403061
+
/*
* large object chunk size added to pg_controldata,
* commit 5f93c37805e7485488480916b4585e098d3cc883
@@ -412,6 +417,7 @@ void rewriteVisibilityMap(const char *fromfile, const char *tofile,
void check_file_clone(void);
void check_copy_file_range(void);
void check_hard_link(void);
+void upgrade_xact_cache(const char *src_subdir, const char *dst_subdir);
/* fopen_priv() is no longer different from fopen() */
#define fopen_priv(path, mode) fopen(path, mode)
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 8a8d191873..5d5bb13cb0 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -96,17 +96,6 @@ typedef struct SlruSharedData
*/
int *bank_cur_lru_count;
- /*
- * Optional array of WAL flush LSNs associated with entries in the SLRU
- * pages. If not zero/NULL, we must flush WAL before writing pages (true
- * for pg_xact, false for everything else). group_lsn[] has
- * lsn_groups_per_page entries per buffer slot, each containing the
- * highest LSN known for a contiguous group of SLRU entries on that slot's
- * page.
- */
- XLogRecPtr *group_lsn;
- int lsn_groups_per_page;
-
/*
* latest_page_number is the page number of the current end of the log;
* this is not critical data, since we use it only to avoid swapping out
@@ -184,9 +173,9 @@ SimpleLruGetBankLock(SlruCtl ctl, int64 pageno)
return &(ctl->shared->bank_locks[bankno].lock);
}
-extern Size SimpleLruShmemSize(int nslots, int nlsns);
+extern Size SimpleLruShmemSize(int nslots);
extern int SimpleLruAutotuneBuffers(int divisor, int max);
-extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
+extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots,
const char *subdir, int buffer_tranche_id,
int bank_tranche_id, SyncRequestHandler sync_handler,
bool long_segment_names);
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index bedc2a0d72..854fb01281 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202403052
+#define CATALOG_VERSION_NO 202403061
#endif
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index d0df02d39c..2dc83451a6 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -201,6 +201,7 @@ typedef PageHeaderData *PageHeader;
* handling pages.
*/
#define PG_PAGE_LAYOUT_VERSION 4
+#define PG_SLRU_PAGE_LAYOUT_VERSION 1
#define PG_DATA_CHECKSUM_VERSION 1
/* ----------------------------------------------------------------
@@ -257,6 +258,11 @@ PageGetContents(Page page)
return (char *) page + MAXALIGN(SizeOfPageHeaderData);
}
+/*
+ * Space available for storing page contents.
+ */
+#define SizeOfPageContents (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
/* ----------------
* functions to access page size info
* ----------------
@@ -486,6 +492,7 @@ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
"BLCKSZ has to be a multiple of sizeof(size_t)");
extern void PageInit(Page page, Size pageSize, Size specialSize);
+extern void PageInitSLRU(Page page, Size pageSize, Size specialSize);
extern bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags);
extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
OffsetNumber offsetNumber, int flags);
diff --git a/src/test/modules/test_slru/test_slru.c b/src/test/modules/test_slru/test_slru.c
index 068a21f125..026aa79912 100644
--- a/src/test/modules/test_slru/test_slru.c
+++ b/src/test/modules/test_slru/test_slru.c
@@ -17,6 +17,7 @@
#include "access/slru.h"
#include "access/transam.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
@@ -72,8 +73,8 @@ test_slru_page_write(PG_FUNCTION_ARGS)
TestSlruCtl->shared->page_status[slotno] = SLRU_PAGE_VALID;
/* write given data to the page, up to the limit of the page */
- strncpy(TestSlruCtl->shared->page_buffer[slotno], data,
- BLCKSZ - 1);
+ strncpy(PageGetContents(TestSlruCtl->shared->page_buffer[slotno]), data,
+ SizeOfPageContents - 1);
SimpleLruWritePage(TestSlruCtl, slotno);
LWLockRelease(lock);
@@ -101,7 +102,7 @@ test_slru_page_read(PG_FUNCTION_ARGS)
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(TestSlruCtl, pageno,
write_ok, InvalidTransactionId);
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(lock);
PG_RETURN_TEXT_P(cstring_to_text(data));
@@ -120,7 +121,7 @@ test_slru_page_readonly(PG_FUNCTION_ARGS)
pageno,
InvalidTransactionId);
Assert(LWLockHeldByMe(lock));
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(lock);
PG_RETURN_TEXT_P(cstring_to_text(data));
@@ -201,7 +202,7 @@ test_slru_shmem_request(void)
prev_shmem_request_hook();
/* reserve shared memory for the test SLRU */
- RequestAddinShmemSpace(SimpleLruShmemSize(NUM_TEST_BUFFERS, 0));
+ RequestAddinShmemSpace(SimpleLruShmemSize(NUM_TEST_BUFFERS));
}
static bool
@@ -240,7 +241,7 @@ test_slru_shmem_startup(void)
TestSlruCtl->PagePrecedes = test_slru_page_precedes_logically;
SimpleLruInit(TestSlruCtl, "TestSLRU",
- NUM_TEST_BUFFERS, 0, slru_dir_name,
+ NUM_TEST_BUFFERS, slru_dir_name,
test_buffer_tranche_id, test_tranche_id, SYNC_HANDLER_NONE,
long_segment_names);
}
Hello,
I suppose this is important to do if we ever want to move SLRUs into
shared buffers. However, I wonder about the extra time this adds to
pg_upgrade. Is this something we should be concerned about? Is there
any measurement/estimates to tell us how long this would be? Right now,
if you use a cloning strategy for the data files, the upgrade should be
pretty quick ... but the amount of data in pg_xact and pg_multixact
could be massive, and the rewrite is likely to take considerable time.
--
Álvaro Herrera PostgreSQL Developer — https://www.EnterpriseDB.com/
"Las cosas son buenas o malas segun las hace nuestra opinión" (Lisias)
Greetings,
* Alvaro Herrera (alvherre@alvh.no-ip.org) wrote:
I suppose this is important to do if we ever want to move SLRUs into
shared buffers. However, I wonder about the extra time this adds to
pg_upgrade. Is this something we should be concerned about? Is there
any measurement/estimates to tell us how long this would be? Right now,
if you use a cloning strategy for the data files, the upgrade should be
pretty quick ... but the amount of data in pg_xact and pg_multixact
could be massive, and the rewrite is likely to take considerable time.
While I definitely agree that there should be some consideration of
this concern, it feels on-par with the visibility-map rewrite which was
done previously. Larger systems will likely have more to deal with than
smaller systems, but it's still a relatively small portion of the data
overall.
The benefit of this change, beyond just the possibility of moving them
into shared buffers some day in the future, is that this would mean that
SLRUs will have checksums (if the cluster has them enabled). That
benefit strikes me as well worth the cost of the rewrite taking some
time and the minor loss of space due to the page header.
Would it be useful to consider parallelizing this work? There's already
parts of pg_upgrade which can be parallelized and so this isn't,
hopefully, a big lift to add, but I'm not sure if there's enough work
being done here CPU-wise, compared to the amount of IO being done, to
have it make sense to run it in parallel. Might be worth looking into
though, at least, as disks have gotten to be quite fast.
Thanks!
Stephen
On Mar 7, 2024, at 03:09, Stephen Frost <sfrost@snowman.net> wrote:
External Email
From: Stephen Frost <sfrost@snowman.net>
Subject: Re: Proposal to add page headers to SLRU pages
Date: March 7, 2024 at 03:09:59 GMT+8
To: Alvaro Herrera <alvherre@alvh.no-ip.org>
Cc: "Li, Yong" <yoli@ebay.com>, Aleksander Alekseev <aleksander@timescale.com>, PostgreSQL Hackers <pgsql-hackers@lists.postgresql.org>, "Bagga, Rishu" <bagrishu@amazon.com>, Robert Haas <robertmhaas@gmail.com>, "Debnath, Shawn" <sdn@ebay.com>, Andrey Borodin <x4mmm@yandex-team.ru>, "Shyrabokau, Anton" <antons@ebay.com>Greetings,
* Alvaro Herrera (alvherre@alvh.no-ip.org) wrote:
I suppose this is important to do if we ever want to move SLRUs into
shared buffers. However, I wonder about the extra time this adds to
pg_upgrade. Is this something we should be concerned about? Is there
any measurement/estimates to tell us how long this would be? Right now,
if you use a cloning strategy for the data files, the upgrade should be
pretty quick ... but the amount of data in pg_xact and pg_multixact
could be massive, and the rewrite is likely to take considerable time.While I definitely agree that there should be some consideration of
this concern, it feels on-par with the visibility-map rewrite which was
done previously. Larger systems will likely have more to deal with than
smaller systems, but it's still a relatively small portion of the data
overall.The benefit of this change, beyond just the possibility of moving them
into shared buffers some day in the future, is that this would mean that
SLRUs will have checksums (if the cluster has them enabled). That
benefit strikes me as well worth the cost of the rewrite taking some
time and the minor loss of space due to the page header.Would it be useful to consider parallelizing this work? There's already
parts of pg_upgrade which can be parallelized and so this isn't,
hopefully, a big lift to add, but I'm not sure if there's enough work
being done here CPU-wise, compared to the amount of IO being done, to
have it make sense to run it in parallel. Might be worth looking into
though, at least, as disks have gotten to be quite fast.Thanks!
Stephen
Thank Alvaro and Stephen for your thoughtful comments.
I did a quick benchmark regarding pg_upgrade time, and here are the results.
Hardware spec:
MacBook Pro M1 Max - 10 cores, 64GB memory, 1TB Apple SSD
Operating system:
macOS 14.3.1
Complier:
Apple clang 15.0.0
Compiler optimization level: -O2
====
PG setups:
Old cluster: PG 16.2 release (source build)
New cluster: PG Git HEAD plus the patch (source build)
====
Benchmark steps:
1. Initdb for PG 16.2.
2. Initdb for PG HEAD.
3. Run pg_upgrade on the above empty database, and time the overall wall clock time.
4. In the old cluster, write 512MB all-zero dummy segment files (2048 segments) under pg_xact.
5. In the old cluster, write 512MB all-zero dummy segment files under pg_multixact/members.
6. In the old cluster, write 512MB all-zero dummy segment files under pg_multixact/offsets.
7. Purge the OS page cache.
7. Run pg_upgrade again, and time the overall wall clock time.
====
Test result:
On the empty database, pg_upgrade took 4.8 seconds to complete.
With 1.5GB combined SLRU data to convert, pg_upgrade took 11.5 seconds to complete.
It took 6.7 seconds to convert 1.5GB SLRU files for pg_upgrade.
====
For clog, 2048 segments can host about 2 billion transactions, right at the limit for wraparound.
That’s the maximum we can have. 2048 segments are also big for pg_multixact SLRUs.
Therefore, on a modern hardware, in the worst case, pg_upgrade will run for 7 seconds longer.
Regards,
Yong
Greetings,
* Li, Yong (yoli@ebay.com) wrote:
On Mar 7, 2024, at 03:09, Stephen Frost <sfrost@snowman.net> wrote:
* Alvaro Herrera (alvherre@alvh.no-ip.org) wrote:I suppose this is important to do if we ever want to move SLRUs into
shared buffers. However, I wonder about the extra time this adds to
pg_upgrade. Is this something we should be concerned about? Is there
any measurement/estimates to tell us how long this would be? Right now,
if you use a cloning strategy for the data files, the upgrade should be
pretty quick ... but the amount of data in pg_xact and pg_multixact
could be massive, and the rewrite is likely to take considerable time.While I definitely agree that there should be some consideration of
this concern, it feels on-par with the visibility-map rewrite which was
done previously. Larger systems will likely have more to deal with than
smaller systems, but it's still a relatively small portion of the data
overall.The benefit of this change, beyond just the possibility of moving them
into shared buffers some day in the future, is that this would mean that
SLRUs will have checksums (if the cluster has them enabled). That
benefit strikes me as well worth the cost of the rewrite taking some
time and the minor loss of space due to the page header.Would it be useful to consider parallelizing this work? There's already
parts of pg_upgrade which can be parallelized and so this isn't,
hopefully, a big lift to add, but I'm not sure if there's enough work
being done here CPU-wise, compared to the amount of IO being done, to
have it make sense to run it in parallel. Might be worth looking into
though, at least, as disks have gotten to be quite fast.Thank Alvaro and Stephen for your thoughtful comments.
I did a quick benchmark regarding pg_upgrade time, and here are the results.
For clog, 2048 segments can host about 2 billion transactions, right at the limit for wraparound.
That’s the maximum we can have. 2048 segments are also big for pg_multixact SLRUs.Therefore, on a modern hardware, in the worst case, pg_upgrade will run for 7 seconds longer.
Thanks for testing! That strikes me as perfectly reasonable and seems
unlikely that we'd get much benefit from parallelizing it, so I'd say it
makes sense to keep this code simple.
Thanks again!
Stephen
On 2024-Mar-08, Stephen Frost wrote:
Thanks for testing! That strikes me as perfectly reasonable and seems
unlikely that we'd get much benefit from parallelizing it, so I'd say it
makes sense to keep this code simple.
Okay, agreed, that amount of time sounds reasonable to me too; but I
don't want to be responsible for this at least for pg17. If some other
committer wants to take it, be my guest. However, I think this is
mostly a foundation for building other things on top, so committing
during the last commitfest is perhaps not very useful.
Another aspect of this patch is the removal of the LSN groups. There's
an explanation of the LSN groups in src/backend/access/transam/README,
and while this patch removes the LSN group feature, it doesn't update
that text. That's already a problem which needs fixed, but the text
says
: In fact, we store more than one LSN for each clog page. This relates to
: the way we set transaction status hint bits during visibility tests.
: We must not set a transaction-committed hint bit on a relation page and
: have that record make it to disk prior to the WAL record of the commit.
: Since visibility tests are normally made while holding buffer share locks,
: we do not have the option of changing the page's LSN to guarantee WAL
: synchronization. Instead, we defer the setting of the hint bit if we have
: not yet flushed WAL as far as the LSN associated with the transaction.
: This requires tracking the LSN of each unflushed async commit.
: It is convenient to associate this data with clog buffers: because we
: will flush WAL before writing a clog page, we know that we do not need
: to remember a transaction's LSN longer than the clog page holding its
: commit status remains in memory. However, the naive approach of storing
: an LSN for each clog position is unattractive: the LSNs are 32x bigger
: than the two-bit commit status fields, and so we'd need 256K of
: additional shared memory for each 8K clog buffer page. We choose
: instead to store a smaller number of LSNs per page, where each LSN is
: the highest LSN associated with any transaction commit in a contiguous
: range of transaction IDs on that page. This saves storage at the price
: of some possibly-unnecessary delay in setting transaction hint bits.
In the new code we effectively store only one LSN per page, which I
understand is strictly worse. Maybe the idea of doing away with these
LSN groups should be reconsidered ... unless I completely misunderstand
the whole thing.
--
Álvaro Herrera PostgreSQL Developer — https://www.EnterpriseDB.com/
On Fri, 2024-03-08 at 13:58 +0100, Alvaro Herrera wrote:
In the new code we effectively store only one LSN per page, which I
understand is strictly worse.
To quote from the README:
"Instead, we defer the setting of the hint bit if we have not yet
flushed WAL as far as the LSN associated with the transaction. This
requires tracking the LSN of each unflushed async commit."
In other words, the problem the group_lsns are solving is that we can't
set the hint bit on a tuple until the commit record for that
transaction has actually been flushed. For ordinary sync commit, that's
fine, because the CLOG bit isn't set until after the commit record is
flushed. But for async commit, the CLOG may be updated before the WAL
is flushed, and group_lsns are one way to track the right information
to hold off updating the hint bits.
"It is convenient to associate this data with clog buffers: because we
will flush WAL before writing a clog page, we know that we do not need
to remember a transaction's LSN longer than the clog page holding its
commit status remains in memory."
It's not clear to me that it is so convenient, if it's preventing the
SLRU from fitting in with the rest of the system architecture.
"The worst case is where a sync-commit xact shares a cached LSN with an
async-commit xact that commits a bit later; even though we paid to sync
the first xact to disk, we won't be able to hint its outputs until the
second xact is sync'd, up to three walwriter cycles later."
Perhaps we can revisit alternatives to the group_lsn? If we accept
Yong's proposal, and the SLRU has a normal LSN and was used in the
normal way, we would just need some kind of secondary structure to hold
a mapping from XID->LSN only for async transactions.
The characteristics we are looking for in this secondary structure are:
1. cheap to test if it's empty, so it doesn't interfere with a purely
sync workload at all
2. expire old entries (where the LSN has already been flushed)
cheaply enough so the data structure doesn't bloat
3. look up an LSN given an XID cheaply enough that it doesn't
interfere with setting hint bits
Making a better secondary structure seems doable to me. Just to
brainstorm:
* Have an open-addressed hash table mapping async XIDs to their
commit LSN. If you have a hash collision, opportunistically see if the
entry is old and can be removed. Try K probes, and if they are all
recent, then you need to XLogFlush. The table could get pretty big,
because it needs to hold enough async transactions for a wal writer
cycle or two, but it seems reasonable to make async workloads pay that
memory cost.
* Another idea, if the size of the structure is a problem, is to
group K async xids into a bloom filter that points at a single LSN.
When more transactions come along, create another bloom filter for the
next K async xids. This could interfere with setting hint bits for sync
xids if the bloom filters are undersized, but that doesn't seem like a
big problem.
Regards,
Jeff Davis
On Fri, 2024-03-08 at 12:39 -0800, Jeff Davis wrote:
Making a better secondary structure seems doable to me. Just to
brainstorm:
We can also keep the group_lsns, and then just update both the CLOG
page LSN and the group_lsn when setting the transaction status. The
former would be used for all of the normal WAL-related stuff, and the
latter would be used by TransactionIdGetStatus() to return the more
precise LSN for that group.
Regards,
Jeff Davis
On Wed, 2024-03-06 at 12:01 +0000, Li, Yong wrote:
Rebase the patch against the latest HEAD.
The upgrade logic could use more comments explaining what's going on
and why. As I understand it, it's a one-time conversion that needs to
happen between 16 and 17. Is that right?
Was the way CLOG is upgraded already decided in some earlier
discussion?
Given that the CLOG is append-only and gets truncated occasionally, I
wonder whether we can just have some marker that xids before some
number are the old CLOG, and xids beyond that number are in the new
CLOG. I'm not necessarily suggesting that; just an idea.
Regards,
Jeff Davis
On Mar 9, 2024, at 05:22, Jeff Davis <pgsql@j-davis.com> wrote:
External Email
On Wed, 2024-03-06 at 12:01 +0000, Li, Yong wrote:
Rebase the patch against the latest HEAD.
The upgrade logic could use more comments explaining what's going on
and why. As I understand it, it's a one-time conversion that needs to
happen between 16 and 17. Is that right?Regards,
Jeff Davis
In the new code we effectively store only one LSN per page, which I
understand is strictly worse. Maybe the idea of doing away with these
LSN groups should be reconsidered ... unless I completely misunderstand
the whole thing.--
Álvaro Herrera PostgreSQL Developer —
Thanks for the comments on LSN groups and pg_upgrade.
I have updated the patch to address both comments:
- The clog LSN group has been brought back.
Now the page LSN on each clog page is used for honoring the write-ahead rule
and it is always the highest LSN of all the LSN groups on the page.
The LSN groups are used by TransactionIdGetStatus() as before.
- New comments have been added to pg_upgrade to mention the SLRU
page header change as the reason for upgrading clog files.
Regards,
Yong
Attachments:
slru_page_header_v6.patchapplication/octet-stream; name=slru_page_header_v6.patchDownload
src/backend/access/transam/clog.c | 45 ++++++---
src/backend/access/transam/commit_ts.c | 22 ++--
src/backend/access/transam/multixact.c | 55 ++++++----
src/backend/access/transam/slru.c | 68 ++++++-------
src/backend/access/transam/subtrans.c | 8 +-
src/backend/commands/async.c | 23 +++--
src/backend/storage/lmgr/predicate.c | 12 ++-
src/backend/storage/page/bufpage.c | 25 +++++
src/bin/pg_checksums/pg_checksums.c | 9 ++
src/bin/pg_resetwal/t/001_basic.pl | 6 +-
src/bin/pg_upgrade/file.c | 178 +++++++++++++++++++++++++++++++++
src/bin/pg_upgrade/pg_upgrade.c | 28 ++++--
src/bin/pg_upgrade/pg_upgrade.h | 6 ++
src/include/catalog/catversion.h | 2 +-
src/include/storage/bufpage.h | 7 ++
src/test/modules/test_slru/test_slru.c | 9 +-
16 files changed, 395 insertions(+), 108 deletions(-)
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 44c253246b..86348e242b 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -42,6 +42,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/proc.h"
#include "storage/sync.h"
#include "utils/guc_hooks.h"
@@ -61,7 +62,7 @@
/* We need two bits per xact, so four xacts fit in a byte */
#define CLOG_BITS_PER_XACT 2
#define CLOG_XACTS_PER_BYTE 4
-#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+#define CLOG_XACTS_PER_PAGE (SizeOfPageContents * CLOG_XACTS_PER_BYTE)
#define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1)
/*
@@ -90,7 +91,13 @@ TransactionIdToPage(TransactionId xid)
/* We store the latest async LSN for each group of transactions */
#define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */
-#define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)
+
+/*
+ * Use BLCKSZ instead of SizeOfPageContents so that CLOG_LSNS_PER_PAGE is
+ * a power of 2. Using BLCKSZ wastes the last 4 LSN groups per page, but
+ * this is acceptable given that each page has 1,024 LSN groups.
+ */
+#define CLOG_LSNS_PER_PAGE ((BLCKSZ * CLOG_XACTS_PER_BYTE) / CLOG_XACTS_PER_LSN_GROUP)
#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \
((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
@@ -112,7 +119,7 @@ static SlruCtlData XactCtlData;
static int ZeroCLOGPage(int64 pageno, bool writeXlog);
static bool CLOGPagePrecedes(int64 page1, int64 page2);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact,
Oid oldestXactDb);
static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
@@ -665,13 +672,15 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
char *byteptr;
char byteval;
char curval;
+ Page page;
Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(xid));
Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(XactCtl,
XactCtl->shared->page_number[slotno]),
LW_EXCLUSIVE));
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ page = XactCtl->shared->page_buffer[slotno];
+ byteptr = PageGetContents(page) + byteno;
curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
/*
@@ -700,7 +709,7 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*byteptr = byteval;
/*
- * Update the group LSN if the transaction completion LSN is higher.
+ * Update the page & group LSN if the transaction completion LSN is higher.
*
* Note: lsn will be invalid when supplied during InRecovery processing,
* so we don't need to do anything special to avoid LSN updates during
@@ -709,10 +718,13 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*/
if (!XLogRecPtrIsInvalid(lsn))
{
- int lsnindex = GetLSNIndex(slotno, xid);
+ int lsnindex = GetLSNIndex(slotno, xid);
if (XactCtl->shared->group_lsn[lsnindex] < lsn)
XactCtl->shared->group_lsn[lsnindex] = lsn;
+
+ if (PageGetLSN(page) < lsn)
+ PageSetLSN(page, lsn);
}
}
@@ -739,13 +751,15 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
int slotno;
int lsnindex;
+ Page page;
char *byteptr;
XidStatus status;
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ page = XactCtl->shared->page_buffer[slotno];
+ byteptr = PageGetContents(page) + byteno;
status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
@@ -860,11 +874,17 @@ static int
ZeroCLOGPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(XactCtl, pageno);
+ page = XactCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -917,12 +937,12 @@ TrimCLOG(void)
char *byteptr;
slotno = SimpleLruReadPage(XactCtl, pageno, false, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ byteptr = PageGetContents(XactCtl->shared->page_buffer[slotno]) + byteno;
/* Zero so-far-unused positions in the current byte */
*byteptr &= (1 << bshift) - 1;
/* Zero the rest of the page */
- MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
+ MemSet(byteptr + 1, 0, SizeOfPageContents - byteno - 1);
XactCtl->shared->page_dirty[slotno] = true;
}
@@ -946,7 +966,6 @@ CheckPointCLOG(void)
TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that CLOG has room for a newly-allocated XID.
*
@@ -1070,12 +1089,12 @@ CLOGPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
+ return XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index f221494687..99f9fc5076 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -29,6 +29,7 @@
#include "access/xlogutils.h"
#include "funcapi.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/shmem.h"
#include "utils/fmgrprotos.h"
#include "utils/guc_hooks.h"
@@ -61,7 +62,7 @@ typedef struct CommitTimestampEntry
sizeof(RepOriginId))
#define COMMIT_TS_XACTS_PER_PAGE \
- (BLCKSZ / SizeOfCommitTimestampEntry)
+ (SizeOfPageContents / SizeOfCommitTimestampEntry)
/*
@@ -118,7 +119,7 @@ static int ZeroCommitTsPage(int64 pageno, bool writeXlog);
static bool CommitTsPagePrecedes(int64 page1, int64 page2);
static void ActivateCommitTs(void);
static void DeactivateCommitTs(void);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid);
/*
@@ -253,11 +254,12 @@ TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
CommitTimestampEntry entry;
Assert(TransactionIdIsNormal(xid));
+ Assert(xid == slotno * COMMIT_TS_XACTS_PER_PAGE + entryno);
entry.time = ts;
entry.nodeid = nodeid;
- memcpy(CommitTsCtl->shared->page_buffer[slotno] +
+ memcpy(PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
SizeOfCommitTimestampEntry * entryno,
&entry, SizeOfCommitTimestampEntry);
}
@@ -336,7 +338,7 @@ TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts,
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid);
memcpy(&entry,
- CommitTsCtl->shared->page_buffer[slotno] +
+ PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
SizeOfCommitTimestampEntry * entryno,
SizeOfCommitTimestampEntry);
@@ -615,11 +617,17 @@ static int
ZeroCommitTsPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(CommitTsCtl, pageno);
+ page = CommitTsCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -985,12 +993,12 @@ CommitTsPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
+ return XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 83b578dced..b6f5428327 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -83,6 +83,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "postmaster/autovacuum.h"
+#include "storage/bufpage.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "storage/procarray.h"
@@ -105,7 +106,7 @@
*/
/* We need four bytes per offset */
-#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
+#define MULTIXACT_OFFSETS_PER_PAGE (SizeOfPageContents / sizeof(MultiXactOffset))
#define MultiXactIdToOffsetPage(xid) \
((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
@@ -118,8 +119,8 @@
* additional flag bits for each TransactionId. To do this without getting
* into alignment issues, we store four bytes of flags, and then the
* corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
- * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
- * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
+ * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 408 groups
+ * per page. This wastes 8 bytes per page, but that's OK -- simplicity (and
* performance) trumps space efficiency here.
*
* Note that the "offset" macros work with byte offset, not array indexes, so
@@ -137,7 +138,7 @@
/* size in bytes of a complete group */
#define MULTIXACT_MEMBERGROUP_SIZE \
(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
-#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (SizeOfPageContents / MULTIXACT_MEMBERGROUP_SIZE)
#define MULTIXACT_MEMBERS_PER_PAGE \
(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
@@ -364,7 +365,7 @@ static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
MultiXactOffset start, uint32 distance);
static bool SetOffsetVacuumLimit(bool is_startup);
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
-static void WriteMZeroPageXlogRec(int64 pageno, uint8 info);
+static XLogRecPtr WriteMZeroPageXlogRec(int64 pageno, uint8 info);
static void WriteMTruncateXlogRec(Oid oldestMultiDB,
MultiXactId startTruncOff,
MultiXactId endTruncOff,
@@ -885,7 +886,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
* take the trouble to generalize the slru.c error reporting code.
*/
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
*offptr = offset;
@@ -934,12 +935,12 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
}
memberptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
*memberptr = members[i].xid;
flagsptr = (uint32 *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
flagsval = *flagsptr;
flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
@@ -1364,7 +1365,7 @@ retry:
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
@@ -1413,7 +1414,7 @@ retry:
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
}
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
nextMXOffset = *offptr;
@@ -1470,7 +1471,7 @@ retry:
}
xactptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
if (!TransactionIdIsValid(*xactptr))
{
@@ -1481,7 +1482,7 @@ retry:
flagsoff = MXOffsetToFlagsOffset(offset);
bshift = MXOffsetToFlagsBitShift(offset);
- flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ flagsptr = (uint32 *) (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
ptr[truelength].xid = *xactptr;
ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
@@ -1999,11 +2000,17 @@ static int
ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
+ page = MultiXactOffsetCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -2015,11 +2022,17 @@ static int
ZeroMultiXactMemberPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
+ page = MultiXactMemberCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -2143,10 +2156,10 @@ TrimMultiXact(void)
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
- MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
+ MemSet(offptr, 0, SizeOfPageContents - (entryno * sizeof(MultiXactOffset)));
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
LWLockRelease(lock);
@@ -2177,9 +2190,9 @@ TrimMultiXact(void)
memberoff = MXOffsetToMemberOffset(offset);
slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
xidptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
- MemSet(xidptr, 0, BLCKSZ - memberoff);
+ MemSet(xidptr, 0, SizeOfPageContents - memberoff);
/*
* Note: we don't need to zero out the flag bits in the remaining
@@ -2834,7 +2847,7 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
LWLockRelease(SimpleLruGetBankLock(MultiXactOffsetCtl, pageno));
@@ -3268,12 +3281,12 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
* Write an xlog record reflecting the zeroing of either a MEMBERs or
* OFFSETs page (info shows which)
*/
-static void
+static XLogRecPtr
WriteMZeroPageXlogRec(int64 pageno, uint8 info)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_MULTIXACT_ID, info);
+ return XLogInsert(RM_MULTIXACT_ID, info);
}
/*
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index f99ec38a4a..189b776cef 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -68,6 +68,7 @@
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/shmem.h"
#include "utils/guc_hooks.h"
@@ -155,6 +156,7 @@ typedef enum
SLRU_WRITE_FAILED,
SLRU_FSYNC_FAILED,
SLRU_CLOSE_FAILED,
+ SLRU_DATA_CORRUPTED,
} SlruErrorCause;
static SlruErrorCause slru_errcause;
@@ -378,8 +380,8 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
shared->page_dirty[slotno] = true;
SlruRecentlyUsed(shared, slotno);
- /* Set the buffer to zeroes */
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ /* Initialize the page. */
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
/* Set the LSNs for this new page to zero */
SimpleLruZeroLSNs(ctl, slotno);
@@ -815,7 +817,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
ereport(LOG,
(errmsg("file \"%s\" doesn't exist, reading as zeroes",
path)));
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
return true;
}
@@ -838,6 +840,13 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
return false;
}
+ if (!PageIsVerifiedExtended(shared->page_buffer[slotno], pageno, PIV_REPORT_STAT))
+ {
+ slru_errcause = SLRU_DATA_CORRUPTED;
+ slru_errno = 0;
+ return false;
+ }
+
return true;
}
@@ -864,6 +873,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
off_t offset = rpageno * BLCKSZ;
char path[MAXPGPATH];
int fd = -1;
+ Page page = shared->page_buffer[slotno];
+ XLogRecPtr lsn;
/* update the stats counter of written pages */
pgstat_count_slru_page_written(shared->slru_stats_idx);
@@ -872,41 +883,21 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
* Honor the write-WAL-before-data rule, if appropriate, so that we do not
* write out data before associated WAL records. This is the same action
* performed during FlushBuffer() in the main buffer manager.
+ *
+ * The largest async-commit LSN for the page is maintained through page LSN.
*/
- if (shared->group_lsn != NULL)
+ lsn = PageGetLSN(page);
+ if (!XLogRecPtrIsInvalid(lsn))
{
/*
- * We must determine the largest async-commit LSN for the page. This
- * is a bit tedious, but since this entire function is a slow path
- * anyway, it seems better to do this here than to maintain a per-page
- * LSN variable (which'd need an extra comparison in the
- * transaction-commit path).
+ * As noted above, elog(ERROR) is not acceptable here, so if
+ * XLogFlush were to fail, we must PANIC. This isn't much of a
+ * restriction because XLogFlush is just about all critical
+ * section anyway, but let's make sure.
*/
- XLogRecPtr max_lsn;
- int lsnindex;
-
- lsnindex = slotno * shared->lsn_groups_per_page;
- max_lsn = shared->group_lsn[lsnindex++];
- for (int lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
- {
- XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
-
- if (max_lsn < this_lsn)
- max_lsn = this_lsn;
- }
-
- if (!XLogRecPtrIsInvalid(max_lsn))
- {
- /*
- * As noted above, elog(ERROR) is not acceptable here, so if
- * XLogFlush were to fail, we must PANIC. This isn't much of a
- * restriction because XLogFlush is just about all critical
- * section anyway, but let's make sure.
- */
- START_CRIT_SECTION();
- XLogFlush(max_lsn);
- END_CRIT_SECTION();
- }
+ START_CRIT_SECTION();
+ XLogFlush(lsn);
+ END_CRIT_SECTION();
}
/*
@@ -971,6 +962,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
}
}
+ PageSetChecksumInplace(shared->page_buffer[slotno], pageno);
+
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
@@ -1091,6 +1084,13 @@ SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
errdetail("Could not close file \"%s\": %m.",
path)));
break;
+ case SLRU_DATA_CORRUPTED:
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("Invalid page from file \"%s\" at offset %d.",
+ path, offset)));
+ break;
default:
/* can't get here, we trust */
elog(ERROR, "unrecognized SimpleLru error cause: %d",
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 50bb1d8cfc..869ae7a25d 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -34,6 +34,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "utils/guc_hooks.h"
+#include "storage/bufpage.h"
#include "utils/snapmgr.h"
@@ -51,7 +52,7 @@
*/
/* We need four bytes per xact */
-#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId))
+#define SUBTRANS_XACTS_PER_PAGE (SizeOfPageContents / sizeof(TransactionId))
/*
* Although we return an int64 the actual value can't currently exceed
@@ -97,7 +98,7 @@ SubTransSetParent(TransactionId xid, TransactionId parent)
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
/*
@@ -137,7 +138,7 @@ SubTransGetParent(TransactionId xid)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
parent = *ptr;
@@ -366,7 +367,6 @@ CheckPointSUBTRANS(void)
TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that SUBTRANS has room for a newly-allocated XID.
*
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index d0891e3f0e..a4cb773f73 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -140,6 +140,7 @@
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/procsignal.h"
@@ -160,7 +161,7 @@
* than that, so changes in that data structure won't affect user-visible
* restrictions.
*/
-#define NOTIFY_PAYLOAD_MAX_LENGTH (BLCKSZ - NAMEDATALEN - 128)
+#define NOTIFY_PAYLOAD_MAX_LENGTH (SizeOfPageContents - NAMEDATALEN - 128)
/*
* Struct representing an entry in the global notify queue
@@ -309,7 +310,7 @@ static SlruCtlData NotifyCtlData;
#define NotifyCtl (&NotifyCtlData)
#define QUEUE_PAGESIZE BLCKSZ
-
+#define QUEUE_PAGE_CAPACITY (QUEUE_PAGESIZE - MAXALIGN(SizeOfPageHeaderData))
#define QUEUE_FULL_WARN_INTERVAL 5000 /* warn at most once every 5s */
/*
@@ -1295,14 +1296,14 @@ asyncQueueAdvance(volatile QueuePosition *position, int entryLength)
* written or read.
*/
offset += entryLength;
- Assert(offset <= QUEUE_PAGESIZE);
+ Assert(offset <= QUEUE_PAGE_CAPACITY);
/*
* In a second step check if another entry can possibly be written to the
* page. If so, stay here, we have reached the next position. If not, then
* we need to move on to the next page.
*/
- if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGESIZE)
+ if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGE_CAPACITY)
{
pageno++;
offset = 0;
@@ -1405,7 +1406,7 @@ asyncQueueAddEntries(ListCell *nextNotify)
offset = QUEUE_POS_OFFSET(queue_head);
/* Check whether the entry really fits on the current page */
- if (offset + qe.length <= QUEUE_PAGESIZE)
+ if (offset + qe.length <= QUEUE_PAGE_CAPACITY)
{
/* OK, so advance nextNotify past this item */
nextNotify = lnext(pendingNotifies->events, nextNotify);
@@ -1417,14 +1418,14 @@ asyncQueueAddEntries(ListCell *nextNotify)
* only check dboid and since it won't match any reader's database
* OID, they will ignore this entry and move on.
*/
- qe.length = QUEUE_PAGESIZE - offset;
+ qe.length = QUEUE_PAGE_CAPACITY - offset;
qe.dboid = InvalidOid;
qe.data[0] = '\0'; /* empty channel */
qe.data[1] = '\0'; /* empty payload */
}
/* Now copy qe into the shared buffer page */
- memcpy(NotifyCtl->shared->page_buffer[slotno] + offset,
+ memcpy(PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + offset,
&qe,
qe.length);
@@ -1955,10 +1956,10 @@ asyncQueueReadAllNotifications(void)
else
{
/* fetch all the rest of the page */
- copysize = QUEUE_PAGESIZE - curoffset;
+ copysize = QUEUE_PAGE_CAPACITY - curoffset;
}
- memcpy(page_buffer.buf + curoffset,
- NotifyCtl->shared->page_buffer[slotno] + curoffset,
+ memcpy(PageGetContents(page_buffer.buf) + curoffset,
+ PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + curoffset,
copysize);
/* Release lock that we got from SimpleLruReadPage_ReadOnly() */
LWLockRelease(SimpleLruGetBankLock(NotifyCtl, curpage));
@@ -2029,7 +2030,7 @@ asyncQueueProcessPageEntries(volatile QueuePosition *current,
if (QUEUE_POS_EQUAL(thisentry, stop))
break;
- qe = (AsyncQueueEntry *) (page_buffer + QUEUE_POS_OFFSET(thisentry));
+ qe = (AsyncQueueEntry *) (PageGetContents(page_buffer) + QUEUE_POS_OFFSET(thisentry));
/*
* Advance *current over this message, possibly to the next page. As
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index 3f378c0099..1cc664eee3 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -207,6 +207,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "port/pg_lfind.h"
+#include "storage/bufpage.h"
#include "storage/predicate.h"
#include "storage/predicate_internals.h"
#include "storage/proc.h"
@@ -326,8 +327,8 @@ static SlruCtlData SerialSlruCtlData;
#define SerialSlruCtl (&SerialSlruCtlData)
#define SERIAL_PAGESIZE BLCKSZ
-#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
-#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE)
+#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
+#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE - MAXALIGN(SizeOfPageHeaderData) / SERIAL_ENTRYSIZE)
/*
* Set maximum pages based on the number needed to track all transactions.
@@ -337,7 +338,7 @@ static SlruCtlData SerialSlruCtlData;
#define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1)
#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \
- (SerialSlruCtl->shared->page_buffer[slotno] + \
+ (PageGetContents(SerialSlruCtl->shared->page_buffer[slotno]) + \
((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE))))
#define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE)
@@ -789,10 +790,13 @@ SerialPagePrecedesLogicallyUnitTests(void)
* requires burning ~2B XIDs in single-user mode, a negligible
* possibility. Moreover, if it does happen, the consequence would be
* mild, namely a new transaction failing in SimpleLruReadPage().
+ *
+ * NOTE: After adding the page header, the defect affects two pages.
+ * We now assert correct treatment of its second to prior page.
*/
headPage = oldestPage;
targetPage = newestPage;
- Assert(SerialPagePrecedesLogically(headPage, targetPage - 1));
+ Assert(SerialPagePrecedesLogically(headPage, targetPage - 2));
#if 0
Assert(SerialPagePrecedesLogically(headPage, targetPage));
#endif
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index be6f1f62d2..e8193d7f56 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -59,6 +59,31 @@ PageInit(Page page, Size pageSize, Size specialSize)
/* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */
}
+/*
+ * PageInitSLRU
+ * Initializes the contents of an SLRU page.
+ * Note that we don't calculate an initial checksum here; that's not done
+ * until it's time to write.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
/*
* PageIsVerifiedExtended
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 9e6fd435f6..06f14f1d2d 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -16,6 +16,7 @@
#include <dirent.h>
#include <limits.h>
+#include <stdbool.h>
#include <time.h>
#include <sys/stat.h>
#include <unistd.h>
@@ -593,12 +594,20 @@ main(int argc, char *argv[])
{
total_size = scan_directory(DataDir, "global", true);
total_size += scan_directory(DataDir, "base", true);
+ total_size += scan_directory(DataDir, "pg_commit_ts", true);
+ total_size += scan_directory(DataDir, "pg_multixact", true);
+ total_size += scan_directory(DataDir, "pg_serial", true);
total_size += scan_directory(DataDir, "pg_tblspc", true);
+ total_size += scan_directory(DataDir, "pg_xact", true);
}
(void) scan_directory(DataDir, "global", false);
(void) scan_directory(DataDir, "base", false);
+ (void) scan_directory(DataDir, "pg_commit_ts", false);
+ (void) scan_directory(DataDir, "pg_multixact", false);
+ (void) scan_directory(DataDir, "pg_serial", false);
(void) scan_directory(DataDir, "pg_tblspc", false);
+ (void) scan_directory(DataDir, "pg_xact", false);
if (showprogress)
progress_report(true);
diff --git a/src/bin/pg_resetwal/t/001_basic.pl b/src/bin/pg_resetwal/t/001_basic.pl
index 9829e48106..7b9e034e19 100644
--- a/src/bin/pg_resetwal/t/001_basic.pl
+++ b/src/bin/pg_resetwal/t/001_basic.pl
@@ -206,7 +206,7 @@ push @cmd,
sprintf("%d,%d", hex($files[0]) == 0 ? 3 : hex($files[0]), hex($files[-1]));
@files = get_slru_files('pg_multixact/offsets');
-$mult = 32 * $blcksz / 4;
+$mult = 32 * ($blcksz - 24) / 4;
# -m argument is "new,old"
push @cmd, '-m',
sprintf("%d,%d",
@@ -214,11 +214,11 @@ push @cmd, '-m',
hex($files[0]) == 0 ? 1 : hex($files[0] * $mult));
@files = get_slru_files('pg_multixact/members');
-$mult = 32 * int($blcksz / 20) * 4;
+$mult = 32 * int(($blcksz - 24) / 20) * 4;
push @cmd, '-O', (hex($files[-1]) + 1) * $mult;
@files = get_slru_files('pg_xact');
-$mult = 32 * $blcksz * 4;
+$mult = 32 * ($blcksz - 24) * 4;
push @cmd,
'-u', (hex($files[0]) == 0 ? 3 : hex($files[0]) * $mult),
'-x', ((hex($files[-1]) + 1) * $mult);
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index beba376f2e..663525f364 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -9,6 +9,7 @@
#include "postgres_fe.h"
+#include <dirent.h>
#include <sys/stat.h>
#include <limits.h>
#include <fcntl.h>
@@ -453,3 +454,180 @@ check_hard_link(void)
unlink(new_link_file);
}
+
+
+/*
+ * Copy SLRU_PAGES_PER_SEGMENT from access/slru.h to avoid including it.
+ */
+#define SLRU_PAGES_PER_SEGMENT 32
+
+#define SEGMENT_SIZE (BLCKSZ * SLRU_PAGES_PER_SEGMENT)
+
+/*
+ * Copy PageInitSLRU from storage/bufpage.c to avoid linking to the backend.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
+
+/*
+ * Filter function for scandir(3) to select only segment files.
+ */
+static int
+segment_file_filter(const struct dirent *dirent)
+{
+ return strspn(dirent->d_name, "0123456789ABCDEF") == strlen(dirent->d_name);
+}
+
+/*
+ * Upgrade a single clog segment to add a page header on each page.
+ */
+static void
+upgrade_file(const char *src_dir, const char *src_file, const char *dst_dir)
+{
+ char src[MAXPGPATH];
+ char dst[MAXPGPATH];
+
+ int seg_name_len;
+ int src_segno;
+ int64 src_pageno;
+ int dst_segno;
+ int64 dst_pageno;
+ int dst_offset;
+
+ int src_fd;
+ int dst_fd;
+
+ char *src_buf;
+ ssize_t src_len;
+ ssize_t src_buf_offset;
+ PGAlignedBlock dst_block;
+ Page page = dst_block.data;
+ int len_to_copy;
+
+ seg_name_len = strlen(src_file);
+ src_segno = (int) strtol(src_file, NULL, 16);
+ src_pageno = src_segno * SLRU_PAGES_PER_SEGMENT;
+
+ dst_pageno = src_pageno * BLCKSZ / SizeOfPageContents;
+ dst_offset = src_pageno * BLCKSZ - dst_pageno * SizeOfPageContents;
+ dst_segno = dst_pageno / SLRU_PAGES_PER_SEGMENT;
+
+ snprintf(src, sizeof(src), "%s/%s", src_dir, src_file);
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+
+ src_buf = pg_malloc(SEGMENT_SIZE);
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) == -1)
+ pg_fatal("could not open file \"%s\": %s", src, strerror(errno));
+ if ((src_len = read(src_fd, src_buf, SEGMENT_SIZE)) == -1)
+ pg_fatal("could not read file \"%s\": %s", src, strerror(errno));
+
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+
+ /*
+ * Read the destination page at dst_pageno into the buffer. The page may contain
+ * data from the previous source segment. Initialize the page if the page is new.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+ if (read(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not read file \"%s\": %s", dst, strerror(errno));
+ if (PageIsNew(page))
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Rewind the file position, so the first write will overwrite the page.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+
+ src_buf_offset = 0;
+ while (src_buf_offset < src_len)
+ {
+ len_to_copy = Min(src_len - src_buf_offset, SizeOfPageContents - dst_offset);
+ memcpy(PageGetContents(page) + dst_offset, src_buf + src_buf_offset, len_to_copy);
+ src_buf_offset += len_to_copy;
+
+ if (new_cluster.controldata.data_checksum_version > 0)
+ ((PageHeader) page)->pd_checksum = pg_checksum_page(page, dst_pageno);
+ if (write(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not write file \"%s\": %s", dst, strerror(errno));
+
+ dst_pageno++;
+ dst_offset = 0;
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Switch segments if we reached the end of the current segment.
+ */
+ if (dst_pageno % SLRU_PAGES_PER_SEGMENT == 0)
+ {
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ dst_segno++;
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+ }
+ }
+
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ pg_free(src_buf);
+ close(src_fd);
+}
+
+/*
+ * Upgrade the clog files to add a page header to each SLRU page.
+ */
+void
+upgrade_xact_cache(const char *src_subdir, const char *dst_subdir)
+{
+ char src_dir[MAXPGPATH];
+ char dst_dir[MAXPGPATH];
+
+ DIR *src_dirp;
+ struct dirent *src_dirent;
+
+ snprintf(src_dir, sizeof(src_dir), "%s/%s", old_cluster.pgdata, src_subdir);
+ snprintf(dst_dir, sizeof(dst_dir), "%s/%s", new_cluster.pgdata, dst_subdir);
+
+ if ((src_dirp = opendir(src_dir)) == NULL)
+ pg_fatal("could not open directory \"%s\": %s", src_dir, strerror(errno));
+
+ while (errno = 0, (src_dirent = readdir(src_dirp)) != NULL)
+ {
+ if (segment_file_filter(src_dirent))
+ upgrade_file(src_dir, src_dirent->d_name, dst_dir);
+ }
+
+ if (closedir(src_dirp) != 0)
+ pg_fatal("could not close directory \"%s\": %s", src_dir, strerror(errno));
+}
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index bb261353bd..7472c44f8d 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -674,14 +674,28 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir)
static void
copy_xact_xlog_xid(void)
{
+ bool slru_header_changed = false;
+
/*
* Copy old commit logs to new data dir. pg_clog has been renamed to
* pg_xact in post-10 clusters.
*/
- copy_subdir_files(GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact",
- GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact");
+ char *xact_old_dir = GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+ char *xact_new_dir = GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+
+ /*
+ * In post-17 clusters, a page header is added to each SLRU page.
+ * Perform a one-time conversion of the clog files if the old
+ * cluster and the new cluster use different SLRU formats.
+ */
+ if (new_cluster.controldata.cat_ver >= SLRU_PAGE_HEADER_CAT_VER &&
+ old_cluster.controldata.cat_ver < SLRU_PAGE_HEADER_CAT_VER)
+ slru_header_changed = true;
+
+ if (slru_header_changed)
+ upgrade_xact_cache(xact_old_dir, xact_new_dir);
+ else
+ copy_subdir_files(xact_old_dir, xact_new_dir);
prep_status("Setting oldest XID for new cluster");
exec_prog(UTILITY_LOG_FILE, NULL, true, true,
@@ -716,7 +730,8 @@ copy_xact_xlog_xid(void)
* server doesn't attempt to read multis older than the cutoff value.
*/
if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
- new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
+ !slru_header_changed)
{
copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
copy_subdir_files("pg_multixact/members", "pg_multixact/members");
@@ -736,7 +751,8 @@ copy_xact_xlog_xid(void)
new_cluster.pgdata);
check_ok();
}
- else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER ||
+ slru_header_changed)
{
/*
* Remove offsets/0000 file created by initdb that no longer matches
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index c0bfb002d2..989f428e97 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -115,6 +115,11 @@ extern char *output_files[];
*/
#define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
+/*
+ * A page header was added to each SLRU page in 17.0.
+ */
+#define SLRU_PAGE_HEADER_CAT_VER 202403111
+
/*
* large object chunk size added to pg_controldata,
* commit 5f93c37805e7485488480916b4585e098d3cc883
@@ -412,6 +417,7 @@ void rewriteVisibilityMap(const char *fromfile, const char *tofile,
void check_file_clone(void);
void check_copy_file_range(void);
void check_hard_link(void);
+void upgrade_xact_cache(const char *src_subdir, const char *dst_subdir);
/* fopen_priv() is no longer different from fopen() */
#define fopen_priv(path, mode) fopen(path, mode)
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index f085221155..0c8349f50e 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202403091
+#define CATALOG_VERSION_NO 202403111
#endif
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index d0df02d39c..2dc83451a6 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -201,6 +201,7 @@ typedef PageHeaderData *PageHeader;
* handling pages.
*/
#define PG_PAGE_LAYOUT_VERSION 4
+#define PG_SLRU_PAGE_LAYOUT_VERSION 1
#define PG_DATA_CHECKSUM_VERSION 1
/* ----------------------------------------------------------------
@@ -257,6 +258,11 @@ PageGetContents(Page page)
return (char *) page + MAXALIGN(SizeOfPageHeaderData);
}
+/*
+ * Space available for storing page contents.
+ */
+#define SizeOfPageContents (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
/* ----------------
* functions to access page size info
* ----------------
@@ -486,6 +492,7 @@ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
"BLCKSZ has to be a multiple of sizeof(size_t)");
extern void PageInit(Page page, Size pageSize, Size specialSize);
+extern void PageInitSLRU(Page page, Size pageSize, Size specialSize);
extern bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags);
extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
OffsetNumber offsetNumber, int flags);
diff --git a/src/test/modules/test_slru/test_slru.c b/src/test/modules/test_slru/test_slru.c
index 068a21f125..06cf7656f7 100644
--- a/src/test/modules/test_slru/test_slru.c
+++ b/src/test/modules/test_slru/test_slru.c
@@ -17,6 +17,7 @@
#include "access/slru.h"
#include "access/transam.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
@@ -72,8 +73,8 @@ test_slru_page_write(PG_FUNCTION_ARGS)
TestSlruCtl->shared->page_status[slotno] = SLRU_PAGE_VALID;
/* write given data to the page, up to the limit of the page */
- strncpy(TestSlruCtl->shared->page_buffer[slotno], data,
- BLCKSZ - 1);
+ strncpy(PageGetContents(TestSlruCtl->shared->page_buffer[slotno]), data,
+ SizeOfPageContents - 1);
SimpleLruWritePage(TestSlruCtl, slotno);
LWLockRelease(lock);
@@ -101,7 +102,7 @@ test_slru_page_read(PG_FUNCTION_ARGS)
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(TestSlruCtl, pageno,
write_ok, InvalidTransactionId);
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(lock);
PG_RETURN_TEXT_P(cstring_to_text(data));
@@ -120,7 +121,7 @@ test_slru_page_readonly(PG_FUNCTION_ARGS)
pageno,
InvalidTransactionId);
Assert(LWLockHeldByMe(lock));
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(lock);
PG_RETURN_TEXT_P(cstring_to_text(data));
On Mon, 2024-03-11 at 10:01 +0000, Li, Yong wrote:
- The clog LSN group has been brought back.
Now the page LSN on each clog page is used for honoring the write-
ahead rule
and it is always the highest LSN of all the LSN groups on the page.
The LSN groups are used by TransactionIdGetStatus() as before.
I like where this is going.
Álvaro, do you still see a problem with this approach?
- New comments have been added to pg_upgrade to mention the SLRU
page header change as the reason for upgrading clog files.
That seems reasonable, but were any alternatives discussed? Do we have
consensus that this is the right thing to do?
And if we use this approach, is there extra validation or testing that
can be done?
Regards,
Jeff Davis
- New comments have been added to pg_upgrade to mention the SLRU
page header change as the reason for upgrading clog files.That seems reasonable, but were any alternatives discussed? Do we have
consensus that this is the right thing to do?
In general, there are two approaches. Either we convert the existing clog files,
or we don’t. The patch chooses to convert.
If we don’t, then the clog file code must be able to handle both formats. For,
XIDs in the range where the clog is written in the old format, segment and offset
computation must be done in one way, and for XIDs in a different range, it must
be computed in a different way. To avoid changing the format in the middle of a
page, which must not happen, the new format must start from a clean page,
possibly in a clean new segment. If the database is extremely small and has only
a few transactions on the first page of clog, then we must either convert the whole
page (effectively the whole clog file), or we must skip the rest of the XIDs on the
page and ask the database to start from XIDs on the second page on restart.
Also, we need to consider where to store the cut-off XID and when to remove it.
All these details feel very complex and error prone to me. Performing a one-time
conversion is the most efficient and straightforward approach to me.
And if we use this approach, is there extra validation or testing that
can be done?Regards,
Jeff Davis
Unfortunately, the test requires a setup of two different versions of PG. I am not
aware of an existing test infrastructure which can run automated tests using two
PGs. I did manually verify the output of pg_upgrade.
Regards,
Yong
Hi,
On Tue, Mar 19, 2024 at 06:48:33AM +0000, Li, Yong wrote:
Unfortunately, the test requires a setup of two different versions of PG. I am not
aware of an existing test infrastructure which can run automated tests using two
PGs. I did manually verify the output of pg_upgrade.
I think there is something in t/002_pg_upgrade.pl (see src/bin/pg_upgrade/TESTING),
that could be used to run automated tests using an old and a current version.
Regards,
--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
On Mon, Jun 10, 2024 at 07:19:56AM +0000, Bertrand Drouvot wrote:
On Tue, Mar 19, 2024 at 06:48:33AM +0000, Li, Yong wrote:
Unfortunately, the test requires a setup of two different versions of PG. I am not
aware of an existing test infrastructure which can run automated tests using two
PGs. I did manually verify the output of pg_upgrade.I think there is something in t/002_pg_upgrade.pl (see src/bin/pg_upgrade/TESTING),
that could be used to run automated tests using an old and a current version.
Cluster.pm relies on install_path for stuff, where it is possible to
create tests with multiple nodes pointing to different installation
paths. This allows mixing nodes with different build options, or just
different major versions like pg_upgrade's perl tests.
--
Michael
On Jun 10, 2024, at 16:01, Michael Paquier <michael@paquier.xyz> wrote:
External Email
From: Michael Paquier <michael@paquier.xyz>
Subject: Re: Proposal to add page headers to SLRU pages
Date: June 10, 2024 at 16:01:50 GMT+8
To: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Cc: "Li, Yong" <yoli@ebay.com>, Jeff Davis <pgsql@j-davis.com>, Aleksander Alekseev <aleksander@timescale.com>, PostgreSQL Hackers <pgsql-hackers@lists.postgresql.org>, "Bagga, Rishu" <bagrishu@amazon.com>, Robert Haas <robertmhaas@gmail.com>, Andrey Borodin <x4mmm@yandex-team.ru>, "Shyrabokau, Anton" <antons@ebay.com>On Mon, Jun 10, 2024 at 07:19:56AM +0000, Bertrand Drouvot wrote:
On Tue, Mar 19, 2024 at 06:48:33AM +0000, Li, Yong wrote:
Unfortunately, the test requires a setup of two different versions of PG. I am not
aware of an existing test infrastructure which can run automated tests using two
PGs. I did manually verify the output of pg_upgrade.I think there is something in t/002_pg_upgrade.pl (see src/bin/pg_upgrade/TESTING),
that could be used to run automated tests using an old and a current version.Cluster.pm relies on install_path for stuff, where it is possible to
create tests with multiple nodes pointing to different installation
paths. This allows mixing nodes with different build options, or just
different major versions like pg_upgrade's perl tests.
—
Michael
Thanks for pointing this out. Here is what I have tried:
1. Manually build and install PostgreSQL from the latest source code.
2. Following the instructions from src/bin/pg_upgrade to manually dump the regression database.
3. Apply the patch to the latest code, and build from the source.
4. Run “make check” by following the instructions from src/bin/pg_upgrade and setting up the olddump and oldinstall to point to the “old” installation used in step 2.
All tests pass.
Yong
Thanks for pointing this out. Here is what I have tried:
1. Manually build and install PostgreSQL from the latest source code.
2. Following the instructions from src/bin/pg_upgrade to manually dump the regression
database.
3. Apply the patch to the latest code, and build from the source.
4. Run “make check” by following the instructions from src/bin/pg_upgrade and setting up
the olddump and oldinstall to point to the “old” installation used in step 2.
All tests pass.
Hi all,
Following up on this. What remaining work do we need to do to get this in?
Thanks,
Rishu Bagga
Show quoted text
On Sat, Nov 9, 2024 at 8:50 AM Li, Yong <yoli@ebay.com> wrote:
On Jun 10, 2024, at 16:01, Michael Paquier <michael@paquier.xyz> wrote:
External Email
From: Michael Paquier <michael@paquier.xyz>
Subject: Re: Proposal to add page headers to SLRU pages
Date: June 10, 2024 at 16:01:50 GMT+8
To: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Cc: "Li, Yong" <yoli@ebay.com>, Jeff Davis <pgsql@j-davis.com>, Aleksander Alekseev <aleksander@timescale.com>, PostgreSQL Hackers <pgsql-hackers@lists.postgresql.org>, "Bagga, Rishu" <bagrishu@amazon.com>, Robert Haas <robertmhaas@gmail.com>, Andrey Borodin <x4mmm@yandex-team.ru>, "Shyrabokau, Anton" <antons@ebay.com>On Mon, Jun 10, 2024 at 07:19:56AM +0000, Bertrand Drouvot wrote:
On Tue, Mar 19, 2024 at 06:48:33AM +0000, Li, Yong wrote:
Unfortunately, the test requires a setup of two different versions of PG. I am not
aware of an existing test infrastructure which can run automated tests using two
PGs. I did manually verify the output of pg_upgrade.I think there is something in t/002_pg_upgrade.pl (see src/bin/pg_upgrade/TESTING),
that could be used to run automated tests using an old and a current version.Cluster.pm relies on install_path for stuff, where it is possible to
create tests with multiple nodes pointing to different installation
paths. This allows mixing nodes with different build options, or just
different major versions like pg_upgrade's perl tests.
—
MichaelThanks for pointing this out. Here is what I have tried:
1. Manually build and install PostgreSQL from the latest source code.
2. Following the instructions from src/bin/pg_upgrade to manually dump the regression database.
3. Apply the patch to the latest code, and build from the source.
4. Run “make check” by following the instructions from src/bin/pg_upgrade and setting up the olddump and oldinstall to point to the “old” installation used in step 2.All tests pass.
Yong
On Nov 10, 2024, at 01:32, Rishu Bagga <rishu.postgres@gmail.com> wrote:
External Email
Thanks for pointing this out. Here is what I have tried:
1. Manually build and install PostgreSQL from the latest source code.
2. Following the instructions from src/bin/pg_upgrade to manually dump the regression
database.
3. Apply the patch to the latest code, and build from the source.
4. Run “make check” by following the instructions from src/bin/pg_upgrade and setting up
the olddump and oldinstall to point to the “old” installation used in step 2.All tests pass.
Hi all,
Following up on this. What remaining work do we need to do to get this in?
Thanks,
Rishu Bagga
Thanks for taking interest in this proposal. There is no remaining work for this proposal.
It’s now “waiting for review”. It would be great if you can provide a review report, so we
can change the status to “ready for commit”.
I’ve updated the patch against the latest HEAD.
Yong
Attachments:
v7-0001-SLRU-header.patchapplication/octet-stream; name=v7-0001-SLRU-header.patchDownload
From 2da7ecd75c4c8628eb6714057f304ec99007c3db Mon Sep 17 00:00:00 2001
From: yoli <yoli@ebay.com>
Date: Mon, 11 Nov 2024 00:50:04 -0700
Subject: [PATCH vN] This patch adds the standard PageHeaderData to each SLRU
page.
By having a standard page header, we gain the following improvements:
* Each page has a checksum, so we can detect page corruption on page read.
* Each page has a page LSN, so on page write, instead of iterating through
group LSNs and taking the maximum we can easily use the LSN in the header
to enforce Write-Ahead-Logging.
The change set also includes modifications to pg_upgrade to
convert on-disk SLRU files.
---
src/backend/access/transam/clog.c | 45 +-
src/backend/access/transam/commit_ts.c | 1321 +++++++++++-------------
src/backend/access/transam/multixact.c | 55 +-
src/backend/access/transam/slru.c | 68 +-
src/backend/access/transam/subtrans.c | 8 +-
src/backend/commands/async.c | 23 +-
src/backend/storage/lmgr/predicate.c | 12 +-
src/backend/storage/page/bufpage.c | 25 +
src/bin/pg_checksums/pg_checksums.c | 9 +
src/bin/pg_resetwal/t/001_basic.pl | 6 +-
src/bin/pg_upgrade/file.c | 178 ++++
src/bin/pg_upgrade/pg_upgrade.c | 28 +-
src/bin/pg_upgrade/pg_upgrade.h | 6 +
src/include/storage/bufpage.h | 7 +
src/test/modules/test_slru/test_slru.c | 9 +-
15 files changed, 994 insertions(+), 806 deletions(-)
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index e6f79320e9..4c7acde0cc 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -42,6 +42,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/proc.h"
#include "storage/sync.h"
#include "utils/guc_hooks.h"
@@ -61,7 +62,7 @@
/* We need two bits per xact, so four xacts fit in a byte */
#define CLOG_BITS_PER_XACT 2
#define CLOG_XACTS_PER_BYTE 4
-#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+#define CLOG_XACTS_PER_PAGE (SizeOfPageContents * CLOG_XACTS_PER_BYTE)
#define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1)
/*
@@ -90,7 +91,13 @@ TransactionIdToPage(TransactionId xid)
/* We store the latest async LSN for each group of transactions */
#define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */
-#define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)
+
+/*
+ * Use BLCKSZ instead of SizeOfPageContents so that CLOG_LSNS_PER_PAGE is
+ * a power of 2. Using BLCKSZ wastes the last 4 LSN groups per page, but
+ * this is acceptable given that each page has 1,024 LSN groups.
+ */
+#define CLOG_LSNS_PER_PAGE ((BLCKSZ * CLOG_XACTS_PER_BYTE) / CLOG_XACTS_PER_LSN_GROUP)
#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \
((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
@@ -112,7 +119,7 @@ static SlruCtlData XactCtlData;
static int ZeroCLOGPage(int64 pageno, bool writeXlog);
static bool CLOGPagePrecedes(int64 page1, int64 page2);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact,
Oid oldestXactDb);
static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
@@ -665,13 +672,15 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
char *byteptr;
char byteval;
char curval;
+ Page page;
Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(xid));
Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(XactCtl,
XactCtl->shared->page_number[slotno]),
LW_EXCLUSIVE));
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ page = XactCtl->shared->page_buffer[slotno];
+ byteptr = PageGetContents(page) + byteno;
curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
/*
@@ -700,7 +709,7 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*byteptr = byteval;
/*
- * Update the group LSN if the transaction completion LSN is higher.
+ * Update the page & group LSN if the transaction completion LSN is higher.
*
* Note: lsn will be invalid when supplied during InRecovery processing,
* so we don't need to do anything special to avoid LSN updates during
@@ -709,10 +718,13 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*/
if (!XLogRecPtrIsInvalid(lsn))
{
- int lsnindex = GetLSNIndex(slotno, xid);
+ int lsnindex = GetLSNIndex(slotno, xid);
if (XactCtl->shared->group_lsn[lsnindex] < lsn)
XactCtl->shared->group_lsn[lsnindex] = lsn;
+
+ if (PageGetLSN(page) < lsn)
+ PageSetLSN(page, lsn);
}
}
@@ -739,13 +751,15 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
int slotno;
int lsnindex;
+ Page page;
char *byteptr;
XidStatus status;
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ page = XactCtl->shared->page_buffer[slotno];
+ byteptr = PageGetContents(page) + byteno;
status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
@@ -860,11 +874,17 @@ static int
ZeroCLOGPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(XactCtl, pageno);
+ page = XactCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -917,12 +937,12 @@ TrimCLOG(void)
char *byteptr;
slotno = SimpleLruReadPage(XactCtl, pageno, false, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ byteptr = PageGetContents(XactCtl->shared->page_buffer[slotno]) + byteno;
/* Zero so-far-unused positions in the current byte */
*byteptr &= (1 << bshift) - 1;
/* Zero the rest of the page */
- MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
+ MemSet(byteptr + 1, 0, SizeOfPageContents - byteno - 1);
XactCtl->shared->page_dirty[slotno] = true;
}
@@ -946,7 +966,6 @@ CheckPointCLOG(void)
TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that CLOG has room for a newly-allocated XID.
*
@@ -1070,12 +1089,12 @@ CLOGPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
+ return XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index 77e1899d7a..f8d301c447 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -29,6 +29,7 @@
#include "access/xlogutils.h"
#include "funcapi.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/shmem.h"
#include "utils/fmgrprotos.h"
#include "utils/guc_hooks.h"
@@ -51,31 +52,27 @@
* the largest possible file name is more than 5 chars long; see
* SlruScanDirectory.
*/
-typedef struct CommitTimestampEntry
-{
- TimestampTz time;
- RepOriginId nodeid;
+typedef struct CommitTimestampEntry {
+ TimestampTz time;
+ RepOriginId nodeid;
} CommitTimestampEntry;
-#define SizeOfCommitTimestampEntry (offsetof(CommitTimestampEntry, nodeid) + \
- sizeof(RepOriginId))
-
-#define COMMIT_TS_XACTS_PER_PAGE \
- (BLCKSZ / SizeOfCommitTimestampEntry)
+#define SizeOfCommitTimestampEntry \
+ (offsetof(CommitTimestampEntry, nodeid) + sizeof(RepOriginId))
+#define COMMIT_TS_XACTS_PER_PAGE \
+ (SizeOfPageContents / SizeOfCommitTimestampEntry)
/*
* Although we return an int64 the actual value can't currently exceed
* 0xFFFFFFFF/COMMIT_TS_XACTS_PER_PAGE.
*/
-static inline int64
-TransactionIdToCTsPage(TransactionId xid)
-{
- return xid / (int64) COMMIT_TS_XACTS_PER_PAGE;
+static inline int64 TransactionIdToCTsPage(TransactionId xid) {
+ return xid / (int64)COMMIT_TS_XACTS_PER_PAGE;
}
-#define TransactionIdToCTsEntry(xid) \
- ((xid) % (TransactionId) COMMIT_TS_XACTS_PER_PAGE)
+#define TransactionIdToCTsEntry(xid) \
+ ((xid) % (TransactionId)COMMIT_TS_XACTS_PER_PAGE)
/*
* Link to shared-memory data structures for CommitTs control
@@ -95,30 +92,28 @@ static SlruCtlData CommitTsCtlData;
* without acquiring the lock; where this happens, a comment explains the
* rationale for it.
*/
-typedef struct CommitTimestampShared
-{
- TransactionId xidLastCommit;
- CommitTimestampEntry dataLastCommit;
- bool commitTsActive;
+typedef struct CommitTimestampShared {
+ TransactionId xidLastCommit;
+ CommitTimestampEntry dataLastCommit;
+ bool commitTsActive;
} CommitTimestampShared;
static CommitTimestampShared *commitTsShared;
-
/* GUC variable */
-bool track_commit_timestamp;
+bool track_commit_timestamp;
static void SetXidCommitTsInPage(TransactionId xid, int nsubxids,
- TransactionId *subxids, TimestampTz ts,
- RepOriginId nodeid, int64 pageno);
+ TransactionId *subxids, TimestampTz ts,
+ RepOriginId nodeid, int64 pageno);
static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
- RepOriginId nodeid, int slotno);
+ RepOriginId nodeid, int slotno);
static void error_commit_ts_disabled(void);
-static int ZeroCommitTsPage(int64 pageno, bool writeXlog);
+static int ZeroCommitTsPage(int64 pageno, bool writeXlog);
static bool CommitTsPagePrecedes(int64 page1, int64 page2);
static void ActivateCommitTs(void);
static void DeactivateCommitTs(void);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid);
/*
@@ -137,107 +132,101 @@ static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid);
* subtrans implementation changes in the future, we might want to revisit the
* decision of storing timestamp info for each subxid.
*/
-void
-TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids,
- TransactionId *subxids, TimestampTz timestamp,
- RepOriginId nodeid)
-{
- int i;
- TransactionId headxid;
- TransactionId newestXact;
-
- /*
- * No-op if the module is not active.
- *
- * An unlocked read here is fine, because in a standby (the only place
- * where the flag can change in flight) this routine is only called by the
- * recovery process, which is also the only process which can change the
- * flag.
- */
- if (!commitTsShared->commitTsActive)
- return;
-
- /*
- * Figure out the latest Xid in this batch: either the last subxid if
- * there's any, otherwise the parent xid.
- */
- if (nsubxids > 0)
- newestXact = subxids[nsubxids - 1];
- else
- newestXact = xid;
-
- /*
- * We split the xids to set the timestamp to in groups belonging to the
- * same SLRU page; the first element in each such set is its head. The
- * first group has the main XID as the head; subsequent sets use the first
- * subxid not on the previous page as head. This way, we only have to
- * lock/modify each SLRU page once.
- */
- headxid = xid;
- i = 0;
- for (;;)
- {
- int64 pageno = TransactionIdToCTsPage(headxid);
- int j;
-
- for (j = i; j < nsubxids; j++)
- {
- if (TransactionIdToCTsPage(subxids[j]) != pageno)
- break;
- }
- /* subxids[i..j] are on the same page as the head */
-
- SetXidCommitTsInPage(headxid, j - i, subxids + i, timestamp, nodeid,
- pageno);
-
- /* if we wrote out all subxids, we're done. */
- if (j >= nsubxids)
- break;
-
- /*
- * Set the new head and skip over it, as well as over the subxids we
- * just wrote.
- */
- headxid = subxids[j];
- i = j + 1;
- }
-
- /* update the cached value in shared memory */
- LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
- commitTsShared->xidLastCommit = xid;
- commitTsShared->dataLastCommit.time = timestamp;
- commitTsShared->dataLastCommit.nodeid = nodeid;
-
- /* and move forwards our endpoint, if needed */
- if (TransactionIdPrecedes(TransamVariables->newestCommitTsXid, newestXact))
- TransamVariables->newestCommitTsXid = newestXact;
- LWLockRelease(CommitTsLock);
+void TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids,
+ TransactionId *subxids,
+ TimestampTz timestamp, RepOriginId nodeid) {
+ int i;
+ TransactionId headxid;
+ TransactionId newestXact;
+
+ /*
+ * No-op if the module is not active.
+ *
+ * An unlocked read here is fine, because in a standby (the only place
+ * where the flag can change in flight) this routine is only called by the
+ * recovery process, which is also the only process which can change the
+ * flag.
+ */
+ if (!commitTsShared->commitTsActive)
+ return;
+
+ /*
+ * Figure out the latest Xid in this batch: either the last subxid if
+ * there's any, otherwise the parent xid.
+ */
+ if (nsubxids > 0)
+ newestXact = subxids[nsubxids - 1];
+ else
+ newestXact = xid;
+
+ /*
+ * We split the xids to set the timestamp to in groups belonging to the
+ * same SLRU page; the first element in each such set is its head. The
+ * first group has the main XID as the head; subsequent sets use the first
+ * subxid not on the previous page as head. This way, we only have to
+ * lock/modify each SLRU page once.
+ */
+ headxid = xid;
+ i = 0;
+ for (;;) {
+ int64 pageno = TransactionIdToCTsPage(headxid);
+ int j;
+
+ for (j = i; j < nsubxids; j++) {
+ if (TransactionIdToCTsPage(subxids[j]) != pageno)
+ break;
+ }
+ /* subxids[i..j] are on the same page as the head */
+
+ SetXidCommitTsInPage(headxid, j - i, subxids + i, timestamp, nodeid,
+ pageno);
+
+ /* if we wrote out all subxids, we're done. */
+ if (j >= nsubxids)
+ break;
+
+ /*
+ * Set the new head and skip over it, as well as over the subxids we
+ * just wrote.
+ */
+ headxid = subxids[j];
+ i = j + 1;
+ }
+
+ /* update the cached value in shared memory */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ commitTsShared->xidLastCommit = xid;
+ commitTsShared->dataLastCommit.time = timestamp;
+ commitTsShared->dataLastCommit.nodeid = nodeid;
+
+ /* and move forwards our endpoint, if needed */
+ if (TransactionIdPrecedes(TransamVariables->newestCommitTsXid, newestXact))
+ TransamVariables->newestCommitTsXid = newestXact;
+ LWLockRelease(CommitTsLock);
}
/*
* Record the commit timestamp of transaction entries in the commit log for all
* entries on a single page. Atomic only on this page.
*/
-static void
-SetXidCommitTsInPage(TransactionId xid, int nsubxids,
- TransactionId *subxids, TimestampTz ts,
- RepOriginId nodeid, int64 pageno)
-{
- LWLock *lock = SimpleLruGetBankLock(CommitTsCtl, pageno);
- int slotno;
- int i;
+static void SetXidCommitTsInPage(TransactionId xid, int nsubxids,
+ TransactionId *subxids, TimestampTz ts,
+ RepOriginId nodeid, int64 pageno) {
+ LWLock *lock = SimpleLruGetBankLock(CommitTsCtl, pageno);
+ int slotno;
+ int i;
- LWLockAcquire(lock, LW_EXCLUSIVE);
+ LWLockAcquire(lock, LW_EXCLUSIVE);
- slotno = SimpleLruReadPage(CommitTsCtl, pageno, true, xid);
+ slotno = SimpleLruReadPage(CommitTsCtl, pageno, true, xid);
- TransactionIdSetCommitTs(xid, ts, nodeid, slotno);
- for (i = 0; i < nsubxids; i++)
- TransactionIdSetCommitTs(subxids[i], ts, nodeid, slotno);
+ TransactionIdSetCommitTs(xid, ts, nodeid, slotno);
+ for (i = 0; i < nsubxids; i++)
+ TransactionIdSetCommitTs(subxids[i], ts, nodeid, slotno);
- CommitTsCtl->shared->page_dirty[slotno] = true;
+ CommitTsCtl->shared->page_dirty[slotno] = true;
- LWLockRelease(lock);
+ LWLockRelease(lock);
}
/*
@@ -245,21 +234,19 @@ SetXidCommitTsInPage(TransactionId xid, int nsubxids,
*
* Caller must hold the correct SLRU bank lock, will be held at exit
*/
-static void
-TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
- RepOriginId nodeid, int slotno)
-{
- int entryno = TransactionIdToCTsEntry(xid);
- CommitTimestampEntry entry;
+static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
+ RepOriginId nodeid, int slotno) {
+ int entryno = TransactionIdToCTsEntry(xid);
+ CommitTimestampEntry entry;
- Assert(TransactionIdIsNormal(xid));
+ Assert(TransactionIdIsNormal(xid));
- entry.time = ts;
- entry.nodeid = nodeid;
+ entry.time = ts;
+ entry.nodeid = nodeid;
- memcpy(CommitTsCtl->shared->page_buffer[slotno] +
- SizeOfCommitTimestampEntry * entryno,
- &entry, SizeOfCommitTimestampEntry);
+ memcpy(PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
+ SizeOfCommitTimestampEntry * entryno,
+ &entry, SizeOfCommitTimestampEntry);
}
/*
@@ -270,82 +257,79 @@ TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
* null), and the origin node for the Xid is returned in *nodeid, if it's not
* null.
*/
-bool
-TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts,
- RepOriginId *nodeid)
-{
- int64 pageno = TransactionIdToCTsPage(xid);
- int entryno = TransactionIdToCTsEntry(xid);
- int slotno;
- CommitTimestampEntry entry;
- TransactionId oldestCommitTsXid;
- TransactionId newestCommitTsXid;
-
- if (!TransactionIdIsValid(xid))
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("cannot retrieve commit timestamp for transaction %u", xid)));
- else if (!TransactionIdIsNormal(xid))
- {
- /* frozen and bootstrap xids are always committed far in the past */
- *ts = 0;
- if (nodeid)
- *nodeid = 0;
- return false;
- }
-
- LWLockAcquire(CommitTsLock, LW_SHARED);
-
- /* Error if module not enabled */
- if (!commitTsShared->commitTsActive)
- error_commit_ts_disabled();
-
- /*
- * If we're asked for the cached value, return that. Otherwise, fall
- * through to read from SLRU.
- */
- if (commitTsShared->xidLastCommit == xid)
- {
- *ts = commitTsShared->dataLastCommit.time;
- if (nodeid)
- *nodeid = commitTsShared->dataLastCommit.nodeid;
-
- LWLockRelease(CommitTsLock);
- return *ts != 0;
- }
-
- oldestCommitTsXid = TransamVariables->oldestCommitTsXid;
- newestCommitTsXid = TransamVariables->newestCommitTsXid;
- /* neither is invalid, or both are */
- Assert(TransactionIdIsValid(oldestCommitTsXid) == TransactionIdIsValid(newestCommitTsXid));
- LWLockRelease(CommitTsLock);
-
- /*
- * Return empty if the requested value is outside our valid range.
- */
- if (!TransactionIdIsValid(oldestCommitTsXid) ||
- TransactionIdPrecedes(xid, oldestCommitTsXid) ||
- TransactionIdPrecedes(newestCommitTsXid, xid))
- {
- *ts = 0;
- if (nodeid)
- *nodeid = InvalidRepOriginId;
- return false;
- }
-
- /* lock is acquired by SimpleLruReadPage_ReadOnly */
- slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid);
- memcpy(&entry,
- CommitTsCtl->shared->page_buffer[slotno] +
- SizeOfCommitTimestampEntry * entryno,
- SizeOfCommitTimestampEntry);
-
- *ts = entry.time;
- if (nodeid)
- *nodeid = entry.nodeid;
-
- LWLockRelease(SimpleLruGetBankLock(CommitTsCtl, pageno));
- return *ts != 0;
+bool TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts,
+ RepOriginId *nodeid) {
+ int64 pageno = TransactionIdToCTsPage(xid);
+ int entryno = TransactionIdToCTsEntry(xid);
+ int slotno;
+ CommitTimestampEntry entry;
+ TransactionId oldestCommitTsXid;
+ TransactionId newestCommitTsXid;
+
+ if (!TransactionIdIsValid(xid))
+ ereport(
+ ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("cannot retrieve commit timestamp for transaction %u", xid)));
+ else if (!TransactionIdIsNormal(xid)) {
+ /* frozen and bootstrap xids are always committed far in the past */
+ *ts = 0;
+ if (nodeid)
+ *nodeid = 0;
+ return false;
+ }
+
+ LWLockAcquire(CommitTsLock, LW_SHARED);
+
+ /* Error if module not enabled */
+ if (!commitTsShared->commitTsActive)
+ error_commit_ts_disabled();
+
+ /*
+ * If we're asked for the cached value, return that. Otherwise, fall
+ * through to read from SLRU.
+ */
+ if (commitTsShared->xidLastCommit == xid) {
+ *ts = commitTsShared->dataLastCommit.time;
+ if (nodeid)
+ *nodeid = commitTsShared->dataLastCommit.nodeid;
+
+ LWLockRelease(CommitTsLock);
+ return *ts != 0;
+ }
+
+ oldestCommitTsXid = TransamVariables->oldestCommitTsXid;
+ newestCommitTsXid = TransamVariables->newestCommitTsXid;
+ /* neither is invalid, or both are */
+ Assert(TransactionIdIsValid(oldestCommitTsXid) ==
+ TransactionIdIsValid(newestCommitTsXid));
+ LWLockRelease(CommitTsLock);
+
+ /*
+ * Return empty if the requested value is outside our valid range.
+ */
+ if (!TransactionIdIsValid(oldestCommitTsXid) ||
+ TransactionIdPrecedes(xid, oldestCommitTsXid) ||
+ TransactionIdPrecedes(newestCommitTsXid, xid)) {
+ *ts = 0;
+ if (nodeid)
+ *nodeid = InvalidRepOriginId;
+ return false;
+ }
+
+ /* lock is acquired by SimpleLruReadPage_ReadOnly */
+ slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid);
+ memcpy(&entry,
+ PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
+ SizeOfCommitTimestampEntry * entryno,
+ SizeOfCommitTimestampEntry);
+
+ *ts = entry.time;
+ if (nodeid)
+ *nodeid = entry.nodeid;
+
+ LWLockRelease(SimpleLruGetBankLock(CommitTsCtl, pageno));
+ return *ts != 0;
}
/*
@@ -356,59 +340,53 @@ TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts,
* ts and nodeid are filled with the corresponding data; they can be passed
* as NULL if not wanted.
*/
-TransactionId
-GetLatestCommitTsData(TimestampTz *ts, RepOriginId *nodeid)
-{
- TransactionId xid;
+TransactionId GetLatestCommitTsData(TimestampTz *ts, RepOriginId *nodeid) {
+ TransactionId xid;
- LWLockAcquire(CommitTsLock, LW_SHARED);
+ LWLockAcquire(CommitTsLock, LW_SHARED);
- /* Error if module not enabled */
- if (!commitTsShared->commitTsActive)
- error_commit_ts_disabled();
+ /* Error if module not enabled */
+ if (!commitTsShared->commitTsActive)
+ error_commit_ts_disabled();
- xid = commitTsShared->xidLastCommit;
- if (ts)
- *ts = commitTsShared->dataLastCommit.time;
- if (nodeid)
- *nodeid = commitTsShared->dataLastCommit.nodeid;
- LWLockRelease(CommitTsLock);
+ xid = commitTsShared->xidLastCommit;
+ if (ts)
+ *ts = commitTsShared->dataLastCommit.time;
+ if (nodeid)
+ *nodeid = commitTsShared->dataLastCommit.nodeid;
+ LWLockRelease(CommitTsLock);
- return xid;
+ return xid;
}
-static void
-error_commit_ts_disabled(void)
-{
- ereport(ERROR,
- (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("could not get commit timestamp data"),
- RecoveryInProgress() ?
- errhint("Make sure the configuration parameter \"%s\" is set on the primary server.",
- "track_commit_timestamp") :
- errhint("Make sure the configuration parameter \"%s\" is set.",
- "track_commit_timestamp")));
+static void error_commit_ts_disabled(void) {
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not get commit timestamp data"),
+ RecoveryInProgress()
+ ? errhint("Make sure the configuration parameter \"%s\" is set "
+ "on the primary server.",
+ "track_commit_timestamp")
+ : errhint("Make sure the configuration parameter \"%s\" is set.",
+ "track_commit_timestamp")));
}
/*
* SQL-callable wrapper to obtain commit time of a transaction
*/
-Datum
-pg_xact_commit_timestamp(PG_FUNCTION_ARGS)
-{
- TransactionId xid = PG_GETARG_TRANSACTIONID(0);
- TimestampTz ts;
- bool found;
+Datum pg_xact_commit_timestamp(PG_FUNCTION_ARGS) {
+ TransactionId xid = PG_GETARG_TRANSACTIONID(0);
+ TimestampTz ts;
+ bool found;
- found = TransactionIdGetCommitTsData(xid, &ts, NULL);
+ found = TransactionIdGetCommitTsData(xid, &ts, NULL);
- if (!found)
- PG_RETURN_NULL();
+ if (!found)
+ PG_RETURN_NULL();
- PG_RETURN_TIMESTAMPTZ(ts);
+ PG_RETURN_TIMESTAMPTZ(ts);
}
-
/*
* pg_last_committed_xact
*
@@ -416,42 +394,37 @@ pg_xact_commit_timestamp(PG_FUNCTION_ARGS)
* committed transaction: transaction ID, timestamp and replication
* origin.
*/
-Datum
-pg_last_committed_xact(PG_FUNCTION_ARGS)
-{
- TransactionId xid;
- RepOriginId nodeid;
- TimestampTz ts;
- Datum values[3];
- bool nulls[3];
- TupleDesc tupdesc;
- HeapTuple htup;
-
- /* and construct a tuple with our data */
- xid = GetLatestCommitTsData(&ts, &nodeid);
-
- if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
- elog(ERROR, "return type must be a row type");
-
- if (!TransactionIdIsNormal(xid))
- {
- memset(nulls, true, sizeof(nulls));
- }
- else
- {
- values[0] = TransactionIdGetDatum(xid);
- nulls[0] = false;
-
- values[1] = TimestampTzGetDatum(ts);
- nulls[1] = false;
-
- values[2] = ObjectIdGetDatum((Oid) nodeid);
- nulls[2] = false;
- }
-
- htup = heap_form_tuple(tupdesc, values, nulls);
-
- PG_RETURN_DATUM(HeapTupleGetDatum(htup));
+Datum pg_last_committed_xact(PG_FUNCTION_ARGS) {
+ TransactionId xid;
+ RepOriginId nodeid;
+ TimestampTz ts;
+ Datum values[3];
+ bool nulls[3];
+ TupleDesc tupdesc;
+ HeapTuple htup;
+
+ /* and construct a tuple with our data */
+ xid = GetLatestCommitTsData(&ts, &nodeid);
+
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ if (!TransactionIdIsNormal(xid)) {
+ memset(nulls, true, sizeof(nulls));
+ } else {
+ values[0] = TransactionIdGetDatum(xid);
+ nulls[0] = false;
+
+ values[1] = TimestampTzGetDatum(ts);
+ nulls[1] = false;
+
+ values[2] = ObjectIdGetDatum((Oid)nodeid);
+ nulls[2] = false;
+ }
+
+ htup = heap_form_tuple(tupdesc, values, nulls);
+
+ PG_RETURN_DATUM(HeapTupleGetDatum(htup));
}
/*
@@ -460,39 +433,34 @@ pg_last_committed_xact(PG_FUNCTION_ARGS)
* SQL-callable wrapper to obtain commit timestamp and replication origin
* of a given transaction.
*/
-Datum
-pg_xact_commit_timestamp_origin(PG_FUNCTION_ARGS)
-{
- TransactionId xid = PG_GETARG_TRANSACTIONID(0);
- RepOriginId nodeid;
- TimestampTz ts;
- Datum values[2];
- bool nulls[2];
- TupleDesc tupdesc;
- HeapTuple htup;
- bool found;
-
- found = TransactionIdGetCommitTsData(xid, &ts, &nodeid);
-
- if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
- elog(ERROR, "return type must be a row type");
-
- if (!found)
- {
- memset(nulls, true, sizeof(nulls));
- }
- else
- {
- values[0] = TimestampTzGetDatum(ts);
- nulls[0] = false;
-
- values[1] = ObjectIdGetDatum((Oid) nodeid);
- nulls[1] = false;
- }
-
- htup = heap_form_tuple(tupdesc, values, nulls);
-
- PG_RETURN_DATUM(HeapTupleGetDatum(htup));
+Datum pg_xact_commit_timestamp_origin(PG_FUNCTION_ARGS) {
+ TransactionId xid = PG_GETARG_TRANSACTIONID(0);
+ RepOriginId nodeid;
+ TimestampTz ts;
+ Datum values[2];
+ bool nulls[2];
+ TupleDesc tupdesc;
+ HeapTuple htup;
+ bool found;
+
+ found = TransactionIdGetCommitTsData(xid, &ts, &nodeid);
+
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ if (!found) {
+ memset(nulls, true, sizeof(nulls));
+ } else {
+ values[0] = TimestampTzGetDatum(ts);
+ nulls[0] = false;
+
+ values[1] = ObjectIdGetDatum((Oid)nodeid);
+ nulls[1] = false;
+ }
+
+ htup = heap_form_tuple(tupdesc, values, nulls);
+
+ PG_RETURN_DATUM(HeapTupleGetDatum(htup));
}
/*
@@ -502,88 +470,74 @@ pg_xact_commit_timestamp_origin(PG_FUNCTION_ARGS)
* Otherwise just cap the configured amount to be between 16 and the maximum
* allowed.
*/
-static int
-CommitTsShmemBuffers(void)
-{
- /* auto-tune based on shared buffers */
- if (commit_timestamp_buffers == 0)
- return SimpleLruAutotuneBuffers(512, 1024);
-
- return Min(Max(16, commit_timestamp_buffers), SLRU_MAX_ALLOWED_BUFFERS);
+static int CommitTsShmemBuffers(void) {
+ /* auto-tune based on shared buffers */
+ if (commit_timestamp_buffers == 0)
+ return SimpleLruAutotuneBuffers(512, 1024);
+
+ return Min(Max(16, commit_timestamp_buffers), SLRU_MAX_ALLOWED_BUFFERS);
}
/*
* Shared memory sizing for CommitTs
*/
-Size
-CommitTsShmemSize(void)
-{
- return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) +
- sizeof(CommitTimestampShared);
+Size CommitTsShmemSize(void) {
+ return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) +
+ sizeof(CommitTimestampShared);
}
/*
* Initialize CommitTs at system startup (postmaster start or standalone
* backend)
*/
-void
-CommitTsShmemInit(void)
-{
- bool found;
-
- /* If auto-tuning is requested, now is the time to do it */
- if (commit_timestamp_buffers == 0)
- {
- char buf[32];
-
- snprintf(buf, sizeof(buf), "%d", CommitTsShmemBuffers());
- SetConfigOption("commit_timestamp_buffers", buf, PGC_POSTMASTER,
- PGC_S_DYNAMIC_DEFAULT);
-
- /*
- * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
- * However, if the DBA explicitly set commit_timestamp_buffers = 0 in
- * the config file, then PGC_S_DYNAMIC_DEFAULT will fail to override
- * that and we must force the matter with PGC_S_OVERRIDE.
- */
- if (commit_timestamp_buffers == 0) /* failed to apply it? */
- SetConfigOption("commit_timestamp_buffers", buf, PGC_POSTMASTER,
- PGC_S_OVERRIDE);
- }
- Assert(commit_timestamp_buffers != 0);
-
- CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
- SimpleLruInit(CommitTsCtl, "commit_timestamp", CommitTsShmemBuffers(), 0,
- "pg_commit_ts", LWTRANCHE_COMMITTS_BUFFER,
- LWTRANCHE_COMMITTS_SLRU,
- SYNC_HANDLER_COMMIT_TS,
- false);
- SlruPagePrecedesUnitTests(CommitTsCtl, COMMIT_TS_XACTS_PER_PAGE);
-
- commitTsShared = ShmemInitStruct("CommitTs shared",
- sizeof(CommitTimestampShared),
- &found);
-
- if (!IsUnderPostmaster)
- {
- Assert(!found);
-
- commitTsShared->xidLastCommit = InvalidTransactionId;
- TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time);
- commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId;
- commitTsShared->commitTsActive = false;
- }
- else
- Assert(found);
+void CommitTsShmemInit(void) {
+ bool found;
+
+ /* If auto-tuning is requested, now is the time to do it */
+ if (commit_timestamp_buffers == 0) {
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "%d", CommitTsShmemBuffers());
+ SetConfigOption("commit_timestamp_buffers", buf, PGC_POSTMASTER,
+ PGC_S_DYNAMIC_DEFAULT);
+
+ /*
+ * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
+ * However, if the DBA explicitly set commit_timestamp_buffers = 0 in
+ * the config file, then PGC_S_DYNAMIC_DEFAULT will fail to override
+ * that and we must force the matter with PGC_S_OVERRIDE.
+ */
+ if (commit_timestamp_buffers == 0) /* failed to apply it? */
+ SetConfigOption("commit_timestamp_buffers", buf, PGC_POSTMASTER,
+ PGC_S_OVERRIDE);
+ }
+ Assert(commit_timestamp_buffers != 0);
+
+ CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
+ SimpleLruInit(CommitTsCtl, "commit_timestamp", CommitTsShmemBuffers(), 0,
+ "pg_commit_ts", LWTRANCHE_COMMITTS_BUFFER,
+ LWTRANCHE_COMMITTS_SLRU, SYNC_HANDLER_COMMIT_TS, false);
+ SlruPagePrecedesUnitTests(CommitTsCtl, COMMIT_TS_XACTS_PER_PAGE);
+
+ commitTsShared =
+ ShmemInitStruct("CommitTs shared", sizeof(CommitTimestampShared), &found);
+
+ if (!IsUnderPostmaster) {
+ Assert(!found);
+
+ commitTsShared->xidLastCommit = InvalidTransactionId;
+ TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time);
+ commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId;
+ commitTsShared->commitTsActive = false;
+ } else
+ Assert(found);
}
/*
* GUC check_hook for commit_timestamp_buffers
*/
-bool
-check_commit_ts_buffers(int *newval, void **extra, GucSource source)
-{
- return check_slru_buffers("commit_timestamp_buffers", newval);
+bool check_commit_ts_buffers(int *newval, void **extra, GucSource source) {
+ return check_slru_buffers("commit_timestamp_buffers", newval);
}
/*
@@ -592,14 +546,12 @@ check_commit_ts_buffers(int *newval, void **extra, GucSource source)
* (The CommitTs directory is assumed to have been created by initdb, and
* CommitTsShmemInit must have been called already.)
*/
-void
-BootStrapCommitTs(void)
-{
- /*
- * Nothing to do here at present, unlike most other SLRU modules; segments
- * are created when the server is started with this module enabled. See
- * ActivateCommitTs.
- */
+void BootStrapCommitTs(void) {
+ /*
+ * Nothing to do here at present, unlike most other SLRU modules; segments
+ * are created when the server is started with this module enabled. See
+ * ActivateCommitTs.
+ */
}
/*
@@ -611,84 +563,77 @@ BootStrapCommitTs(void)
*
* Control lock must be held at entry, and will be held at exit.
*/
-static int
-ZeroCommitTsPage(int64 pageno, bool writeXlog)
-{
- int slotno;
+static int ZeroCommitTsPage(int64 pageno, bool writeXlog) {
+ int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
- slotno = SimpleLruZeroPage(CommitTsCtl, pageno);
+ slotno = SimpleLruZeroPage(CommitTsCtl, pageno);
+ page = CommitTsCtl->shared->page_buffer[slotno];
- if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ if (writeXlog) {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
- return slotno;
+ return slotno;
}
/*
* This must be called ONCE during postmaster or standalone-backend startup,
* after StartupXLOG has initialized TransamVariables->nextXid.
*/
-void
-StartupCommitTs(void)
-{
- ActivateCommitTs();
-}
+void StartupCommitTs(void) { ActivateCommitTs(); }
/*
* This must be called ONCE during postmaster or standalone-backend startup,
* after recovery has finished.
*/
-void
-CompleteCommitTsInitialization(void)
-{
- /*
- * If the feature is not enabled, turn it off for good. This also removes
- * any leftover data.
- *
- * Conversely, we activate the module if the feature is enabled. This is
- * necessary for primary and standby as the activation depends on the
- * control file contents at the beginning of recovery or when a
- * XLOG_PARAMETER_CHANGE is replayed.
- */
- if (!track_commit_timestamp)
- DeactivateCommitTs();
- else
- ActivateCommitTs();
+void CompleteCommitTsInitialization(void) {
+ /*
+ * If the feature is not enabled, turn it off for good. This also removes
+ * any leftover data.
+ *
+ * Conversely, we activate the module if the feature is enabled. This is
+ * necessary for primary and standby as the activation depends on the
+ * control file contents at the beginning of recovery or when a
+ * XLOG_PARAMETER_CHANGE is replayed.
+ */
+ if (!track_commit_timestamp)
+ DeactivateCommitTs();
+ else
+ ActivateCommitTs();
}
/*
* Activate or deactivate CommitTs' upon reception of a XLOG_PARAMETER_CHANGE
* XLog record during recovery.
*/
-void
-CommitTsParameterChange(bool newvalue, bool oldvalue)
-{
- /*
- * If the commit_ts module is disabled in this server and we get word from
- * the primary server that it is enabled there, activate it so that we can
- * replay future WAL records involving it; also mark it as active on
- * pg_control. If the old value was already set, we already did this, so
- * don't do anything.
- *
- * If the module is disabled in the primary, disable it here too, unless
- * the module is enabled locally.
- *
- * Note this only runs in the recovery process, so an unlocked read is
- * fine.
- */
- if (newvalue)
- {
- if (!commitTsShared->commitTsActive)
- ActivateCommitTs();
- }
- else if (commitTsShared->commitTsActive)
- DeactivateCommitTs();
+void CommitTsParameterChange(bool newvalue, bool oldvalue) {
+ /*
+ * If the commit_ts module is disabled in this server and we get word from
+ * the primary server that it is enabled there, activate it so that we can
+ * replay future WAL records involving it; also mark it as active on
+ * pg_control. If the old value was already set, we already did this, so
+ * don't do anything.
+ *
+ * If the module is disabled in the primary, disable it here too, unless
+ * the module is enabled locally.
+ *
+ * Note this only runs in the recovery process, so an unlocked read is
+ * fine.
+ */
+ if (newvalue) {
+ if (!commitTsShared->commitTsActive)
+ ActivateCommitTs();
+ } else if (commitTsShared->commitTsActive)
+ DeactivateCommitTs();
}
/*
* Activate this module whenever necessary.
- * This must happen during postmaster or standalone-backend startup,
- * or during WAL replay anytime the track_commit_timestamp setting is
+ * This must happen during postmaster or standalone-backend
+ *startup, or during WAL replay anytime the track_commit_timestamp setting is
* changed in the primary.
*
* The reason why this SLRU needs separate activation/deactivation functions is
@@ -701,67 +646,62 @@ CommitTsParameterChange(bool newvalue, bool oldvalue)
* running with this module disabled for a while and thus might have skipped
* the normal creation point.
*/
-static void
-ActivateCommitTs(void)
-{
- TransactionId xid;
- int64 pageno;
-
- /* If we've done this already, there's nothing to do */
- LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
- if (commitTsShared->commitTsActive)
- {
- LWLockRelease(CommitTsLock);
- return;
- }
- LWLockRelease(CommitTsLock);
-
- xid = XidFromFullTransactionId(TransamVariables->nextXid);
- pageno = TransactionIdToCTsPage(xid);
-
- /*
- * Re-Initialize our idea of the latest page number.
- */
- pg_atomic_write_u64(&CommitTsCtl->shared->latest_page_number, pageno);
-
- /*
- * If CommitTs is enabled, but it wasn't in the previous server run, we
- * need to set the oldest and newest values to the next Xid; that way, we
- * will not try to read data that might not have been set.
- *
- * XXX does this have a problem if a server is started with commitTs
- * enabled, then started with commitTs disabled, then restarted with it
- * enabled again? It doesn't look like it does, because there should be a
- * checkpoint that sets the value to InvalidTransactionId at end of
- * recovery; and so any chance of injecting new transactions without
- * CommitTs values would occur after the oldestCommitTsXid has been set to
- * Invalid temporarily.
- */
- LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
- if (TransamVariables->oldestCommitTsXid == InvalidTransactionId)
- {
- TransamVariables->oldestCommitTsXid =
- TransamVariables->newestCommitTsXid = ReadNextTransactionId();
- }
- LWLockRelease(CommitTsLock);
-
- /* Create the current segment file, if necessary */
- if (!SimpleLruDoesPhysicalPageExist(CommitTsCtl, pageno))
- {
- LWLock *lock = SimpleLruGetBankLock(CommitTsCtl, pageno);
- int slotno;
-
- LWLockAcquire(lock, LW_EXCLUSIVE);
- slotno = ZeroCommitTsPage(pageno, false);
- SimpleLruWritePage(CommitTsCtl, slotno);
- Assert(!CommitTsCtl->shared->page_dirty[slotno]);
- LWLockRelease(lock);
- }
-
- /* Change the activation status in shared memory. */
- LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
- commitTsShared->commitTsActive = true;
- LWLockRelease(CommitTsLock);
+static void ActivateCommitTs(void) {
+ TransactionId xid;
+ int64 pageno;
+
+ /* If we've done this already, there's nothing to do */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ if (commitTsShared->commitTsActive) {
+ LWLockRelease(CommitTsLock);
+ return;
+ }
+ LWLockRelease(CommitTsLock);
+
+ xid = XidFromFullTransactionId(TransamVariables->nextXid);
+ pageno = TransactionIdToCTsPage(xid);
+
+ /*
+ * Re-Initialize our idea of the latest page number.
+ */
+ pg_atomic_write_u64(&CommitTsCtl->shared->latest_page_number, pageno);
+
+ /*
+ * If CommitTs is enabled, but it wasn't in the previous server run, we
+ * need to set the oldest and newest values to the next Xid; that way, we
+ * will not try to read data that might not have been set.
+ *
+ * XXX does this have a problem if a server is started with commitTs
+ * enabled, then started with commitTs disabled, then restarted with it
+ * enabled again? It doesn't look like it does, because there should be a
+ * checkpoint that sets the value to InvalidTransactionId at end of
+ * recovery; and so any chance of injecting new transactions without
+ * CommitTs values would occur after the oldestCommitTsXid has been set to
+ * Invalid temporarily.
+ */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ if (TransamVariables->oldestCommitTsXid == InvalidTransactionId) {
+ TransamVariables->oldestCommitTsXid = TransamVariables->newestCommitTsXid =
+ ReadNextTransactionId();
+ }
+ LWLockRelease(CommitTsLock);
+
+ /* Create the current segment file, if necessary */
+ if (!SimpleLruDoesPhysicalPageExist(CommitTsCtl, pageno)) {
+ LWLock *lock = SimpleLruGetBankLock(CommitTsCtl, pageno);
+ int slotno;
+
+ LWLockAcquire(lock, LW_EXCLUSIVE);
+ slotno = ZeroCommitTsPage(pageno, false);
+ SimpleLruWritePage(CommitTsCtl, slotno);
+ Assert(!CommitTsCtl->shared->page_dirty[slotno]);
+ LWLockRelease(lock);
+ }
+
+ /* Change the activation status in shared memory. */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ commitTsShared->commitTsActive = true;
+ LWLockRelease(CommitTsLock);
}
/*
@@ -774,57 +714,53 @@ ActivateCommitTs(void)
* Resets CommitTs into invalid state to make sure we don't hand back
* possibly-invalid data; also removes segments of old data.
*/
-static void
-DeactivateCommitTs(void)
-{
- /*
- * Cleanup the status in the shared memory.
- *
- * We reset everything in the commitTsShared record to prevent user from
- * getting confusing data about last committed transaction on the standby
- * when the module was activated repeatedly on the primary.
- */
- LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
-
- commitTsShared->commitTsActive = false;
- commitTsShared->xidLastCommit = InvalidTransactionId;
- TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time);
- commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId;
-
- TransamVariables->oldestCommitTsXid = InvalidTransactionId;
- TransamVariables->newestCommitTsXid = InvalidTransactionId;
-
- /*
- * Remove *all* files. This is necessary so that there are no leftover
- * files; in the case where this feature is later enabled after running
- * with it disabled for some time there may be a gap in the file sequence.
- * (We can probably tolerate out-of-sequence files, as they are going to
- * be overwritten anyway when we wrap around, but it seems better to be
- * tidy.)
- *
- * Note that we do this with CommitTsLock acquired in exclusive mode. This
- * is very heavy-handed, but since this routine can only be called in the
- * replica and should happen very rarely, we don't worry too much about
- * it. Note also that no process should be consulting this SLRU if we
- * have just deactivated it.
- */
- (void) SlruScanDirectory(CommitTsCtl, SlruScanDirCbDeleteAll, NULL);
-
- LWLockRelease(CommitTsLock);
+static void DeactivateCommitTs(void) {
+ /*
+ * Cleanup the status in the shared memory.
+ *
+ * We reset everything in the commitTsShared record to prevent user from
+ * getting confusing data about last committed transaction on the standby
+ * when the module was activated repeatedly on the primary.
+ */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+
+ commitTsShared->commitTsActive = false;
+ commitTsShared->xidLastCommit = InvalidTransactionId;
+ TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time);
+ commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId;
+
+ TransamVariables->oldestCommitTsXid = InvalidTransactionId;
+ TransamVariables->newestCommitTsXid = InvalidTransactionId;
+
+ /*
+ * Remove *all* files. This is necessary so that there are no leftover
+ * files; in the case where this feature is later enabled after running
+ * with it disabled for some time there may be a gap in the file sequence.
+ * (We can probably tolerate out-of-sequence files, as they are going to
+ * be overwritten anyway when we wrap around, but it seems better to be
+ * tidy.)
+ *
+ * Note that we do this with CommitTsLock acquired in exclusive mode. This
+ * is very heavy-handed, but since this routine can only be called in the
+ * replica and should happen very rarely, we don't worry too much about
+ * it. Note also that no process should be consulting this SLRU if we
+ * have just deactivated it.
+ */
+ (void)SlruScanDirectory(CommitTsCtl, SlruScanDirCbDeleteAll, NULL);
+
+ LWLockRelease(CommitTsLock);
}
/*
* Perform a checkpoint --- either during shutdown, or on-the-fly
*/
-void
-CheckPointCommitTs(void)
-{
- /*
- * Write dirty CommitTs pages to disk. This may result in sync requests
- * queued for later handling by ProcessSyncRequests(), as part of the
- * checkpoint.
- */
- SimpleLruWriteAll(CommitTsCtl, true);
+void CheckPointCommitTs(void) {
+ /*
+ * Write dirty CommitTs pages to disk. This may result in sync requests
+ * queued for later handling by ProcessSyncRequests(), as part of the
+ * checkpoint.
+ */
+ SimpleLruWriteAll(CommitTsCtl, true);
}
/*
@@ -838,39 +774,37 @@ CheckPointCommitTs(void)
* NB: the current implementation relies on track_commit_timestamp being
* PGC_POSTMASTER.
*/
-void
-ExtendCommitTs(TransactionId newestXact)
-{
- int64 pageno;
- LWLock *lock;
-
- /*
- * Nothing to do if module not enabled. Note we do an unlocked read of
- * the flag here, which is okay because this routine is only called from
- * GetNewTransactionId, which is never called in a standby.
- */
- Assert(!InRecovery);
- if (!commitTsShared->commitTsActive)
- return;
-
- /*
- * No work except at first XID of a page. But beware: just after
- * wraparound, the first XID of page zero is FirstNormalTransactionId.
- */
- if (TransactionIdToCTsEntry(newestXact) != 0 &&
- !TransactionIdEquals(newestXact, FirstNormalTransactionId))
- return;
-
- pageno = TransactionIdToCTsPage(newestXact);
-
- lock = SimpleLruGetBankLock(CommitTsCtl, pageno);
-
- LWLockAcquire(lock, LW_EXCLUSIVE);
-
- /* Zero the page and make an XLOG entry about it */
- ZeroCommitTsPage(pageno, !InRecovery);
-
- LWLockRelease(lock);
+void ExtendCommitTs(TransactionId newestXact) {
+ int64 pageno;
+ LWLock *lock;
+
+ /*
+ * Nothing to do if module not enabled. Note we do an unlocked read of
+ * the flag here, which is okay because this routine is only called from
+ * GetNewTransactionId, which is never called in a standby.
+ */
+ Assert(!InRecovery);
+ if (!commitTsShared->commitTsActive)
+ return;
+
+ /*
+ * No work except at first XID of a page. But beware: just after
+ * wraparound, the first XID of page zero is FirstNormalTransactionId.
+ */
+ if (TransactionIdToCTsEntry(newestXact) != 0 &&
+ !TransactionIdEquals(newestXact, FirstNormalTransactionId))
+ return;
+
+ pageno = TransactionIdToCTsPage(newestXact);
+
+ lock = SimpleLruGetBankLock(CommitTsCtl, pageno);
+
+ LWLockAcquire(lock, LW_EXCLUSIVE);
+
+ /* Zero the page and make an XLOG entry about it */
+ ZeroCommitTsPage(pageno, !InRecovery);
+
+ LWLockRelease(lock);
}
/*
@@ -879,70 +813,59 @@ ExtendCommitTs(TransactionId newestXact)
*
* Note that we don't need to flush XLOG here.
*/
-void
-TruncateCommitTs(TransactionId oldestXact)
-{
- int64 cutoffPage;
-
- /*
- * The cutoff point is the start of the segment containing oldestXact. We
- * pass the *page* containing oldestXact to SimpleLruTruncate.
- */
- cutoffPage = TransactionIdToCTsPage(oldestXact);
-
- /* Check to see if there's any files that could be removed */
- if (!SlruScanDirectory(CommitTsCtl, SlruScanDirCbReportPresence,
- &cutoffPage))
- return; /* nothing to remove */
-
- /* Write XLOG record */
- WriteTruncateXlogRec(cutoffPage, oldestXact);
-
- /* Now we can remove the old CommitTs segment(s) */
- SimpleLruTruncate(CommitTsCtl, cutoffPage);
+void TruncateCommitTs(TransactionId oldestXact) {
+ int64 cutoffPage;
+
+ /*
+ * The cutoff point is the start of the segment containing oldestXact. We
+ * pass the *page* containing oldestXact to SimpleLruTruncate.
+ */
+ cutoffPage = TransactionIdToCTsPage(oldestXact);
+
+ /* Check to see if there's any files that could be removed */
+ if (!SlruScanDirectory(CommitTsCtl, SlruScanDirCbReportPresence, &cutoffPage))
+ return; /* nothing to remove */
+
+ /* Write XLOG record */
+ WriteTruncateXlogRec(cutoffPage, oldestXact);
+
+ /* Now we can remove the old CommitTs segment(s) */
+ SimpleLruTruncate(CommitTsCtl, cutoffPage);
}
/*
* Set the limit values between which commit TS can be consulted.
*/
-void
-SetCommitTsLimit(TransactionId oldestXact, TransactionId newestXact)
-{
- /*
- * Be careful not to overwrite values that are either further into the
- * "future" or signal a disabled committs.
- */
- LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
- if (TransamVariables->oldestCommitTsXid != InvalidTransactionId)
- {
- if (TransactionIdPrecedes(TransamVariables->oldestCommitTsXid, oldestXact))
- TransamVariables->oldestCommitTsXid = oldestXact;
- if (TransactionIdPrecedes(newestXact, TransamVariables->newestCommitTsXid))
- TransamVariables->newestCommitTsXid = newestXact;
- }
- else
- {
- Assert(TransamVariables->newestCommitTsXid == InvalidTransactionId);
- TransamVariables->oldestCommitTsXid = oldestXact;
- TransamVariables->newestCommitTsXid = newestXact;
- }
- LWLockRelease(CommitTsLock);
+void SetCommitTsLimit(TransactionId oldestXact, TransactionId newestXact) {
+ /*
+ * Be careful not to overwrite values that are either further into the
+ * "future" or signal a disabled committs.
+ */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ if (TransamVariables->oldestCommitTsXid != InvalidTransactionId) {
+ if (TransactionIdPrecedes(TransamVariables->oldestCommitTsXid, oldestXact))
+ TransamVariables->oldestCommitTsXid = oldestXact;
+ if (TransactionIdPrecedes(newestXact, TransamVariables->newestCommitTsXid))
+ TransamVariables->newestCommitTsXid = newestXact;
+ } else {
+ Assert(TransamVariables->newestCommitTsXid == InvalidTransactionId);
+ TransamVariables->oldestCommitTsXid = oldestXact;
+ TransamVariables->newestCommitTsXid = newestXact;
+ }
+ LWLockRelease(CommitTsLock);
}
/*
* Move forwards the oldest commitTS value that can be consulted
*/
-void
-AdvanceOldestCommitTsXid(TransactionId oldestXact)
-{
- LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
- if (TransamVariables->oldestCommitTsXid != InvalidTransactionId &&
- TransactionIdPrecedes(TransamVariables->oldestCommitTsXid, oldestXact))
- TransamVariables->oldestCommitTsXid = oldestXact;
- LWLockRelease(CommitTsLock);
+void AdvanceOldestCommitTsXid(TransactionId oldestXact) {
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ if (TransamVariables->oldestCommitTsXid != InvalidTransactionId &&
+ TransactionIdPrecedes(TransamVariables->oldestCommitTsXid, oldestXact))
+ TransamVariables->oldestCommitTsXid = oldestXact;
+ LWLockRelease(CommitTsLock);
}
-
/*
* Decide whether a commitTS page number is "older" for truncation purposes.
* Analogous to CLOGPagePrecedes().
@@ -966,101 +889,87 @@ AdvanceOldestCommitTsXid(TransactionId oldestXact)
* last entry of the oldestXact page. While page 2 is expendable at
* oldestXact=N+2.1, it would be precious at oldestXact=N+2.9.
*/
-static bool
-CommitTsPagePrecedes(int64 page1, int64 page2)
-{
- TransactionId xid1;
- TransactionId xid2;
-
- xid1 = ((TransactionId) page1) * COMMIT_TS_XACTS_PER_PAGE;
- xid1 += FirstNormalTransactionId + 1;
- xid2 = ((TransactionId) page2) * COMMIT_TS_XACTS_PER_PAGE;
- xid2 += FirstNormalTransactionId + 1;
-
- return (TransactionIdPrecedes(xid1, xid2) &&
- TransactionIdPrecedes(xid1, xid2 + COMMIT_TS_XACTS_PER_PAGE - 1));
-}
+static bool CommitTsPagePrecedes(int64 page1, int64 page2) {
+ TransactionId xid1;
+ TransactionId xid2;
+ xid1 = ((TransactionId)page1) * COMMIT_TS_XACTS_PER_PAGE;
+ xid1 += FirstNormalTransactionId + 1;
+ xid2 = ((TransactionId)page2) * COMMIT_TS_XACTS_PER_PAGE;
+ xid2 += FirstNormalTransactionId + 1;
+
+ return (TransactionIdPrecedes(xid1, xid2) &&
+ TransactionIdPrecedes(xid1, xid2 + COMMIT_TS_XACTS_PER_PAGE - 1));
+}
/*
* Write a ZEROPAGE xlog record
*/
-static void
-WriteZeroPageXlogRec(int64 pageno)
-{
- XLogBeginInsert();
- XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno) {
+ XLogBeginInsert();
+ XLogRegisterData((char *)(&pageno), sizeof(pageno));
+ return XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
}
/*
* Write a TRUNCATE xlog record
*/
-static void
-WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid)
-{
- xl_commit_ts_truncate xlrec;
+static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid) {
+ xl_commit_ts_truncate xlrec;
- xlrec.pageno = pageno;
- xlrec.oldestXid = oldestXid;
+ xlrec.pageno = pageno;
+ xlrec.oldestXid = oldestXid;
- XLogBeginInsert();
- XLogRegisterData((char *) (&xlrec), SizeOfCommitTsTruncate);
- (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_TRUNCATE);
+ XLogBeginInsert();
+ XLogRegisterData((char *)(&xlrec), SizeOfCommitTsTruncate);
+ (void)XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_TRUNCATE);
}
/*
* CommitTS resource manager's routines
*/
-void
-commit_ts_redo(XLogReaderState *record)
-{
- uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
-
- /* Backup blocks are not used in commit_ts records */
- Assert(!XLogRecHasAnyBlockRefs(record));
-
- if (info == COMMIT_TS_ZEROPAGE)
- {
- int64 pageno;
- int slotno;
- LWLock *lock;
-
- memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
-
- lock = SimpleLruGetBankLock(CommitTsCtl, pageno);
- LWLockAcquire(lock, LW_EXCLUSIVE);
-
- slotno = ZeroCommitTsPage(pageno, false);
- SimpleLruWritePage(CommitTsCtl, slotno);
- Assert(!CommitTsCtl->shared->page_dirty[slotno]);
-
- LWLockRelease(lock);
- }
- else if (info == COMMIT_TS_TRUNCATE)
- {
- xl_commit_ts_truncate *trunc = (xl_commit_ts_truncate *) XLogRecGetData(record);
-
- AdvanceOldestCommitTsXid(trunc->oldestXid);
-
- /*
- * During XLOG replay, latest_page_number isn't set up yet; insert a
- * suitable value to bypass the sanity test in SimpleLruTruncate.
- */
- pg_atomic_write_u64(&CommitTsCtl->shared->latest_page_number,
- trunc->pageno);
-
- SimpleLruTruncate(CommitTsCtl, trunc->pageno);
- }
- else
- elog(PANIC, "commit_ts_redo: unknown op code %u", info);
+void commit_ts_redo(XLogReaderState *record) {
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+ /* Backup blocks are not used in commit_ts records */
+ Assert(!XLogRecHasAnyBlockRefs(record));
+
+ if (info == COMMIT_TS_ZEROPAGE) {
+ int64 pageno;
+ int slotno;
+ LWLock *lock;
+
+ memcpy(&pageno, XLogRecGetData(record), sizeof(pageno));
+
+ lock = SimpleLruGetBankLock(CommitTsCtl, pageno);
+ LWLockAcquire(lock, LW_EXCLUSIVE);
+
+ slotno = ZeroCommitTsPage(pageno, false);
+ SimpleLruWritePage(CommitTsCtl, slotno);
+ Assert(!CommitTsCtl->shared->page_dirty[slotno]);
+
+ LWLockRelease(lock);
+ } else if (info == COMMIT_TS_TRUNCATE) {
+ xl_commit_ts_truncate *trunc =
+ (xl_commit_ts_truncate *)XLogRecGetData(record);
+
+ AdvanceOldestCommitTsXid(trunc->oldestXid);
+
+ /*
+ * During XLOG replay, latest_page_number isn't set up yet; insert a
+ * suitable value to bypass the sanity test in SimpleLruTruncate.
+ */
+ pg_atomic_write_u64(&CommitTsCtl->shared->latest_page_number,
+ trunc->pageno);
+
+ SimpleLruTruncate(CommitTsCtl, trunc->pageno);
+ } else
+ elog(PANIC, "commit_ts_redo: unknown op code %u", info);
}
/*
* Entrypoint for sync.c to sync commit_ts files.
*/
-int
-committssyncfiletag(const FileTag *ftag, char *path)
-{
- return SlruSyncFileTag(CommitTsCtl, ftag, path);
+int committssyncfiletag(const FileTag *ftag, char *path) {
+ return SlruSyncFileTag(CommitTsCtl, ftag, path);
}
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 8c37d7eba7..b11411c2e1 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -84,6 +84,7 @@
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
+#include "storage/bufpage.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "storage/procarray.h"
@@ -107,7 +108,7 @@
*/
/* We need four bytes per offset */
-#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
+#define MULTIXACT_OFFSETS_PER_PAGE (SizeOfPageContents / sizeof(MultiXactOffset))
static inline int64
MultiXactIdToOffsetPage(MultiXactId multi)
@@ -132,8 +133,8 @@ MultiXactIdToOffsetSegment(MultiXactId multi)
* additional flag bits for each TransactionId. To do this without getting
* into alignment issues, we store four bytes of flags, and then the
* corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
- * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
- * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
+ * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 408 groups
+ * per page. This wastes 8 bytes per page, but that's OK -- simplicity (and
* performance) trumps space efficiency here.
*
* Note that the "offset" macros work with byte offset, not array indexes, so
@@ -151,7 +152,7 @@ MultiXactIdToOffsetSegment(MultiXactId multi)
/* size in bytes of a complete group */
#define MULTIXACT_MEMBERGROUP_SIZE \
(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
-#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (SizeOfPageContents / MULTIXACT_MEMBERGROUP_SIZE)
#define MULTIXACT_MEMBERS_PER_PAGE \
(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
@@ -413,7 +414,7 @@ static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
MultiXactOffset start, uint32 distance);
static bool SetOffsetVacuumLimit(bool is_startup);
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
-static void WriteMZeroPageXlogRec(int64 pageno, uint8 info);
+static XLogRecPtr WriteMZeroPageXlogRec(int64 pageno, uint8 info);
static void WriteMTruncateXlogRec(Oid oldestMultiDB,
MultiXactId startTruncOff,
MultiXactId endTruncOff,
@@ -939,7 +940,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
* take the trouble to generalize the slru.c error reporting code.
*/
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
*offptr = offset;
@@ -994,12 +995,12 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
}
memberptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
*memberptr = members[i].xid;
flagsptr = (uint32 *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
flagsval = *flagsptr;
flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
@@ -1427,7 +1428,7 @@ retry:
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
@@ -1476,7 +1477,7 @@ retry:
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
}
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
nextMXOffset = *offptr;
@@ -1544,7 +1545,7 @@ retry:
}
xactptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
if (!TransactionIdIsValid(*xactptr))
{
@@ -1555,7 +1556,7 @@ retry:
flagsoff = MXOffsetToFlagsOffset(offset);
bshift = MXOffsetToFlagsBitShift(offset);
- flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ flagsptr = (uint32 *) (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
ptr[truelength].xid = *xactptr;
ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
@@ -2074,11 +2075,17 @@ static int
ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
+ page = MultiXactOffsetCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -2090,11 +2097,17 @@ static int
ZeroMultiXactMemberPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
+ page = MultiXactMemberCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -2218,10 +2231,10 @@ TrimMultiXact(void)
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
- MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
+ MemSet(offptr, 0, SizeOfPageContents - (entryno * sizeof(MultiXactOffset)));
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
LWLockRelease(lock);
@@ -2252,9 +2265,9 @@ TrimMultiXact(void)
memberoff = MXOffsetToMemberOffset(offset);
slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
xidptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
- MemSet(xidptr, 0, BLCKSZ - memberoff);
+ MemSet(xidptr, 0, SizeOfPageContents - memberoff);
/*
* Note: we don't need to zero out the flag bits in the remaining
@@ -2909,7 +2922,7 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
LWLockRelease(SimpleLruGetBankLock(MultiXactOffsetCtl, pageno));
@@ -3351,12 +3364,12 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
* Write an xlog record reflecting the zeroing of either a MEMBERs or
* OFFSETs page (info shows which)
*/
-static void
+static XLogRecPtr
WriteMZeroPageXlogRec(int64 pageno, uint8 info)
{
XLogBeginInsert();
XLogRegisterData((char *) (&pageno), sizeof(pageno));
- (void) XLogInsert(RM_MULTIXACT_ID, info);
+ return XLogInsert(RM_MULTIXACT_ID, info);
}
/*
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index f249c3cd05..7ede848299 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -68,6 +68,7 @@
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/shmem.h"
#include "utils/guc.h"
@@ -170,6 +171,7 @@ typedef enum
SLRU_WRITE_FAILED,
SLRU_FSYNC_FAILED,
SLRU_CLOSE_FAILED,
+ SLRU_DATA_CORRUPTED,
} SlruErrorCause;
static SlruErrorCause slru_errcause;
@@ -392,8 +394,8 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
shared->page_dirty[slotno] = true;
SlruRecentlyUsed(shared, slotno);
- /* Set the buffer to zeroes */
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ /* Initialize the page. */
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
/* Set the LSNs for this new page to zero */
SimpleLruZeroLSNs(ctl, slotno);
@@ -832,7 +834,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
ereport(LOG,
(errmsg("file \"%s\" doesn't exist, reading as zeroes",
path)));
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
return true;
}
@@ -855,6 +857,13 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
return false;
}
+ if (!PageIsVerifiedExtended(shared->page_buffer[slotno], pageno, PIV_REPORT_STAT))
+ {
+ slru_errcause = SLRU_DATA_CORRUPTED;
+ slru_errno = 0;
+ return false;
+ }
+
return true;
}
@@ -881,6 +890,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
off_t offset = rpageno * BLCKSZ;
char path[MAXPGPATH];
int fd = -1;
+ Page page = shared->page_buffer[slotno];
+ XLogRecPtr lsn;
/* update the stats counter of written pages */
pgstat_count_slru_page_written(shared->slru_stats_idx);
@@ -889,41 +900,21 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
* Honor the write-WAL-before-data rule, if appropriate, so that we do not
* write out data before associated WAL records. This is the same action
* performed during FlushBuffer() in the main buffer manager.
+ *
+ * The largest async-commit LSN for the page is maintained through page LSN.
*/
- if (shared->group_lsn != NULL)
+ lsn = PageGetLSN(page);
+ if (!XLogRecPtrIsInvalid(lsn))
{
/*
- * We must determine the largest async-commit LSN for the page. This
- * is a bit tedious, but since this entire function is a slow path
- * anyway, it seems better to do this here than to maintain a per-page
- * LSN variable (which'd need an extra comparison in the
- * transaction-commit path).
+ * As noted above, elog(ERROR) is not acceptable here, so if
+ * XLogFlush were to fail, we must PANIC. This isn't much of a
+ * restriction because XLogFlush is just about all critical
+ * section anyway, but let's make sure.
*/
- XLogRecPtr max_lsn;
- int lsnindex;
-
- lsnindex = slotno * shared->lsn_groups_per_page;
- max_lsn = shared->group_lsn[lsnindex++];
- for (int lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
- {
- XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
-
- if (max_lsn < this_lsn)
- max_lsn = this_lsn;
- }
-
- if (!XLogRecPtrIsInvalid(max_lsn))
- {
- /*
- * As noted above, elog(ERROR) is not acceptable here, so if
- * XLogFlush were to fail, we must PANIC. This isn't much of a
- * restriction because XLogFlush is just about all critical
- * section anyway, but let's make sure.
- */
- START_CRIT_SECTION();
- XLogFlush(max_lsn);
- END_CRIT_SECTION();
- }
+ START_CRIT_SECTION();
+ XLogFlush(lsn);
+ END_CRIT_SECTION();
}
/*
@@ -988,6 +979,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
}
}
+ PageSetChecksumInplace(shared->page_buffer[slotno], pageno);
+
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
@@ -1108,6 +1101,13 @@ SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
errdetail("Could not close file \"%s\": %m.",
path)));
break;
+ case SLRU_DATA_CORRUPTED:
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("Invalid page from file \"%s\" at offset %d.",
+ path, offset)));
+ break;
default:
/* can't get here, we trust */
elog(ERROR, "unrecognized SimpleLru error cause: %d",
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 50bb1d8cfc..869ae7a25d 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -34,6 +34,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "utils/guc_hooks.h"
+#include "storage/bufpage.h"
#include "utils/snapmgr.h"
@@ -51,7 +52,7 @@
*/
/* We need four bytes per xact */
-#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId))
+#define SUBTRANS_XACTS_PER_PAGE (SizeOfPageContents / sizeof(TransactionId))
/*
* Although we return an int64 the actual value can't currently exceed
@@ -97,7 +98,7 @@ SubTransSetParent(TransactionId xid, TransactionId parent)
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
/*
@@ -137,7 +138,7 @@ SubTransGetParent(TransactionId xid)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
parent = *ptr;
@@ -366,7 +367,6 @@ CheckPointSUBTRANS(void)
TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that SUBTRANS has room for a newly-allocated XID.
*
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 8ed503e1c1..ae24108910 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -140,6 +140,7 @@
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/procsignal.h"
@@ -160,7 +161,7 @@
* than that, so changes in that data structure won't affect user-visible
* restrictions.
*/
-#define NOTIFY_PAYLOAD_MAX_LENGTH (BLCKSZ - NAMEDATALEN - 128)
+#define NOTIFY_PAYLOAD_MAX_LENGTH (SizeOfPageContents - NAMEDATALEN - 128)
/*
* Struct representing an entry in the global notify queue
@@ -309,7 +310,7 @@ static SlruCtlData NotifyCtlData;
#define NotifyCtl (&NotifyCtlData)
#define QUEUE_PAGESIZE BLCKSZ
-
+#define QUEUE_PAGE_CAPACITY (QUEUE_PAGESIZE - MAXALIGN(SizeOfPageHeaderData))
#define QUEUE_FULL_WARN_INTERVAL 5000 /* warn at most once every 5s */
/*
@@ -1295,14 +1296,14 @@ asyncQueueAdvance(volatile QueuePosition *position, int entryLength)
* written or read.
*/
offset += entryLength;
- Assert(offset <= QUEUE_PAGESIZE);
+ Assert(offset <= QUEUE_PAGE_CAPACITY);
/*
* In a second step check if another entry can possibly be written to the
* page. If so, stay here, we have reached the next position. If not, then
* we need to move on to the next page.
*/
- if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGESIZE)
+ if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGE_CAPACITY)
{
pageno++;
offset = 0;
@@ -1405,7 +1406,7 @@ asyncQueueAddEntries(ListCell *nextNotify)
offset = QUEUE_POS_OFFSET(queue_head);
/* Check whether the entry really fits on the current page */
- if (offset + qe.length <= QUEUE_PAGESIZE)
+ if (offset + qe.length <= QUEUE_PAGE_CAPACITY)
{
/* OK, so advance nextNotify past this item */
nextNotify = lnext(pendingNotifies->events, nextNotify);
@@ -1417,14 +1418,14 @@ asyncQueueAddEntries(ListCell *nextNotify)
* only check dboid and since it won't match any reader's database
* OID, they will ignore this entry and move on.
*/
- qe.length = QUEUE_PAGESIZE - offset;
+ qe.length = QUEUE_PAGE_CAPACITY - offset;
qe.dboid = InvalidOid;
qe.data[0] = '\0'; /* empty channel */
qe.data[1] = '\0'; /* empty payload */
}
/* Now copy qe into the shared buffer page */
- memcpy(NotifyCtl->shared->page_buffer[slotno] + offset,
+ memcpy(PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + offset,
&qe,
qe.length);
@@ -1955,10 +1956,10 @@ asyncQueueReadAllNotifications(void)
else
{
/* fetch all the rest of the page */
- copysize = QUEUE_PAGESIZE - curoffset;
+ copysize = QUEUE_PAGE_CAPACITY - curoffset;
}
- memcpy(page_buffer.buf + curoffset,
- NotifyCtl->shared->page_buffer[slotno] + curoffset,
+ memcpy(PageGetContents(page_buffer.buf) + curoffset,
+ PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + curoffset,
copysize);
/* Release lock that we got from SimpleLruReadPage_ReadOnly() */
LWLockRelease(SimpleLruGetBankLock(NotifyCtl, curpage));
@@ -2029,7 +2030,7 @@ asyncQueueProcessPageEntries(volatile QueuePosition *current,
if (QUEUE_POS_EQUAL(thisentry, stop))
break;
- qe = (AsyncQueueEntry *) (page_buffer + QUEUE_POS_OFFSET(thisentry));
+ qe = (AsyncQueueEntry *) (PageGetContents(page_buffer) + QUEUE_POS_OFFSET(thisentry));
/*
* Advance *current over this message, possibly to the next page. As
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index 2030322f95..35b326e988 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -207,6 +207,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "port/pg_lfind.h"
+#include "storage/bufpage.h"
#include "storage/predicate.h"
#include "storage/predicate_internals.h"
#include "storage/proc.h"
@@ -326,8 +327,8 @@ static SlruCtlData SerialSlruCtlData;
#define SerialSlruCtl (&SerialSlruCtlData)
#define SERIAL_PAGESIZE BLCKSZ
-#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
-#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE)
+#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
+#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE - MAXALIGN(SizeOfPageHeaderData) / SERIAL_ENTRYSIZE)
/*
* Set maximum pages based on the number needed to track all transactions.
@@ -337,7 +338,7 @@ static SlruCtlData SerialSlruCtlData;
#define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1)
#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \
- (SerialSlruCtl->shared->page_buffer[slotno] + \
+ (PageGetContents(SerialSlruCtl->shared->page_buffer[slotno]) + \
((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE))))
#define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE)
@@ -789,10 +790,13 @@ SerialPagePrecedesLogicallyUnitTests(void)
* requires burning ~2B XIDs in single-user mode, a negligible
* possibility. Moreover, if it does happen, the consequence would be
* mild, namely a new transaction failing in SimpleLruReadPage().
+ *
+ * NOTE: After adding the page header, the defect affects two pages.
+ * We now assert correct treatment of its second to prior page.
*/
headPage = oldestPage;
targetPage = newestPage;
- Assert(SerialPagePrecedesLogically(headPage, targetPage - 1));
+ Assert(SerialPagePrecedesLogically(headPage, targetPage - 2));
#if 0
Assert(SerialPagePrecedesLogically(headPage, targetPage));
#endif
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index be6f1f62d2..e8193d7f56 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -59,6 +59,31 @@ PageInit(Page page, Size pageSize, Size specialSize)
/* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */
}
+/*
+ * PageInitSLRU
+ * Initializes the contents of an SLRU page.
+ * Note that we don't calculate an initial checksum here; that's not done
+ * until it's time to write.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
/*
* PageIsVerifiedExtended
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index b86bc417c9..f186798094 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -16,6 +16,7 @@
#include <dirent.h>
#include <limits.h>
+#include <stdbool.h>
#include <sys/stat.h>
#include <time.h>
#include <unistd.h>
@@ -592,12 +593,20 @@ main(int argc, char *argv[])
{
total_size = scan_directory(DataDir, "global", true);
total_size += scan_directory(DataDir, "base", true);
+ total_size += scan_directory(DataDir, "pg_commit_ts", true);
+ total_size += scan_directory(DataDir, "pg_multixact", true);
+ total_size += scan_directory(DataDir, "pg_serial", true);
total_size += scan_directory(DataDir, PG_TBLSPC_DIR, true);
+ total_size += scan_directory(DataDir, "pg_xact", true);
}
(void) scan_directory(DataDir, "global", false);
(void) scan_directory(DataDir, "base", false);
+ (void) scan_directory(DataDir, "pg_commit_ts", false);
+ (void) scan_directory(DataDir, "pg_multixact", false);
+ (void) scan_directory(DataDir, "pg_serial", false);
(void) scan_directory(DataDir, PG_TBLSPC_DIR, false);
+ (void) scan_directory(DataDir, "pg_xact", false);
if (showprogress)
progress_report(true);
diff --git a/src/bin/pg_resetwal/t/001_basic.pl b/src/bin/pg_resetwal/t/001_basic.pl
index 9829e48106..7b9e034e19 100644
--- a/src/bin/pg_resetwal/t/001_basic.pl
+++ b/src/bin/pg_resetwal/t/001_basic.pl
@@ -206,7 +206,7 @@ push @cmd,
sprintf("%d,%d", hex($files[0]) == 0 ? 3 : hex($files[0]), hex($files[-1]));
@files = get_slru_files('pg_multixact/offsets');
-$mult = 32 * $blcksz / 4;
+$mult = 32 * ($blcksz - 24) / 4;
# -m argument is "new,old"
push @cmd, '-m',
sprintf("%d,%d",
@@ -214,11 +214,11 @@ push @cmd, '-m',
hex($files[0]) == 0 ? 1 : hex($files[0] * $mult));
@files = get_slru_files('pg_multixact/members');
-$mult = 32 * int($blcksz / 20) * 4;
+$mult = 32 * int(($blcksz - 24) / 20) * 4;
push @cmd, '-O', (hex($files[-1]) + 1) * $mult;
@files = get_slru_files('pg_xact');
-$mult = 32 * $blcksz * 4;
+$mult = 32 * ($blcksz - 24) * 4;
push @cmd,
'-u', (hex($files[0]) == 0 ? 3 : hex($files[0]) * $mult),
'-x', ((hex($files[-1]) + 1) * $mult);
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index 73932504ca..ecff88ee98 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -9,6 +9,7 @@
#include "postgres_fe.h"
+#include <dirent.h>
#include <sys/stat.h>
#include <limits.h>
#include <fcntl.h>
@@ -449,3 +450,180 @@ check_hard_link(void)
unlink(new_link_file);
}
+
+
+/*
+ * Copy SLRU_PAGES_PER_SEGMENT from access/slru.h to avoid including it.
+ */
+#define SLRU_PAGES_PER_SEGMENT 32
+
+#define SEGMENT_SIZE (BLCKSZ * SLRU_PAGES_PER_SEGMENT)
+
+/*
+ * Copy PageInitSLRU from storage/bufpage.c to avoid linking to the backend.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
+
+/*
+ * Filter function for scandir(3) to select only segment files.
+ */
+static int
+segment_file_filter(const struct dirent *dirent)
+{
+ return strspn(dirent->d_name, "0123456789ABCDEF") == strlen(dirent->d_name);
+}
+
+/*
+ * Upgrade a single clog segment to add a page header on each page.
+ */
+static void
+upgrade_file(const char *src_dir, const char *src_file, const char *dst_dir)
+{
+ char src[MAXPGPATH];
+ char dst[MAXPGPATH];
+
+ int seg_name_len;
+ int src_segno;
+ int64 src_pageno;
+ int dst_segno;
+ int64 dst_pageno;
+ int dst_offset;
+
+ int src_fd;
+ int dst_fd;
+
+ char *src_buf;
+ ssize_t src_len;
+ ssize_t src_buf_offset;
+ PGAlignedBlock dst_block;
+ Page page = dst_block.data;
+ int len_to_copy;
+
+ seg_name_len = strlen(src_file);
+ src_segno = (int) strtol(src_file, NULL, 16);
+ src_pageno = src_segno * SLRU_PAGES_PER_SEGMENT;
+
+ dst_pageno = src_pageno * BLCKSZ / SizeOfPageContents;
+ dst_offset = src_pageno * BLCKSZ - dst_pageno * SizeOfPageContents;
+ dst_segno = dst_pageno / SLRU_PAGES_PER_SEGMENT;
+
+ snprintf(src, sizeof(src), "%s/%s", src_dir, src_file);
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+
+ src_buf = pg_malloc(SEGMENT_SIZE);
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) == -1)
+ pg_fatal("could not open file \"%s\": %s", src, strerror(errno));
+ if ((src_len = read(src_fd, src_buf, SEGMENT_SIZE)) == -1)
+ pg_fatal("could not read file \"%s\": %s", src, strerror(errno));
+
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+
+ /*
+ * Read the destination page at dst_pageno into the buffer. The page may contain
+ * data from the previous source segment. Initialize the page if the page is new.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+ if (read(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not read file \"%s\": %s", dst, strerror(errno));
+ if (PageIsNew(page))
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Rewind the file position, so the first write will overwrite the page.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+
+ src_buf_offset = 0;
+ while (src_buf_offset < src_len)
+ {
+ len_to_copy = Min(src_len - src_buf_offset, SizeOfPageContents - dst_offset);
+ memcpy(PageGetContents(page) + dst_offset, src_buf + src_buf_offset, len_to_copy);
+ src_buf_offset += len_to_copy;
+
+ if (new_cluster.controldata.data_checksum_version > 0)
+ ((PageHeader) page)->pd_checksum = pg_checksum_page(page, dst_pageno);
+ if (write(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not write file \"%s\": %s", dst, strerror(errno));
+
+ dst_pageno++;
+ dst_offset = 0;
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Switch segments if we reached the end of the current segment.
+ */
+ if (dst_pageno % SLRU_PAGES_PER_SEGMENT == 0)
+ {
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ dst_segno++;
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+ }
+ }
+
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ pg_free(src_buf);
+ close(src_fd);
+}
+
+/*
+ * Upgrade the clog files to add a page header to each SLRU page.
+ */
+void
+upgrade_xact_cache(const char *src_subdir, const char *dst_subdir)
+{
+ char src_dir[MAXPGPATH];
+ char dst_dir[MAXPGPATH];
+
+ DIR *src_dirp;
+ struct dirent *src_dirent;
+
+ snprintf(src_dir, sizeof(src_dir), "%s/%s", old_cluster.pgdata, src_subdir);
+ snprintf(dst_dir, sizeof(dst_dir), "%s/%s", new_cluster.pgdata, dst_subdir);
+
+ if ((src_dirp = opendir(src_dir)) == NULL)
+ pg_fatal("could not open directory \"%s\": %s", src_dir, strerror(errno));
+
+ while (errno = 0, (src_dirent = readdir(src_dirp)) != NULL)
+ {
+ if (segment_file_filter(src_dirent))
+ upgrade_file(src_dir, src_dirent->d_name, dst_dir);
+ }
+
+ if (closedir(src_dirp) != 0)
+ pg_fatal("could not close directory \"%s\": %s", src_dir, strerror(errno));
+}
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 663235816f..7d6d22bb51 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -706,14 +706,28 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir)
static void
copy_xact_xlog_xid(void)
{
+ bool slru_header_changed = false;
+
/*
* Copy old commit logs to new data dir. pg_clog has been renamed to
* pg_xact in post-10 clusters.
*/
- copy_subdir_files(GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact",
- GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact");
+ char *xact_old_dir = GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+ char *xact_new_dir = GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+
+ /*
+ * In post-17 clusters, a page header is added to each SLRU page.
+ * Perform a one-time conversion of the clog files if the old
+ * cluster and the new cluster use different SLRU formats.
+ */
+ if (new_cluster.controldata.cat_ver >= SLRU_PAGE_HEADER_CAT_VER &&
+ old_cluster.controldata.cat_ver < SLRU_PAGE_HEADER_CAT_VER)
+ slru_header_changed = true;
+
+ if (slru_header_changed)
+ upgrade_xact_cache(xact_old_dir, xact_new_dir);
+ else
+ copy_subdir_files(xact_old_dir, xact_new_dir);
prep_status("Setting oldest XID for new cluster");
exec_prog(UTILITY_LOG_FILE, NULL, true, true,
@@ -748,7 +762,8 @@ copy_xact_xlog_xid(void)
* server doesn't attempt to read multis older than the cutoff value.
*/
if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
- new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
+ !slru_header_changed)
{
copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
copy_subdir_files("pg_multixact/members", "pg_multixact/members");
@@ -768,7 +783,8 @@ copy_xact_xlog_xid(void)
new_cluster.pgdata);
check_ok();
}
- else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER ||
+ slru_header_changed)
{
/*
* Remove offsets/0000 file created by initdb that no longer matches
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 53f693c2d4..f52fb399ce 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -114,6 +114,11 @@ extern char *output_files[];
*/
#define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
+/*
+ * A page header was added to each SLRU page in 17.0.
+ */
+#define SLRU_PAGE_HEADER_CAT_VER 202403111
+
/*
* large object chunk size added to pg_controldata,
* commit 5f93c37805e7485488480916b4585e098d3cc883
@@ -414,6 +419,7 @@ void rewriteVisibilityMap(const char *fromfile, const char *tofile,
void check_file_clone(void);
void check_copy_file_range(void);
void check_hard_link(void);
+void upgrade_xact_cache(const char *src_subdir, const char *dst_subdir);
/* fopen_priv() is no longer different from fopen() */
#define fopen_priv(path, mode) fopen(path, mode)
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 6222d46e53..f6662ee226 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -204,6 +204,7 @@ typedef PageHeaderData *PageHeader;
* handling pages.
*/
#define PG_PAGE_LAYOUT_VERSION 4
+#define PG_SLRU_PAGE_LAYOUT_VERSION 1
#define PG_DATA_CHECKSUM_VERSION 1
/* ----------------------------------------------------------------
@@ -260,6 +261,11 @@ PageGetContents(Page page)
return (char *) page + MAXALIGN(SizeOfPageHeaderData);
}
+/*
+ * Space available for storing page contents.
+ */
+#define SizeOfPageContents (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
/* ----------------
* functions to access page size info
* ----------------
@@ -489,6 +495,7 @@ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
"BLCKSZ has to be a multiple of sizeof(size_t)");
extern void PageInit(Page page, Size pageSize, Size specialSize);
+extern void PageInitSLRU(Page page, Size pageSize, Size specialSize);
extern bool PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags);
extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
OffsetNumber offsetNumber, int flags);
diff --git a/src/test/modules/test_slru/test_slru.c b/src/test/modules/test_slru/test_slru.c
index d227b06703..f87d3e4993 100644
--- a/src/test/modules/test_slru/test_slru.c
+++ b/src/test/modules/test_slru/test_slru.c
@@ -17,6 +17,7 @@
#include "access/slru.h"
#include "access/transam.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
@@ -72,8 +73,8 @@ test_slru_page_write(PG_FUNCTION_ARGS)
TestSlruCtl->shared->page_status[slotno] = SLRU_PAGE_VALID;
/* write given data to the page, up to the limit of the page */
- strncpy(TestSlruCtl->shared->page_buffer[slotno], data,
- BLCKSZ - 1);
+ strncpy(PageGetContents(TestSlruCtl->shared->page_buffer[slotno]), data,
+ SizeOfPageContents - 1);
SimpleLruWritePage(TestSlruCtl, slotno);
LWLockRelease(lock);
@@ -101,7 +102,7 @@ test_slru_page_read(PG_FUNCTION_ARGS)
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(TestSlruCtl, pageno,
write_ok, InvalidTransactionId);
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(lock);
PG_RETURN_TEXT_P(cstring_to_text(data));
@@ -120,7 +121,7 @@ test_slru_page_readonly(PG_FUNCTION_ARGS)
pageno,
InvalidTransactionId);
Assert(LWLockHeldByMe(lock));
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(lock);
PG_RETURN_TEXT_P(cstring_to_text(data));
--
2.34.1
Thanks for taking interest in this proposal. There is no remaining
work for this proposal. It’s now “waiting for review”. It would be
great if you can provide a review report, so we can change the status
to “ready for commit”.
I’ve updated the patch against the latest HEAD.
Hi all,
Revisiting this patch. I’ve rebased it against the latest HEAD, and
updated the SLRU_PAGE_HEADER_CAT_VER and CATALOG_VERSION_NO. All
tests pass.
Attachments:
0001-Add-page-headers-to-SLRU-pages.patchapplication/octet-stream; name=0001-Add-page-headers-to-SLRU-pages.patchDownload
From 35e76b7255b0d621a2c55fadfa3996dccbf7ff17 Mon Sep 17 00:00:00 2001
From: Rishu Bagga <rishu@rishus-air.lan>
Date: Thu, 26 Jun 2025 19:03:10 -0700
Subject: [PATCH] Add page headers to SLRU pages
---
src/backend/access/transam/clog.c | 45 +-
src/backend/access/transam/commit_ts.c | 1139 +++++++++++-------------
src/backend/access/transam/multixact.c | 55 +-
src/backend/access/transam/slru.c | 71 +-
src/backend/access/transam/subtrans.c | 8 +-
src/backend/commands/async.c | 23 +-
src/backend/storage/lmgr/predicate.c | 12 +-
src/backend/storage/page/bufpage.c | 26 +
src/bin/pg_checksums/pg_checksums.c | 9 +
src/bin/pg_resetwal/t/001_basic.pl | 6 +-
src/bin/pg_upgrade/file.c | 178 ++++
src/bin/pg_upgrade/pg_upgrade.c | 28 +-
src/bin/pg_upgrade/pg_upgrade.h | 6 +
src/include/catalog/catversion.h | 2 +-
src/include/storage/bufpage.h | 7 +
src/test/modules/test_slru/test_slru.c | 9 +-
16 files changed, 915 insertions(+), 709 deletions(-)
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 48f10bec91e..9b4ba0861a9 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -42,6 +42,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/proc.h"
#include "storage/sync.h"
#include "utils/guc_hooks.h"
@@ -61,7 +62,7 @@
/* We need two bits per xact, so four xacts fit in a byte */
#define CLOG_BITS_PER_XACT 2
#define CLOG_XACTS_PER_BYTE 4
-#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+#define CLOG_XACTS_PER_PAGE (SizeOfPageContents * CLOG_XACTS_PER_BYTE)
#define CLOG_XACT_BITMASK ((1 << CLOG_BITS_PER_XACT) - 1)
/*
@@ -90,7 +91,13 @@ TransactionIdToPage(TransactionId xid)
/* We store the latest async LSN for each group of transactions */
#define CLOG_XACTS_PER_LSN_GROUP 32 /* keep this a power of 2 */
-#define CLOG_LSNS_PER_PAGE (CLOG_XACTS_PER_PAGE / CLOG_XACTS_PER_LSN_GROUP)
+
+/*
+ * Use BLCKSZ instead of SizeOfPageContents so that CLOG_LSNS_PER_PAGE is
+ * a power of 2. Using BLCKSZ wastes the last 4 LSN groups per page, but
+ * this is acceptable given that each page has 1,024 LSN groups.
+ */
+#define CLOG_LSNS_PER_PAGE ((BLCKSZ * CLOG_XACTS_PER_BYTE) / CLOG_XACTS_PER_LSN_GROUP)
#define GetLSNIndex(slotno, xid) ((slotno) * CLOG_LSNS_PER_PAGE + \
((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP)
@@ -112,7 +119,7 @@ static SlruCtlData XactCtlData;
static int ZeroCLOGPage(int64 pageno, bool writeXlog);
static bool CLOGPagePrecedes(int64 page1, int64 page2);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXact,
Oid oldestXactDb);
static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
@@ -665,13 +672,15 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
char *byteptr;
char byteval;
char curval;
+ Page page;
Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(xid));
Assert(LWLockHeldByMeInMode(SimpleLruGetBankLock(XactCtl,
XactCtl->shared->page_number[slotno]),
LW_EXCLUSIVE));
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ page = XactCtl->shared->page_buffer[slotno];
+ byteptr = PageGetContents(page) + byteno;
curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
/*
@@ -700,7 +709,7 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*byteptr = byteval;
/*
- * Update the group LSN if the transaction completion LSN is higher.
+ * Update the page & group LSN if the transaction completion LSN is higher.
*
* Note: lsn will be invalid when supplied during InRecovery processing,
* so we don't need to do anything special to avoid LSN updates during
@@ -709,10 +718,13 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i
*/
if (!XLogRecPtrIsInvalid(lsn))
{
- int lsnindex = GetLSNIndex(slotno, xid);
+ int lsnindex = GetLSNIndex(slotno, xid);
if (XactCtl->shared->group_lsn[lsnindex] < lsn)
XactCtl->shared->group_lsn[lsnindex] = lsn;
+
+ if (PageGetLSN(page) < lsn)
+ PageSetLSN(page, lsn);
}
}
@@ -739,13 +751,15 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
int slotno;
int lsnindex;
+ Page page;
char *byteptr;
XidStatus status;
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(XactCtl, pageno, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ page = XactCtl->shared->page_buffer[slotno];
+ byteptr = PageGetContents(page) + byteno;
status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
@@ -860,11 +874,17 @@ static int
ZeroCLOGPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(XactCtl, pageno);
+ page = XactCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -917,12 +937,12 @@ TrimCLOG(void)
char *byteptr;
slotno = SimpleLruReadPage(XactCtl, pageno, false, xid);
- byteptr = XactCtl->shared->page_buffer[slotno] + byteno;
+ byteptr = PageGetContents(XactCtl->shared->page_buffer[slotno]) + byteno;
/* Zero so-far-unused positions in the current byte */
*byteptr &= (1 << bshift) - 1;
/* Zero the rest of the page */
- MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
+ MemSet(byteptr + 1, 0, SizeOfPageContents - byteno - 1);
XactCtl->shared->page_dirty[slotno] = true;
}
@@ -946,7 +966,6 @@ CheckPointCLOG(void)
TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that CLOG has room for a newly-allocated XID.
*
@@ -1070,12 +1089,12 @@ CLOGPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData(&pageno, sizeof(pageno));
- (void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
+ return XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index 113fae1437a..4fe73e3be19 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -29,6 +29,7 @@
#include "access/xlogutils.h"
#include "funcapi.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/shmem.h"
#include "utils/fmgrprotos.h"
#include "utils/guc_hooks.h"
@@ -51,31 +52,27 @@
* the largest possible file name is more than 5 chars long; see
* SlruScanDirectory.
*/
-typedef struct CommitTimestampEntry
-{
- TimestampTz time;
- RepOriginId nodeid;
+typedef struct CommitTimestampEntry {
+ TimestampTz time;
+ RepOriginId nodeid;
} CommitTimestampEntry;
-#define SizeOfCommitTimestampEntry (offsetof(CommitTimestampEntry, nodeid) + \
- sizeof(RepOriginId))
-
-#define COMMIT_TS_XACTS_PER_PAGE \
- (BLCKSZ / SizeOfCommitTimestampEntry)
+#define SizeOfCommitTimestampEntry \
+ (offsetof(CommitTimestampEntry, nodeid) + sizeof(RepOriginId))
+#define COMMIT_TS_XACTS_PER_PAGE \
+ (SizeOfPageContents / SizeOfCommitTimestampEntry)
/*
* Although we return an int64 the actual value can't currently exceed
* 0xFFFFFFFF/COMMIT_TS_XACTS_PER_PAGE.
*/
-static inline int64
-TransactionIdToCTsPage(TransactionId xid)
-{
- return xid / (int64) COMMIT_TS_XACTS_PER_PAGE;
+static inline int64 TransactionIdToCTsPage(TransactionId xid) {
+ return xid / (int64)COMMIT_TS_XACTS_PER_PAGE;
}
-#define TransactionIdToCTsEntry(xid) \
- ((xid) % (TransactionId) COMMIT_TS_XACTS_PER_PAGE)
+#define TransactionIdToCTsEntry(xid) \
+ ((xid) % (TransactionId)COMMIT_TS_XACTS_PER_PAGE)
/*
* Link to shared-memory data structures for CommitTs control
@@ -95,30 +92,28 @@ static SlruCtlData CommitTsCtlData;
* without acquiring the lock; where this happens, a comment explains the
* rationale for it.
*/
-typedef struct CommitTimestampShared
-{
- TransactionId xidLastCommit;
- CommitTimestampEntry dataLastCommit;
- bool commitTsActive;
+typedef struct CommitTimestampShared {
+ TransactionId xidLastCommit;
+ CommitTimestampEntry dataLastCommit;
+ bool commitTsActive;
} CommitTimestampShared;
static CommitTimestampShared *commitTsShared;
-
/* GUC variable */
-bool track_commit_timestamp;
+bool track_commit_timestamp;
static void SetXidCommitTsInPage(TransactionId xid, int nsubxids,
- TransactionId *subxids, TimestampTz ts,
- RepOriginId nodeid, int64 pageno);
+ TransactionId *subxids, TimestampTz ts,
+ RepOriginId nodeid, int64 pageno);
static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
- RepOriginId nodeid, int slotno);
+ RepOriginId nodeid, int slotno);
static void error_commit_ts_disabled(void);
-static int ZeroCommitTsPage(int64 pageno, bool writeXlog);
+static int ZeroCommitTsPage(int64 pageno, bool writeXlog);
static bool CommitTsPagePrecedes(int64 page1, int64 page2);
static void ActivateCommitTs(void);
static void DeactivateCommitTs(void);
-static void WriteZeroPageXlogRec(int64 pageno);
+static XLogRecPtr WriteZeroPageXlogRec(int64 pageno);
static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid);
/*
@@ -137,107 +132,101 @@ static void WriteTruncateXlogRec(int64 pageno, TransactionId oldestXid);
* subtrans implementation changes in the future, we might want to revisit the
* decision of storing timestamp info for each subxid.
*/
-void
-TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids,
- TransactionId *subxids, TimestampTz timestamp,
- RepOriginId nodeid)
-{
- int i;
- TransactionId headxid;
- TransactionId newestXact;
-
- /*
- * No-op if the module is not active.
- *
- * An unlocked read here is fine, because in a standby (the only place
- * where the flag can change in flight) this routine is only called by the
- * recovery process, which is also the only process which can change the
- * flag.
- */
- if (!commitTsShared->commitTsActive)
- return;
-
- /*
- * Figure out the latest Xid in this batch: either the last subxid if
- * there's any, otherwise the parent xid.
- */
- if (nsubxids > 0)
- newestXact = subxids[nsubxids - 1];
- else
- newestXact = xid;
-
- /*
- * We split the xids to set the timestamp to in groups belonging to the
- * same SLRU page; the first element in each such set is its head. The
- * first group has the main XID as the head; subsequent sets use the first
- * subxid not on the previous page as head. This way, we only have to
- * lock/modify each SLRU page once.
- */
- headxid = xid;
- i = 0;
- for (;;)
- {
- int64 pageno = TransactionIdToCTsPage(headxid);
- int j;
-
- for (j = i; j < nsubxids; j++)
- {
- if (TransactionIdToCTsPage(subxids[j]) != pageno)
- break;
- }
- /* subxids[i..j] are on the same page as the head */
-
- SetXidCommitTsInPage(headxid, j - i, subxids + i, timestamp, nodeid,
- pageno);
-
- /* if we wrote out all subxids, we're done. */
- if (j >= nsubxids)
- break;
-
- /*
- * Set the new head and skip over it, as well as over the subxids we
- * just wrote.
- */
- headxid = subxids[j];
- i = j + 1;
- }
-
- /* update the cached value in shared memory */
- LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
- commitTsShared->xidLastCommit = xid;
- commitTsShared->dataLastCommit.time = timestamp;
- commitTsShared->dataLastCommit.nodeid = nodeid;
-
- /* and move forwards our endpoint, if needed */
- if (TransactionIdPrecedes(TransamVariables->newestCommitTsXid, newestXact))
- TransamVariables->newestCommitTsXid = newestXact;
- LWLockRelease(CommitTsLock);
+void TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids,
+ TransactionId *subxids,
+ TimestampTz timestamp, RepOriginId nodeid) {
+ int i;
+ TransactionId headxid;
+ TransactionId newestXact;
+
+ /*
+ * No-op if the module is not active.
+ *
+ * An unlocked read here is fine, because in a standby (the only place
+ * where the flag can change in flight) this routine is only called by the
+ * recovery process, which is also the only process which can change the
+ * flag.
+ */
+ if (!commitTsShared->commitTsActive)
+ return;
+
+ /*
+ * Figure out the latest Xid in this batch: either the last subxid if
+ * there's any, otherwise the parent xid.
+ */
+ if (nsubxids > 0)
+ newestXact = subxids[nsubxids - 1];
+ else
+ newestXact = xid;
+
+ /*
+ * We split the xids to set the timestamp to in groups belonging to the
+ * same SLRU page; the first element in each such set is its head. The
+ * first group has the main XID as the head; subsequent sets use the first
+ * subxid not on the previous page as head. This way, we only have to
+ * lock/modify each SLRU page once.
+ */
+ headxid = xid;
+ i = 0;
+ for (;;) {
+ int64 pageno = TransactionIdToCTsPage(headxid);
+ int j;
+
+ for (j = i; j < nsubxids; j++) {
+ if (TransactionIdToCTsPage(subxids[j]) != pageno)
+ break;
+ }
+ /* subxids[i..j] are on the same page as the head */
+
+ SetXidCommitTsInPage(headxid, j - i, subxids + i, timestamp, nodeid,
+ pageno);
+
+ /* if we wrote out all subxids, we're done. */
+ if (j >= nsubxids)
+ break;
+
+ /*
+ * Set the new head and skip over it, as well as over the subxids we
+ * just wrote.
+ */
+ headxid = subxids[j];
+ i = j + 1;
+ }
+
+ /* update the cached value in shared memory */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ commitTsShared->xidLastCommit = xid;
+ commitTsShared->dataLastCommit.time = timestamp;
+ commitTsShared->dataLastCommit.nodeid = nodeid;
+
+ /* and move forwards our endpoint, if needed */
+ if (TransactionIdPrecedes(TransamVariables->newestCommitTsXid, newestXact))
+ TransamVariables->newestCommitTsXid = newestXact;
+ LWLockRelease(CommitTsLock);
}
/*
* Record the commit timestamp of transaction entries in the commit log for all
* entries on a single page. Atomic only on this page.
*/
-static void
-SetXidCommitTsInPage(TransactionId xid, int nsubxids,
- TransactionId *subxids, TimestampTz ts,
- RepOriginId nodeid, int64 pageno)
-{
- LWLock *lock = SimpleLruGetBankLock(CommitTsCtl, pageno);
- int slotno;
- int i;
+static void SetXidCommitTsInPage(TransactionId xid, int nsubxids,
+ TransactionId *subxids, TimestampTz ts,
+ RepOriginId nodeid, int64 pageno) {
+ LWLock *lock = SimpleLruGetBankLock(CommitTsCtl, pageno);
+ int slotno;
+ int i;
- LWLockAcquire(lock, LW_EXCLUSIVE);
+ LWLockAcquire(lock, LW_EXCLUSIVE);
- slotno = SimpleLruReadPage(CommitTsCtl, pageno, true, xid);
+ slotno = SimpleLruReadPage(CommitTsCtl, pageno, true, xid);
- TransactionIdSetCommitTs(xid, ts, nodeid, slotno);
- for (i = 0; i < nsubxids; i++)
- TransactionIdSetCommitTs(subxids[i], ts, nodeid, slotno);
+ TransactionIdSetCommitTs(xid, ts, nodeid, slotno);
+ for (i = 0; i < nsubxids; i++)
+ TransactionIdSetCommitTs(subxids[i], ts, nodeid, slotno);
- CommitTsCtl->shared->page_dirty[slotno] = true;
+ CommitTsCtl->shared->page_dirty[slotno] = true;
- LWLockRelease(lock);
+ LWLockRelease(lock);
}
/*
@@ -245,21 +234,19 @@ SetXidCommitTsInPage(TransactionId xid, int nsubxids,
*
* Caller must hold the correct SLRU bank lock, will be held at exit
*/
-static void
-TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
- RepOriginId nodeid, int slotno)
-{
- int entryno = TransactionIdToCTsEntry(xid);
- CommitTimestampEntry entry;
+static void TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
+ RepOriginId nodeid, int slotno) {
+ int entryno = TransactionIdToCTsEntry(xid);
+ CommitTimestampEntry entry;
- Assert(TransactionIdIsNormal(xid));
+ Assert(TransactionIdIsNormal(xid));
- entry.time = ts;
- entry.nodeid = nodeid;
+ entry.time = ts;
+ entry.nodeid = nodeid;
- memcpy(CommitTsCtl->shared->page_buffer[slotno] +
- SizeOfCommitTimestampEntry * entryno,
- &entry, SizeOfCommitTimestampEntry);
+ memcpy(PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
+ SizeOfCommitTimestampEntry * entryno,
+ &entry, SizeOfCommitTimestampEntry);
}
/*
@@ -270,82 +257,79 @@ TransactionIdSetCommitTs(TransactionId xid, TimestampTz ts,
* null), and the origin node for the Xid is returned in *nodeid, if it's not
* null.
*/
-bool
-TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts,
- RepOriginId *nodeid)
-{
- int64 pageno = TransactionIdToCTsPage(xid);
- int entryno = TransactionIdToCTsEntry(xid);
- int slotno;
- CommitTimestampEntry entry;
- TransactionId oldestCommitTsXid;
- TransactionId newestCommitTsXid;
-
- if (!TransactionIdIsValid(xid))
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("cannot retrieve commit timestamp for transaction %u", xid)));
- else if (!TransactionIdIsNormal(xid))
- {
- /* frozen and bootstrap xids are always committed far in the past */
- *ts = 0;
- if (nodeid)
- *nodeid = 0;
- return false;
- }
-
- LWLockAcquire(CommitTsLock, LW_SHARED);
-
- /* Error if module not enabled */
- if (!commitTsShared->commitTsActive)
- error_commit_ts_disabled();
-
- /*
- * If we're asked for the cached value, return that. Otherwise, fall
- * through to read from SLRU.
- */
- if (commitTsShared->xidLastCommit == xid)
- {
- *ts = commitTsShared->dataLastCommit.time;
- if (nodeid)
- *nodeid = commitTsShared->dataLastCommit.nodeid;
-
- LWLockRelease(CommitTsLock);
- return *ts != 0;
- }
-
- oldestCommitTsXid = TransamVariables->oldestCommitTsXid;
- newestCommitTsXid = TransamVariables->newestCommitTsXid;
- /* neither is invalid, or both are */
- Assert(TransactionIdIsValid(oldestCommitTsXid) == TransactionIdIsValid(newestCommitTsXid));
- LWLockRelease(CommitTsLock);
-
- /*
- * Return empty if the requested value is outside our valid range.
- */
- if (!TransactionIdIsValid(oldestCommitTsXid) ||
- TransactionIdPrecedes(xid, oldestCommitTsXid) ||
- TransactionIdPrecedes(newestCommitTsXid, xid))
- {
- *ts = 0;
- if (nodeid)
- *nodeid = InvalidRepOriginId;
- return false;
- }
-
- /* lock is acquired by SimpleLruReadPage_ReadOnly */
- slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid);
- memcpy(&entry,
- CommitTsCtl->shared->page_buffer[slotno] +
- SizeOfCommitTimestampEntry * entryno,
- SizeOfCommitTimestampEntry);
-
- *ts = entry.time;
- if (nodeid)
- *nodeid = entry.nodeid;
-
- LWLockRelease(SimpleLruGetBankLock(CommitTsCtl, pageno));
- return *ts != 0;
+bool TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts,
+ RepOriginId *nodeid) {
+ int64 pageno = TransactionIdToCTsPage(xid);
+ int entryno = TransactionIdToCTsEntry(xid);
+ int slotno;
+ CommitTimestampEntry entry;
+ TransactionId oldestCommitTsXid;
+ TransactionId newestCommitTsXid;
+
+ if (!TransactionIdIsValid(xid))
+ ereport(
+ ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("cannot retrieve commit timestamp for transaction %u", xid)));
+ else if (!TransactionIdIsNormal(xid)) {
+ /* frozen and bootstrap xids are always committed far in the past */
+ *ts = 0;
+ if (nodeid)
+ *nodeid = 0;
+ return false;
+ }
+
+ LWLockAcquire(CommitTsLock, LW_SHARED);
+
+ /* Error if module not enabled */
+ if (!commitTsShared->commitTsActive)
+ error_commit_ts_disabled();
+
+ /*
+ * If we're asked for the cached value, return that. Otherwise, fall
+ * through to read from SLRU.
+ */
+ if (commitTsShared->xidLastCommit == xid) {
+ *ts = commitTsShared->dataLastCommit.time;
+ if (nodeid)
+ *nodeid = commitTsShared->dataLastCommit.nodeid;
+
+ LWLockRelease(CommitTsLock);
+ return *ts != 0;
+ }
+
+ oldestCommitTsXid = TransamVariables->oldestCommitTsXid;
+ newestCommitTsXid = TransamVariables->newestCommitTsXid;
+ /* neither is invalid, or both are */
+ Assert(TransactionIdIsValid(oldestCommitTsXid) ==
+ TransactionIdIsValid(newestCommitTsXid));
+ LWLockRelease(CommitTsLock);
+
+ /*
+ * Return empty if the requested value is outside our valid range.
+ */
+ if (!TransactionIdIsValid(oldestCommitTsXid) ||
+ TransactionIdPrecedes(xid, oldestCommitTsXid) ||
+ TransactionIdPrecedes(newestCommitTsXid, xid)) {
+ *ts = 0;
+ if (nodeid)
+ *nodeid = InvalidRepOriginId;
+ return false;
+ }
+
+ /* lock is acquired by SimpleLruReadPage_ReadOnly */
+ slotno = SimpleLruReadPage_ReadOnly(CommitTsCtl, pageno, xid);
+ memcpy(&entry,
+ PageGetContents(CommitTsCtl->shared->page_buffer[slotno]) +
+ SizeOfCommitTimestampEntry * entryno,
+ SizeOfCommitTimestampEntry);
+
+ *ts = entry.time;
+ if (nodeid)
+ *nodeid = entry.nodeid;
+
+ LWLockRelease(SimpleLruGetBankLock(CommitTsCtl, pageno));
+ return *ts != 0;
}
/*
@@ -356,59 +340,53 @@ TransactionIdGetCommitTsData(TransactionId xid, TimestampTz *ts,
* ts and nodeid are filled with the corresponding data; they can be passed
* as NULL if not wanted.
*/
-TransactionId
-GetLatestCommitTsData(TimestampTz *ts, RepOriginId *nodeid)
-{
- TransactionId xid;
+TransactionId GetLatestCommitTsData(TimestampTz *ts, RepOriginId *nodeid) {
+ TransactionId xid;
- LWLockAcquire(CommitTsLock, LW_SHARED);
+ LWLockAcquire(CommitTsLock, LW_SHARED);
- /* Error if module not enabled */
- if (!commitTsShared->commitTsActive)
- error_commit_ts_disabled();
+ /* Error if module not enabled */
+ if (!commitTsShared->commitTsActive)
+ error_commit_ts_disabled();
- xid = commitTsShared->xidLastCommit;
- if (ts)
- *ts = commitTsShared->dataLastCommit.time;
- if (nodeid)
- *nodeid = commitTsShared->dataLastCommit.nodeid;
- LWLockRelease(CommitTsLock);
+ xid = commitTsShared->xidLastCommit;
+ if (ts)
+ *ts = commitTsShared->dataLastCommit.time;
+ if (nodeid)
+ *nodeid = commitTsShared->dataLastCommit.nodeid;
+ LWLockRelease(CommitTsLock);
- return xid;
+ return xid;
}
-static void
-error_commit_ts_disabled(void)
-{
- ereport(ERROR,
- (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
- errmsg("could not get commit timestamp data"),
- RecoveryInProgress() ?
- errhint("Make sure the configuration parameter \"%s\" is set on the primary server.",
- "track_commit_timestamp") :
- errhint("Make sure the configuration parameter \"%s\" is set.",
- "track_commit_timestamp")));
+static void error_commit_ts_disabled(void) {
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not get commit timestamp data"),
+ RecoveryInProgress()
+ ? errhint("Make sure the configuration parameter \"%s\" is set "
+ "on the primary server.",
+ "track_commit_timestamp")
+ : errhint("Make sure the configuration parameter \"%s\" is set.",
+ "track_commit_timestamp")));
}
/*
* SQL-callable wrapper to obtain commit time of a transaction
*/
-Datum
-pg_xact_commit_timestamp(PG_FUNCTION_ARGS)
-{
- TransactionId xid = PG_GETARG_TRANSACTIONID(0);
- TimestampTz ts;
- bool found;
+Datum pg_xact_commit_timestamp(PG_FUNCTION_ARGS) {
+ TransactionId xid = PG_GETARG_TRANSACTIONID(0);
+ TimestampTz ts;
+ bool found;
- found = TransactionIdGetCommitTsData(xid, &ts, NULL);
+ found = TransactionIdGetCommitTsData(xid, &ts, NULL);
- if (!found)
- PG_RETURN_NULL();
+ if (!found)
+ PG_RETURN_NULL();
- PG_RETURN_TIMESTAMPTZ(ts);
+ PG_RETURN_TIMESTAMPTZ(ts);
}
-
/*
* pg_last_committed_xact
*
@@ -416,42 +394,37 @@ pg_xact_commit_timestamp(PG_FUNCTION_ARGS)
* committed transaction: transaction ID, timestamp and replication
* origin.
*/
-Datum
-pg_last_committed_xact(PG_FUNCTION_ARGS)
-{
- TransactionId xid;
- RepOriginId nodeid;
- TimestampTz ts;
- Datum values[3];
- bool nulls[3];
- TupleDesc tupdesc;
- HeapTuple htup;
+Datum pg_last_committed_xact(PG_FUNCTION_ARGS) {
+ TransactionId xid;
+ RepOriginId nodeid;
+ TimestampTz ts;
+ Datum values[3];
+ bool nulls[3];
+ TupleDesc tupdesc;
+ HeapTuple htup;
- /* and construct a tuple with our data */
- xid = GetLatestCommitTsData(&ts, &nodeid);
+ /* and construct a tuple with our data */
+ xid = GetLatestCommitTsData(&ts, &nodeid);
- if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
- elog(ERROR, "return type must be a row type");
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
- if (!TransactionIdIsNormal(xid))
- {
- memset(nulls, true, sizeof(nulls));
- }
- else
- {
- values[0] = TransactionIdGetDatum(xid);
- nulls[0] = false;
+ if (!TransactionIdIsNormal(xid)) {
+ memset(nulls, true, sizeof(nulls));
+ } else {
+ values[0] = TransactionIdGetDatum(xid);
+ nulls[0] = false;
- values[1] = TimestampTzGetDatum(ts);
- nulls[1] = false;
+ values[1] = TimestampTzGetDatum(ts);
+ nulls[1] = false;
- values[2] = ObjectIdGetDatum((Oid) nodeid);
- nulls[2] = false;
- }
+ values[2] = ObjectIdGetDatum((Oid)nodeid);
+ nulls[2] = false;
+ }
- htup = heap_form_tuple(tupdesc, values, nulls);
+ htup = heap_form_tuple(tupdesc, values, nulls);
- PG_RETURN_DATUM(HeapTupleGetDatum(htup));
+ PG_RETURN_DATUM(HeapTupleGetDatum(htup));
}
/*
@@ -460,39 +433,34 @@ pg_last_committed_xact(PG_FUNCTION_ARGS)
* SQL-callable wrapper to obtain commit timestamp and replication origin
* of a given transaction.
*/
-Datum
-pg_xact_commit_timestamp_origin(PG_FUNCTION_ARGS)
-{
- TransactionId xid = PG_GETARG_TRANSACTIONID(0);
- RepOriginId nodeid;
- TimestampTz ts;
- Datum values[2];
- bool nulls[2];
- TupleDesc tupdesc;
- HeapTuple htup;
- bool found;
+Datum pg_xact_commit_timestamp_origin(PG_FUNCTION_ARGS) {
+ TransactionId xid = PG_GETARG_TRANSACTIONID(0);
+ RepOriginId nodeid;
+ TimestampTz ts;
+ Datum values[2];
+ bool nulls[2];
+ TupleDesc tupdesc;
+ HeapTuple htup;
+ bool found;
- found = TransactionIdGetCommitTsData(xid, &ts, &nodeid);
+ found = TransactionIdGetCommitTsData(xid, &ts, &nodeid);
- if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
- elog(ERROR, "return type must be a row type");
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
- if (!found)
- {
- memset(nulls, true, sizeof(nulls));
- }
- else
- {
- values[0] = TimestampTzGetDatum(ts);
- nulls[0] = false;
+ if (!found) {
+ memset(nulls, true, sizeof(nulls));
+ } else {
+ values[0] = TimestampTzGetDatum(ts);
+ nulls[0] = false;
- values[1] = ObjectIdGetDatum((Oid) nodeid);
- nulls[1] = false;
- }
+ values[1] = ObjectIdGetDatum((Oid)nodeid);
+ nulls[1] = false;
+ }
- htup = heap_form_tuple(tupdesc, values, nulls);
+ htup = heap_form_tuple(tupdesc, values, nulls);
- PG_RETURN_DATUM(HeapTupleGetDatum(htup));
+ PG_RETURN_DATUM(HeapTupleGetDatum(htup));
}
/*
@@ -502,88 +470,74 @@ pg_xact_commit_timestamp_origin(PG_FUNCTION_ARGS)
* Otherwise just cap the configured amount to be between 16 and the maximum
* allowed.
*/
-static int
-CommitTsShmemBuffers(void)
-{
- /* auto-tune based on shared buffers */
- if (commit_timestamp_buffers == 0)
- return SimpleLruAutotuneBuffers(512, 1024);
+static int CommitTsShmemBuffers(void) {
+ /* auto-tune based on shared buffers */
+ if (commit_timestamp_buffers == 0)
+ return SimpleLruAutotuneBuffers(512, 1024);
- return Min(Max(16, commit_timestamp_buffers), SLRU_MAX_ALLOWED_BUFFERS);
+ return Min(Max(16, commit_timestamp_buffers), SLRU_MAX_ALLOWED_BUFFERS);
}
/*
* Shared memory sizing for CommitTs
*/
-Size
-CommitTsShmemSize(void)
-{
- return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) +
- sizeof(CommitTimestampShared);
+Size CommitTsShmemSize(void) {
+ return SimpleLruShmemSize(CommitTsShmemBuffers(), 0) +
+ sizeof(CommitTimestampShared);
}
/*
* Initialize CommitTs at system startup (postmaster start or standalone
* backend)
*/
-void
-CommitTsShmemInit(void)
-{
- bool found;
-
- /* If auto-tuning is requested, now is the time to do it */
- if (commit_timestamp_buffers == 0)
- {
- char buf[32];
-
- snprintf(buf, sizeof(buf), "%d", CommitTsShmemBuffers());
- SetConfigOption("commit_timestamp_buffers", buf, PGC_POSTMASTER,
- PGC_S_DYNAMIC_DEFAULT);
-
- /*
- * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
- * However, if the DBA explicitly set commit_timestamp_buffers = 0 in
- * the config file, then PGC_S_DYNAMIC_DEFAULT will fail to override
- * that and we must force the matter with PGC_S_OVERRIDE.
- */
- if (commit_timestamp_buffers == 0) /* failed to apply it? */
- SetConfigOption("commit_timestamp_buffers", buf, PGC_POSTMASTER,
- PGC_S_OVERRIDE);
- }
- Assert(commit_timestamp_buffers != 0);
-
- CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
- SimpleLruInit(CommitTsCtl, "commit_timestamp", CommitTsShmemBuffers(), 0,
- "pg_commit_ts", LWTRANCHE_COMMITTS_BUFFER,
- LWTRANCHE_COMMITTS_SLRU,
- SYNC_HANDLER_COMMIT_TS,
- false);
- SlruPagePrecedesUnitTests(CommitTsCtl, COMMIT_TS_XACTS_PER_PAGE);
-
- commitTsShared = ShmemInitStruct("CommitTs shared",
- sizeof(CommitTimestampShared),
- &found);
-
- if (!IsUnderPostmaster)
- {
- Assert(!found);
-
- commitTsShared->xidLastCommit = InvalidTransactionId;
- TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time);
- commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId;
- commitTsShared->commitTsActive = false;
- }
- else
- Assert(found);
+void CommitTsShmemInit(void) {
+ bool found;
+
+ /* If auto-tuning is requested, now is the time to do it */
+ if (commit_timestamp_buffers == 0) {
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "%d", CommitTsShmemBuffers());
+ SetConfigOption("commit_timestamp_buffers", buf, PGC_POSTMASTER,
+ PGC_S_DYNAMIC_DEFAULT);
+
+ /*
+ * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
+ * However, if the DBA explicitly set commit_timestamp_buffers = 0 in
+ * the config file, then PGC_S_DYNAMIC_DEFAULT will fail to override
+ * that and we must force the matter with PGC_S_OVERRIDE.
+ */
+ if (commit_timestamp_buffers == 0) /* failed to apply it? */
+ SetConfigOption("commit_timestamp_buffers", buf, PGC_POSTMASTER,
+ PGC_S_OVERRIDE);
+ }
+ Assert(commit_timestamp_buffers != 0);
+
+ CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
+ SimpleLruInit(CommitTsCtl, "commit_timestamp", CommitTsShmemBuffers(), 0,
+ "pg_commit_ts", LWTRANCHE_COMMITTS_BUFFER,
+ LWTRANCHE_COMMITTS_SLRU, SYNC_HANDLER_COMMIT_TS, false);
+ SlruPagePrecedesUnitTests(CommitTsCtl, COMMIT_TS_XACTS_PER_PAGE);
+
+ commitTsShared =
+ ShmemInitStruct("CommitTs shared", sizeof(CommitTimestampShared), &found);
+
+ if (!IsUnderPostmaster) {
+ Assert(!found);
+
+ commitTsShared->xidLastCommit = InvalidTransactionId;
+ TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time);
+ commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId;
+ commitTsShared->commitTsActive = false;
+ } else
+ Assert(found);
}
/*
* GUC check_hook for commit_timestamp_buffers
*/
-bool
-check_commit_ts_buffers(int *newval, void **extra, GucSource source)
-{
- return check_slru_buffers("commit_timestamp_buffers", newval);
+bool check_commit_ts_buffers(int *newval, void **extra, GucSource source) {
+ return check_slru_buffers("commit_timestamp_buffers", newval);
}
/*
@@ -592,14 +546,12 @@ check_commit_ts_buffers(int *newval, void **extra, GucSource source)
* (The CommitTs directory is assumed to have been created by initdb, and
* CommitTsShmemInit must have been called already.)
*/
-void
-BootStrapCommitTs(void)
-{
- /*
- * Nothing to do here at present, unlike most other SLRU modules; segments
- * are created when the server is started with this module enabled. See
- * ActivateCommitTs.
- */
+void BootStrapCommitTs(void) {
+ /*
+ * Nothing to do here at present, unlike most other SLRU modules; segments
+ * are created when the server is started with this module enabled. See
+ * ActivateCommitTs.
+ */
}
/*
@@ -611,84 +563,77 @@ BootStrapCommitTs(void)
*
* Control lock must be held at entry, and will be held at exit.
*/
-static int
-ZeroCommitTsPage(int64 pageno, bool writeXlog)
-{
- int slotno;
+static int ZeroCommitTsPage(int64 pageno, bool writeXlog) {
+ int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
- slotno = SimpleLruZeroPage(CommitTsCtl, pageno);
+ slotno = SimpleLruZeroPage(CommitTsCtl, pageno);
+ page = CommitTsCtl->shared->page_buffer[slotno];
- if (writeXlog)
- WriteZeroPageXlogRec(pageno);
+ if (writeXlog) {
+ lsn = WriteZeroPageXlogRec(pageno);
+ PageSetLSN(page, lsn);
+ }
- return slotno;
+ return slotno;
}
/*
* This must be called ONCE during postmaster or standalone-backend startup,
* after StartupXLOG has initialized TransamVariables->nextXid.
*/
-void
-StartupCommitTs(void)
-{
- ActivateCommitTs();
-}
+void StartupCommitTs(void) { ActivateCommitTs(); }
/*
* This must be called ONCE during postmaster or standalone-backend startup,
* after recovery has finished.
*/
-void
-CompleteCommitTsInitialization(void)
-{
- /*
- * If the feature is not enabled, turn it off for good. This also removes
- * any leftover data.
- *
- * Conversely, we activate the module if the feature is enabled. This is
- * necessary for primary and standby as the activation depends on the
- * control file contents at the beginning of recovery or when a
- * XLOG_PARAMETER_CHANGE is replayed.
- */
- if (!track_commit_timestamp)
- DeactivateCommitTs();
- else
- ActivateCommitTs();
+void CompleteCommitTsInitialization(void) {
+ /*
+ * If the feature is not enabled, turn it off for good. This also removes
+ * any leftover data.
+ *
+ * Conversely, we activate the module if the feature is enabled. This is
+ * necessary for primary and standby as the activation depends on the
+ * control file contents at the beginning of recovery or when a
+ * XLOG_PARAMETER_CHANGE is replayed.
+ */
+ if (!track_commit_timestamp)
+ DeactivateCommitTs();
+ else
+ ActivateCommitTs();
}
/*
* Activate or deactivate CommitTs' upon reception of a XLOG_PARAMETER_CHANGE
* XLog record during recovery.
*/
-void
-CommitTsParameterChange(bool newvalue, bool oldvalue)
-{
- /*
- * If the commit_ts module is disabled in this server and we get word from
- * the primary server that it is enabled there, activate it so that we can
- * replay future WAL records involving it; also mark it as active on
- * pg_control. If the old value was already set, we already did this, so
- * don't do anything.
- *
- * If the module is disabled in the primary, disable it here too, unless
- * the module is enabled locally.
- *
- * Note this only runs in the recovery process, so an unlocked read is
- * fine.
- */
- if (newvalue)
- {
- if (!commitTsShared->commitTsActive)
- ActivateCommitTs();
- }
- else if (commitTsShared->commitTsActive)
- DeactivateCommitTs();
+void CommitTsParameterChange(bool newvalue, bool oldvalue) {
+ /*
+ * If the commit_ts module is disabled in this server and we get word from
+ * the primary server that it is enabled there, activate it so that we can
+ * replay future WAL records involving it; also mark it as active on
+ * pg_control. If the old value was already set, we already did this, so
+ * don't do anything.
+ *
+ * If the module is disabled in the primary, disable it here too, unless
+ * the module is enabled locally.
+ *
+ * Note this only runs in the recovery process, so an unlocked read is
+ * fine.
+ */
+ if (newvalue) {
+ if (!commitTsShared->commitTsActive)
+ ActivateCommitTs();
+ } else if (commitTsShared->commitTsActive)
+ DeactivateCommitTs();
}
/*
* Activate this module whenever necessary.
- * This must happen during postmaster or standalone-backend startup,
- * or during WAL replay anytime the track_commit_timestamp setting is
+ * This must happen during postmaster or standalone-backend
+ *startup, or during WAL replay anytime the track_commit_timestamp setting is
* changed in the primary.
*
* The reason why this SLRU needs separate activation/deactivation functions is
@@ -701,67 +646,62 @@ CommitTsParameterChange(bool newvalue, bool oldvalue)
* running with this module disabled for a while and thus might have skipped
* the normal creation point.
*/
-static void
-ActivateCommitTs(void)
-{
- TransactionId xid;
- int64 pageno;
-
- /* If we've done this already, there's nothing to do */
- LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
- if (commitTsShared->commitTsActive)
- {
- LWLockRelease(CommitTsLock);
- return;
- }
- LWLockRelease(CommitTsLock);
-
- xid = XidFromFullTransactionId(TransamVariables->nextXid);
- pageno = TransactionIdToCTsPage(xid);
-
- /*
- * Re-Initialize our idea of the latest page number.
- */
- pg_atomic_write_u64(&CommitTsCtl->shared->latest_page_number, pageno);
-
- /*
- * If CommitTs is enabled, but it wasn't in the previous server run, we
- * need to set the oldest and newest values to the next Xid; that way, we
- * will not try to read data that might not have been set.
- *
- * XXX does this have a problem if a server is started with commitTs
- * enabled, then started with commitTs disabled, then restarted with it
- * enabled again? It doesn't look like it does, because there should be a
- * checkpoint that sets the value to InvalidTransactionId at end of
- * recovery; and so any chance of injecting new transactions without
- * CommitTs values would occur after the oldestCommitTsXid has been set to
- * Invalid temporarily.
- */
- LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
- if (TransamVariables->oldestCommitTsXid == InvalidTransactionId)
- {
- TransamVariables->oldestCommitTsXid =
- TransamVariables->newestCommitTsXid = ReadNextTransactionId();
- }
- LWLockRelease(CommitTsLock);
-
- /* Create the current segment file, if necessary */
- if (!SimpleLruDoesPhysicalPageExist(CommitTsCtl, pageno))
- {
- LWLock *lock = SimpleLruGetBankLock(CommitTsCtl, pageno);
- int slotno;
-
- LWLockAcquire(lock, LW_EXCLUSIVE);
- slotno = ZeroCommitTsPage(pageno, false);
- SimpleLruWritePage(CommitTsCtl, slotno);
- Assert(!CommitTsCtl->shared->page_dirty[slotno]);
- LWLockRelease(lock);
- }
-
- /* Change the activation status in shared memory. */
- LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
- commitTsShared->commitTsActive = true;
- LWLockRelease(CommitTsLock);
+static void ActivateCommitTs(void) {
+ TransactionId xid;
+ int64 pageno;
+
+ /* If we've done this already, there's nothing to do */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ if (commitTsShared->commitTsActive) {
+ LWLockRelease(CommitTsLock);
+ return;
+ }
+ LWLockRelease(CommitTsLock);
+
+ xid = XidFromFullTransactionId(TransamVariables->nextXid);
+ pageno = TransactionIdToCTsPage(xid);
+
+ /*
+ * Re-Initialize our idea of the latest page number.
+ */
+ pg_atomic_write_u64(&CommitTsCtl->shared->latest_page_number, pageno);
+
+ /*
+ * If CommitTs is enabled, but it wasn't in the previous server run, we
+ * need to set the oldest and newest values to the next Xid; that way, we
+ * will not try to read data that might not have been set.
+ *
+ * XXX does this have a problem if a server is started with commitTs
+ * enabled, then started with commitTs disabled, then restarted with it
+ * enabled again? It doesn't look like it does, because there should be a
+ * checkpoint that sets the value to InvalidTransactionId at end of
+ * recovery; and so any chance of injecting new transactions without
+ * CommitTs values would occur after the oldestCommitTsXid has been set to
+ * Invalid temporarily.
+ */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ if (TransamVariables->oldestCommitTsXid == InvalidTransactionId) {
+ TransamVariables->oldestCommitTsXid = TransamVariables->newestCommitTsXid =
+ ReadNextTransactionId();
+ }
+ LWLockRelease(CommitTsLock);
+
+ /* Create the current segment file, if necessary */
+ if (!SimpleLruDoesPhysicalPageExist(CommitTsCtl, pageno)) {
+ LWLock *lock = SimpleLruGetBankLock(CommitTsCtl, pageno);
+ int slotno;
+
+ LWLockAcquire(lock, LW_EXCLUSIVE);
+ slotno = ZeroCommitTsPage(pageno, false);
+ SimpleLruWritePage(CommitTsCtl, slotno);
+ Assert(!CommitTsCtl->shared->page_dirty[slotno]);
+ LWLockRelease(lock);
+ }
+
+ /* Change the activation status in shared memory. */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ commitTsShared->commitTsActive = true;
+ LWLockRelease(CommitTsLock);
}
/*
@@ -774,57 +714,53 @@ ActivateCommitTs(void)
* Resets CommitTs into invalid state to make sure we don't hand back
* possibly-invalid data; also removes segments of old data.
*/
-static void
-DeactivateCommitTs(void)
-{
- /*
- * Cleanup the status in the shared memory.
- *
- * We reset everything in the commitTsShared record to prevent user from
- * getting confusing data about last committed transaction on the standby
- * when the module was activated repeatedly on the primary.
- */
- LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
-
- commitTsShared->commitTsActive = false;
- commitTsShared->xidLastCommit = InvalidTransactionId;
- TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time);
- commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId;
-
- TransamVariables->oldestCommitTsXid = InvalidTransactionId;
- TransamVariables->newestCommitTsXid = InvalidTransactionId;
-
- /*
- * Remove *all* files. This is necessary so that there are no leftover
- * files; in the case where this feature is later enabled after running
- * with it disabled for some time there may be a gap in the file sequence.
- * (We can probably tolerate out-of-sequence files, as they are going to
- * be overwritten anyway when we wrap around, but it seems better to be
- * tidy.)
- *
- * Note that we do this with CommitTsLock acquired in exclusive mode. This
- * is very heavy-handed, but since this routine can only be called in the
- * replica and should happen very rarely, we don't worry too much about
- * it. Note also that no process should be consulting this SLRU if we
- * have just deactivated it.
- */
- (void) SlruScanDirectory(CommitTsCtl, SlruScanDirCbDeleteAll, NULL);
-
- LWLockRelease(CommitTsLock);
+static void DeactivateCommitTs(void) {
+ /*
+ * Cleanup the status in the shared memory.
+ *
+ * We reset everything in the commitTsShared record to prevent user from
+ * getting confusing data about last committed transaction on the standby
+ * when the module was activated repeatedly on the primary.
+ */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+
+ commitTsShared->commitTsActive = false;
+ commitTsShared->xidLastCommit = InvalidTransactionId;
+ TIMESTAMP_NOBEGIN(commitTsShared->dataLastCommit.time);
+ commitTsShared->dataLastCommit.nodeid = InvalidRepOriginId;
+
+ TransamVariables->oldestCommitTsXid = InvalidTransactionId;
+ TransamVariables->newestCommitTsXid = InvalidTransactionId;
+
+ /*
+ * Remove *all* files. This is necessary so that there are no leftover
+ * files; in the case where this feature is later enabled after running
+ * with it disabled for some time there may be a gap in the file sequence.
+ * (We can probably tolerate out-of-sequence files, as they are going to
+ * be overwritten anyway when we wrap around, but it seems better to be
+ * tidy.)
+ *
+ * Note that we do this with CommitTsLock acquired in exclusive mode. This
+ * is very heavy-handed, but since this routine can only be called in the
+ * replica and should happen very rarely, we don't worry too much about
+ * it. Note also that no process should be consulting this SLRU if we
+ * have just deactivated it.
+ */
+ (void)SlruScanDirectory(CommitTsCtl, SlruScanDirCbDeleteAll, NULL);
+
+ LWLockRelease(CommitTsLock);
}
/*
* Perform a checkpoint --- either during shutdown, or on-the-fly
*/
-void
-CheckPointCommitTs(void)
-{
- /*
- * Write dirty CommitTs pages to disk. This may result in sync requests
- * queued for later handling by ProcessSyncRequests(), as part of the
- * checkpoint.
- */
- SimpleLruWriteAll(CommitTsCtl, true);
+void CheckPointCommitTs(void) {
+ /*
+ * Write dirty CommitTs pages to disk. This may result in sync requests
+ * queued for later handling by ProcessSyncRequests(), as part of the
+ * checkpoint.
+ */
+ SimpleLruWriteAll(CommitTsCtl, true);
}
/*
@@ -838,39 +774,37 @@ CheckPointCommitTs(void)
* NB: the current implementation relies on track_commit_timestamp being
* PGC_POSTMASTER.
*/
-void
-ExtendCommitTs(TransactionId newestXact)
-{
- int64 pageno;
- LWLock *lock;
+void ExtendCommitTs(TransactionId newestXact) {
+ int64 pageno;
+ LWLock *lock;
- /*
- * Nothing to do if module not enabled. Note we do an unlocked read of
- * the flag here, which is okay because this routine is only called from
- * GetNewTransactionId, which is never called in a standby.
- */
- Assert(!InRecovery);
- if (!commitTsShared->commitTsActive)
- return;
+ /*
+ * Nothing to do if module not enabled. Note we do an unlocked read of
+ * the flag here, which is okay because this routine is only called from
+ * GetNewTransactionId, which is never called in a standby.
+ */
+ Assert(!InRecovery);
+ if (!commitTsShared->commitTsActive)
+ return;
- /*
- * No work except at first XID of a page. But beware: just after
- * wraparound, the first XID of page zero is FirstNormalTransactionId.
- */
- if (TransactionIdToCTsEntry(newestXact) != 0 &&
- !TransactionIdEquals(newestXact, FirstNormalTransactionId))
- return;
+ /*
+ * No work except at first XID of a page. But beware: just after
+ * wraparound, the first XID of page zero is FirstNormalTransactionId.
+ */
+ if (TransactionIdToCTsEntry(newestXact) != 0 &&
+ !TransactionIdEquals(newestXact, FirstNormalTransactionId))
+ return;
- pageno = TransactionIdToCTsPage(newestXact);
+ pageno = TransactionIdToCTsPage(newestXact);
- lock = SimpleLruGetBankLock(CommitTsCtl, pageno);
+ lock = SimpleLruGetBankLock(CommitTsCtl, pageno);
- LWLockAcquire(lock, LW_EXCLUSIVE);
+ LWLockAcquire(lock, LW_EXCLUSIVE);
- /* Zero the page and make an XLOG entry about it */
- ZeroCommitTsPage(pageno, !InRecovery);
+ /* Zero the page and make an XLOG entry about it */
+ ZeroCommitTsPage(pageno, !InRecovery);
- LWLockRelease(lock);
+ LWLockRelease(lock);
}
/*
@@ -879,70 +813,59 @@ ExtendCommitTs(TransactionId newestXact)
*
* Note that we don't need to flush XLOG here.
*/
-void
-TruncateCommitTs(TransactionId oldestXact)
-{
- int64 cutoffPage;
+void TruncateCommitTs(TransactionId oldestXact) {
+ int64 cutoffPage;
- /*
- * The cutoff point is the start of the segment containing oldestXact. We
- * pass the *page* containing oldestXact to SimpleLruTruncate.
- */
- cutoffPage = TransactionIdToCTsPage(oldestXact);
+ /*
+ * The cutoff point is the start of the segment containing oldestXact. We
+ * pass the *page* containing oldestXact to SimpleLruTruncate.
+ */
+ cutoffPage = TransactionIdToCTsPage(oldestXact);
- /* Check to see if there's any files that could be removed */
- if (!SlruScanDirectory(CommitTsCtl, SlruScanDirCbReportPresence,
- &cutoffPage))
- return; /* nothing to remove */
+ /* Check to see if there's any files that could be removed */
+ if (!SlruScanDirectory(CommitTsCtl, SlruScanDirCbReportPresence, &cutoffPage))
+ return; /* nothing to remove */
- /* Write XLOG record */
- WriteTruncateXlogRec(cutoffPage, oldestXact);
+ /* Write XLOG record */
+ WriteTruncateXlogRec(cutoffPage, oldestXact);
- /* Now we can remove the old CommitTs segment(s) */
- SimpleLruTruncate(CommitTsCtl, cutoffPage);
+ /* Now we can remove the old CommitTs segment(s) */
+ SimpleLruTruncate(CommitTsCtl, cutoffPage);
}
/*
* Set the limit values between which commit TS can be consulted.
*/
-void
-SetCommitTsLimit(TransactionId oldestXact, TransactionId newestXact)
-{
- /*
- * Be careful not to overwrite values that are either further into the
- * "future" or signal a disabled committs.
- */
- LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
- if (TransamVariables->oldestCommitTsXid != InvalidTransactionId)
- {
- if (TransactionIdPrecedes(TransamVariables->oldestCommitTsXid, oldestXact))
- TransamVariables->oldestCommitTsXid = oldestXact;
- if (TransactionIdPrecedes(newestXact, TransamVariables->newestCommitTsXid))
- TransamVariables->newestCommitTsXid = newestXact;
- }
- else
- {
- Assert(TransamVariables->newestCommitTsXid == InvalidTransactionId);
- TransamVariables->oldestCommitTsXid = oldestXact;
- TransamVariables->newestCommitTsXid = newestXact;
- }
- LWLockRelease(CommitTsLock);
+void SetCommitTsLimit(TransactionId oldestXact, TransactionId newestXact) {
+ /*
+ * Be careful not to overwrite values that are either further into the
+ * "future" or signal a disabled committs.
+ */
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ if (TransamVariables->oldestCommitTsXid != InvalidTransactionId) {
+ if (TransactionIdPrecedes(TransamVariables->oldestCommitTsXid, oldestXact))
+ TransamVariables->oldestCommitTsXid = oldestXact;
+ if (TransactionIdPrecedes(newestXact, TransamVariables->newestCommitTsXid))
+ TransamVariables->newestCommitTsXid = newestXact;
+ } else {
+ Assert(TransamVariables->newestCommitTsXid == InvalidTransactionId);
+ TransamVariables->oldestCommitTsXid = oldestXact;
+ TransamVariables->newestCommitTsXid = newestXact;
+ }
+ LWLockRelease(CommitTsLock);
}
/*
* Move forwards the oldest commitTS value that can be consulted
*/
-void
-AdvanceOldestCommitTsXid(TransactionId oldestXact)
-{
- LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
- if (TransamVariables->oldestCommitTsXid != InvalidTransactionId &&
- TransactionIdPrecedes(TransamVariables->oldestCommitTsXid, oldestXact))
- TransamVariables->oldestCommitTsXid = oldestXact;
- LWLockRelease(CommitTsLock);
+void AdvanceOldestCommitTsXid(TransactionId oldestXact) {
+ LWLockAcquire(CommitTsLock, LW_EXCLUSIVE);
+ if (TransamVariables->oldestCommitTsXid != InvalidTransactionId &&
+ TransactionIdPrecedes(TransamVariables->oldestCommitTsXid, oldestXact))
+ TransamVariables->oldestCommitTsXid = oldestXact;
+ LWLockRelease(CommitTsLock);
}
-
/*
* Decide whether a commitTS page number is "older" for truncation purposes.
* Analogous to CLOGPagePrecedes().
@@ -985,12 +908,12 @@ CommitTsPagePrecedes(int64 page1, int64 page2)
/*
* Write a ZEROPAGE xlog record
*/
-static void
+static XLogRecPtr
WriteZeroPageXlogRec(int64 pageno)
{
XLogBeginInsert();
XLogRegisterData(&pageno, sizeof(pageno));
- (void) XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
+ return XLogInsert(RM_COMMIT_TS_ID, COMMIT_TS_ZEROPAGE);
}
/*
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 3c06ac45532..126445f3178 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -84,6 +84,7 @@
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
+#include "storage/bufpage.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "storage/procarray.h"
@@ -107,7 +108,7 @@
*/
/* We need four bytes per offset */
-#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
+#define MULTIXACT_OFFSETS_PER_PAGE (SizeOfPageContents / sizeof(MultiXactOffset))
static inline int64
MultiXactIdToOffsetPage(MultiXactId multi)
@@ -132,8 +133,8 @@ MultiXactIdToOffsetSegment(MultiXactId multi)
* additional flag bits for each TransactionId. To do this without getting
* into alignment issues, we store four bytes of flags, and then the
* corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
- * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
- * per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
+ * are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 408 groups
+ * per page. This wastes 8 bytes per page, but that's OK -- simplicity (and
* performance) trumps space efficiency here.
*
* Note that the "offset" macros work with byte offset, not array indexes, so
@@ -151,7 +152,7 @@ MultiXactIdToOffsetSegment(MultiXactId multi)
/* size in bytes of a complete group */
#define MULTIXACT_MEMBERGROUP_SIZE \
(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
-#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (SizeOfPageContents / MULTIXACT_MEMBERGROUP_SIZE)
#define MULTIXACT_MEMBERS_PER_PAGE \
(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
@@ -413,7 +414,7 @@ static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary,
MultiXactOffset start, uint32 distance);
static bool SetOffsetVacuumLimit(bool is_startup);
static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
-static void WriteMZeroPageXlogRec(int64 pageno, uint8 info);
+static XLogRecPtr WriteMZeroPageXlogRec(int64 pageno, uint8 info);
static void WriteMTruncateXlogRec(Oid oldestMultiDB,
MultiXactId startTruncOff,
MultiXactId endTruncOff,
@@ -939,7 +940,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
* take the trouble to generalize the slru.c error reporting code.
*/
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
*offptr = offset;
@@ -994,12 +995,12 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
}
memberptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
*memberptr = members[i].xid;
flagsptr = (uint32 *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
flagsval = *flagsptr;
flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
@@ -1427,7 +1428,7 @@ retry:
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
@@ -1476,7 +1477,7 @@ retry:
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, tmpMXact);
}
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
nextMXOffset = *offptr;
@@ -1544,7 +1545,7 @@ retry:
}
xactptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
if (!TransactionIdIsValid(*xactptr))
{
@@ -1555,7 +1556,7 @@ retry:
flagsoff = MXOffsetToFlagsOffset(offset);
bshift = MXOffsetToFlagsBitShift(offset);
- flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
+ flagsptr = (uint32 *) (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + flagsoff);
ptr[truelength].xid = *xactptr;
ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
@@ -2074,11 +2075,17 @@ static int
ZeroMultiXactOffsetPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno);
+ page = MultiXactOffsetCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_OFF_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -2090,11 +2097,17 @@ static int
ZeroMultiXactMemberPage(int64 pageno, bool writeXlog)
{
int slotno;
+ Page page;
+ XLogRecPtr lsn = 0;
slotno = SimpleLruZeroPage(MultiXactMemberCtl, pageno);
+ page = MultiXactMemberCtl->shared->page_buffer[slotno];
if (writeXlog)
- WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ {
+ lsn = WriteMZeroPageXlogRec(pageno, XLOG_MULTIXACT_ZERO_MEM_PAGE);
+ PageSetLSN(page, lsn);
+ }
return slotno;
}
@@ -2218,10 +2231,10 @@ TrimMultiXact(void)
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
- MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
+ MemSet(offptr, 0, SizeOfPageContents - (entryno * sizeof(MultiXactOffset)));
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
LWLockRelease(lock);
@@ -2252,9 +2265,9 @@ TrimMultiXact(void)
memberoff = MXOffsetToMemberOffset(offset);
slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, true, offset);
xidptr = (TransactionId *)
- (MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
+ (PageGetContents(MultiXactMemberCtl->shared->page_buffer[slotno]) + memberoff);
- MemSet(xidptr, 0, BLCKSZ - memberoff);
+ MemSet(xidptr, 0, SizeOfPageContents - memberoff);
/*
* Note: we don't need to zero out the flag bits in the remaining
@@ -2909,7 +2922,7 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(MultiXactOffsetCtl, pageno, multi);
- offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
+ offptr = (MultiXactOffset *) PageGetContents(MultiXactOffsetCtl->shared->page_buffer[slotno]);
offptr += entryno;
offset = *offptr;
LWLockRelease(SimpleLruGetBankLock(MultiXactOffsetCtl, pageno));
@@ -3351,12 +3364,12 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
* Write an xlog record reflecting the zeroing of either a MEMBERs or
* OFFSETs page (info shows which)
*/
-static void
+static XLogRecPtr
WriteMZeroPageXlogRec(int64 pageno, uint8 info)
{
XLogBeginInsert();
XLogRegisterData(&pageno, sizeof(pageno));
- (void) XLogInsert(RM_MULTIXACT_ID, info);
+ return XLogInsert(RM_MULTIXACT_ID, info);
}
/*
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index fe56286d9a9..2500d5d6621 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -68,6 +68,7 @@
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "pgstat.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/shmem.h"
#include "utils/guc.h"
@@ -169,6 +170,7 @@ typedef enum
SLRU_WRITE_FAILED,
SLRU_FSYNC_FAILED,
SLRU_CLOSE_FAILED,
+ SLRU_DATA_CORRUPTED,
} SlruErrorCause;
static SlruErrorCause slru_errcause;
@@ -391,8 +393,8 @@ SimpleLruZeroPage(SlruCtl ctl, int64 pageno)
shared->page_dirty[slotno] = true;
SlruRecentlyUsed(shared, slotno);
- /* Set the buffer to zeroes */
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ /* Initialize the page. */
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
/* Set the LSNs for this new page to zero */
SimpleLruZeroLSNs(ctl, slotno);
@@ -808,6 +810,8 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
off_t offset = rpageno * BLCKSZ;
char path[MAXPGPATH];
int fd;
+ bool checksum_failure;
+ bool verified;
SlruFileName(ctl, path, segno);
@@ -831,7 +835,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
ereport(LOG,
(errmsg("file \"%s\" doesn't exist, reading as zeroes",
path)));
- MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ PageInitSLRU(shared->page_buffer[slotno], BLCKSZ, 0);
return true;
}
@@ -854,6 +858,14 @@ SlruPhysicalReadPage(SlruCtl ctl, int64 pageno, int slotno)
return false;
}
+ if (!PageIsVerified(shared->page_buffer[slotno], pageno, PIV_LOG_WARNING,
+ &checksum_failure))
+ {
+ slru_errcause = SLRU_DATA_CORRUPTED;
+ slru_errno = 0;
+ return false;
+ }
+
return true;
}
@@ -880,6 +892,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
off_t offset = rpageno * BLCKSZ;
char path[MAXPGPATH];
int fd = -1;
+ Page page = shared->page_buffer[slotno];
+ XLogRecPtr lsn;
/* update the stats counter of written pages */
pgstat_count_slru_page_written(shared->slru_stats_idx);
@@ -888,41 +902,21 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
* Honor the write-WAL-before-data rule, if appropriate, so that we do not
* write out data before associated WAL records. This is the same action
* performed during FlushBuffer() in the main buffer manager.
+ *
+ * The largest async-commit LSN for the page is maintained through page LSN.
*/
- if (shared->group_lsn != NULL)
+ lsn = PageGetLSN(page);
+ if (!XLogRecPtrIsInvalid(lsn))
{
/*
- * We must determine the largest async-commit LSN for the page. This
- * is a bit tedious, but since this entire function is a slow path
- * anyway, it seems better to do this here than to maintain a per-page
- * LSN variable (which'd need an extra comparison in the
- * transaction-commit path).
+ * As noted above, elog(ERROR) is not acceptable here, so if
+ * XLogFlush were to fail, we must PANIC. This isn't much of a
+ * restriction because XLogFlush is just about all critical
+ * section anyway, but let's make sure.
*/
- XLogRecPtr max_lsn;
- int lsnindex;
-
- lsnindex = slotno * shared->lsn_groups_per_page;
- max_lsn = shared->group_lsn[lsnindex++];
- for (int lsnoff = 1; lsnoff < shared->lsn_groups_per_page; lsnoff++)
- {
- XLogRecPtr this_lsn = shared->group_lsn[lsnindex++];
-
- if (max_lsn < this_lsn)
- max_lsn = this_lsn;
- }
-
- if (!XLogRecPtrIsInvalid(max_lsn))
- {
- /*
- * As noted above, elog(ERROR) is not acceptable here, so if
- * XLogFlush were to fail, we must PANIC. This isn't much of a
- * restriction because XLogFlush is just about all critical
- * section anyway, but let's make sure.
- */
- START_CRIT_SECTION();
- XLogFlush(max_lsn);
- END_CRIT_SECTION();
- }
+ START_CRIT_SECTION();
+ XLogFlush(lsn);
+ END_CRIT_SECTION();
}
/*
@@ -987,6 +981,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int64 pageno, int slotno, SlruWriteAll fdata)
}
}
+ PageSetChecksumInplace(shared->page_buffer[slotno], pageno);
+
errno = 0;
pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
@@ -1107,6 +1103,13 @@ SlruReportIOError(SlruCtl ctl, int64 pageno, TransactionId xid)
errdetail("Could not close file \"%s\": %m.",
path)));
break;
+ case SLRU_DATA_CORRUPTED:
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not access status of transaction %u", xid),
+ errdetail("Invalid page from file \"%s\" at offset %d.",
+ path, offset)));
+ break;
default:
/* can't get here, we trust */
elog(ERROR, "unrecognized SimpleLru error cause: %d",
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 15153618fad..13bf26a942b 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -34,6 +34,7 @@
#include "miscadmin.h"
#include "pg_trace.h"
#include "utils/guc_hooks.h"
+#include "storage/bufpage.h"
#include "utils/snapmgr.h"
@@ -51,7 +52,7 @@
*/
/* We need four bytes per xact */
-#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId))
+#define SUBTRANS_XACTS_PER_PAGE (SizeOfPageContents / sizeof(TransactionId))
/*
* Although we return an int64 the actual value can't currently exceed
@@ -97,7 +98,7 @@ SubTransSetParent(TransactionId xid, TransactionId parent)
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
/*
@@ -137,7 +138,7 @@ SubTransGetParent(TransactionId xid)
/* lock is acquired by SimpleLruReadPage_ReadOnly */
slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid);
- ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
+ ptr = (TransactionId *) PageGetContents(SubTransCtl->shared->page_buffer[slotno]);
ptr += entryno;
parent = *ptr;
@@ -366,7 +367,6 @@ CheckPointSUBTRANS(void)
TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true);
}
-
/*
* Make sure that SUBTRANS has room for a newly-allocated XID.
*
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 4bd37d5beb5..ce4d95b7719 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -140,6 +140,7 @@
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/procsignal.h"
@@ -160,7 +161,7 @@
* than that, so changes in that data structure won't affect user-visible
* restrictions.
*/
-#define NOTIFY_PAYLOAD_MAX_LENGTH (BLCKSZ - NAMEDATALEN - 128)
+#define NOTIFY_PAYLOAD_MAX_LENGTH (SizeOfPageContents - NAMEDATALEN - 128)
/*
* Struct representing an entry in the global notify queue
@@ -309,7 +310,7 @@ static SlruCtlData NotifyCtlData;
#define NotifyCtl (&NotifyCtlData)
#define QUEUE_PAGESIZE BLCKSZ
-
+#define QUEUE_PAGE_CAPACITY (QUEUE_PAGESIZE - MAXALIGN(SizeOfPageHeaderData))
#define QUEUE_FULL_WARN_INTERVAL 5000 /* warn at most once every 5s */
/*
@@ -1295,14 +1296,14 @@ asyncQueueAdvance(volatile QueuePosition *position, int entryLength)
* written or read.
*/
offset += entryLength;
- Assert(offset <= QUEUE_PAGESIZE);
+ Assert(offset <= QUEUE_PAGE_CAPACITY);
/*
* In a second step check if another entry can possibly be written to the
* page. If so, stay here, we have reached the next position. If not, then
* we need to move on to the next page.
*/
- if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGESIZE)
+ if (offset + QUEUEALIGN(AsyncQueueEntryEmptySize) > QUEUE_PAGE_CAPACITY)
{
pageno++;
offset = 0;
@@ -1405,7 +1406,7 @@ asyncQueueAddEntries(ListCell *nextNotify)
offset = QUEUE_POS_OFFSET(queue_head);
/* Check whether the entry really fits on the current page */
- if (offset + qe.length <= QUEUE_PAGESIZE)
+ if (offset + qe.length <= QUEUE_PAGE_CAPACITY)
{
/* OK, so advance nextNotify past this item */
nextNotify = lnext(pendingNotifies->events, nextNotify);
@@ -1417,14 +1418,14 @@ asyncQueueAddEntries(ListCell *nextNotify)
* only check dboid and since it won't match any reader's database
* OID, they will ignore this entry and move on.
*/
- qe.length = QUEUE_PAGESIZE - offset;
+ qe.length = QUEUE_PAGE_CAPACITY - offset;
qe.dboid = InvalidOid;
qe.data[0] = '\0'; /* empty channel */
qe.data[1] = '\0'; /* empty payload */
}
/* Now copy qe into the shared buffer page */
- memcpy(NotifyCtl->shared->page_buffer[slotno] + offset,
+ memcpy(PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + offset,
&qe,
qe.length);
@@ -1955,10 +1956,10 @@ asyncQueueReadAllNotifications(void)
else
{
/* fetch all the rest of the page */
- copysize = QUEUE_PAGESIZE - curoffset;
+ copysize = QUEUE_PAGE_CAPACITY - curoffset;
}
- memcpy(page_buffer.buf + curoffset,
- NotifyCtl->shared->page_buffer[slotno] + curoffset,
+ memcpy(PageGetContents(page_buffer.buf) + curoffset,
+ PageGetContents(NotifyCtl->shared->page_buffer[slotno]) + curoffset,
copysize);
/* Release lock that we got from SimpleLruReadPage_ReadOnly() */
LWLockRelease(SimpleLruGetBankLock(NotifyCtl, curpage));
@@ -2029,7 +2030,7 @@ asyncQueueProcessPageEntries(volatile QueuePosition *current,
if (QUEUE_POS_EQUAL(thisentry, stop))
break;
- qe = (AsyncQueueEntry *) (page_buffer + QUEUE_POS_OFFSET(thisentry));
+ qe = (AsyncQueueEntry *) (PageGetContents(page_buffer) + QUEUE_POS_OFFSET(thisentry));
/*
* Advance *current over this message, possibly to the next page. As
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index d82114ffca1..39fd1afbbba 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -207,6 +207,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "port/pg_lfind.h"
+#include "storage/bufpage.h"
#include "storage/predicate.h"
#include "storage/predicate_internals.h"
#include "storage/proc.h"
@@ -326,8 +327,8 @@ static SlruCtlData SerialSlruCtlData;
#define SerialSlruCtl (&SerialSlruCtlData)
#define SERIAL_PAGESIZE BLCKSZ
-#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
-#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE)
+#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
+#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE - MAXALIGN(SizeOfPageHeaderData) / SERIAL_ENTRYSIZE)
/*
* Set maximum pages based on the number needed to track all transactions.
@@ -337,7 +338,7 @@ static SlruCtlData SerialSlruCtlData;
#define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1)
#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \
- (SerialSlruCtl->shared->page_buffer[slotno] + \
+ (PageGetContents(SerialSlruCtl->shared->page_buffer[slotno]) + \
((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE))))
#define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE)
@@ -789,10 +790,13 @@ SerialPagePrecedesLogicallyUnitTests(void)
* requires burning ~2B XIDs in single-user mode, a negligible
* possibility. Moreover, if it does happen, the consequence would be
* mild, namely a new transaction failing in SimpleLruReadPage().
+ *
+ * NOTE: After adding the page header, the defect affects two pages.
+ * We now assert correct treatment of its second to prior page.
*/
headPage = oldestPage;
targetPage = newestPage;
- Assert(SerialPagePrecedesLogically(headPage, targetPage - 1));
+ Assert(SerialPagePrecedesLogically(headPage, targetPage - 2));
#if 0
Assert(SerialPagePrecedesLogically(headPage, targetPage));
#endif
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index dbb49ed9197..7819f9480a8 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -59,6 +59,32 @@ PageInit(Page page, Size pageSize, Size specialSize)
/* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */
}
+/*
+ * PageInitSLRU
+ * Initializes the contents of an SLRU page.
+ * Note that we don't calculate an initial checksum here; that's not done
+ * until it's time to write.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
+
/*
* PageIsVerified
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index f20be82862a..7ba06e09d4a 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -16,6 +16,7 @@
#include <dirent.h>
#include <limits.h>
+#include <stdbool.h>
#include <sys/stat.h>
#include <time.h>
#include <unistd.h>
@@ -592,12 +593,20 @@ main(int argc, char *argv[])
{
total_size = scan_directory(DataDir, "global", true);
total_size += scan_directory(DataDir, "base", true);
+ total_size += scan_directory(DataDir, "pg_commit_ts", true);
+ total_size += scan_directory(DataDir, "pg_multixact", true);
+ total_size += scan_directory(DataDir, "pg_serial", true);
total_size += scan_directory(DataDir, PG_TBLSPC_DIR, true);
+ total_size += scan_directory(DataDir, "pg_xact", true);
}
(void) scan_directory(DataDir, "global", false);
(void) scan_directory(DataDir, "base", false);
+ (void) scan_directory(DataDir, "pg_commit_ts", false);
+ (void) scan_directory(DataDir, "pg_multixact", false);
+ (void) scan_directory(DataDir, "pg_serial", false);
(void) scan_directory(DataDir, PG_TBLSPC_DIR, false);
+ (void) scan_directory(DataDir, "pg_xact", false);
if (showprogress)
progress_report(true);
diff --git a/src/bin/pg_resetwal/t/001_basic.pl b/src/bin/pg_resetwal/t/001_basic.pl
index d6bbbd0ceda..0d31cdb2055 100644
--- a/src/bin/pg_resetwal/t/001_basic.pl
+++ b/src/bin/pg_resetwal/t/001_basic.pl
@@ -213,7 +213,7 @@ push @cmd,
sprintf("%d,%d", hex($files[0]) == 0 ? 3 : hex($files[0]), hex($files[-1]));
@files = get_slru_files('pg_multixact/offsets');
-$mult = 32 * $blcksz / 4;
+$mult = 32 * ($blcksz-24) / 4;
# --multixact-ids argument is "new,old"
push @cmd,
'--multixact-ids' => sprintf("%d,%d",
@@ -221,11 +221,11 @@ push @cmd,
hex($files[0]) == 0 ? 1 : hex($files[0] * $mult));
@files = get_slru_files('pg_multixact/members');
-$mult = 32 * int($blcksz / 20) * 4;
+$mult = 32 * int(($blcksz - 24) / 20) * 4;
push @cmd, '--multixact-offset' => (hex($files[-1]) + 1) * $mult;
@files = get_slru_files('pg_xact');
-$mult = 32 * $blcksz * 4;
+$mult = 32 * ($blcksz - 24) * 4;
push @cmd,
'--oldest-transaction-id' =>
(hex($files[0]) == 0 ? 3 : hex($files[0]) * $mult),
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index 91ed16acb08..2831dd5ec2b 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -9,6 +9,7 @@
#include "postgres_fe.h"
+#include <dirent.h>
#include <sys/stat.h>
#include <limits.h>
#include <fcntl.h>
@@ -457,3 +458,180 @@ check_hard_link(transferMode transfer_mode)
unlink(new_link_file);
}
+
+
+/*
+ * Copy SLRU_PAGES_PER_SEGMENT from access/slru.h to avoid including it.
+ */
+#define SLRU_PAGES_PER_SEGMENT 32
+
+#define SEGMENT_SIZE (BLCKSZ * SLRU_PAGES_PER_SEGMENT)
+
+/*
+ * Copy PageInitSLRU from storage/bufpage.c to avoid linking to the backend.
+ */
+void
+PageInitSLRU(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_SLRU_PAGE_LAYOUT_VERSION);
+}
+
+/*
+ * Filter function for scandir(3) to select only segment files.
+ */
+static int
+segment_file_filter(const struct dirent *dirent)
+{
+ return strspn(dirent->d_name, "0123456789ABCDEF") == strlen(dirent->d_name);
+}
+
+/*
+ * Upgrade a single clog segment to add a page header on each page.
+ */
+static void
+upgrade_file(const char *src_dir, const char *src_file, const char *dst_dir)
+{
+ char src[MAXPGPATH];
+ char dst[MAXPGPATH];
+
+ int seg_name_len;
+ int src_segno;
+ int64 src_pageno;
+ int dst_segno;
+ int64 dst_pageno;
+ int dst_offset;
+
+ int src_fd;
+ int dst_fd;
+
+ char *src_buf;
+ ssize_t src_len;
+ ssize_t src_buf_offset;
+ PGAlignedBlock dst_block;
+ Page page = dst_block.data;
+ int len_to_copy;
+
+ seg_name_len = strlen(src_file);
+ src_segno = (int) strtol(src_file, NULL, 16);
+ src_pageno = src_segno * SLRU_PAGES_PER_SEGMENT;
+
+ dst_pageno = src_pageno * BLCKSZ / SizeOfPageContents;
+ dst_offset = src_pageno * BLCKSZ - dst_pageno * SizeOfPageContents;
+ dst_segno = dst_pageno / SLRU_PAGES_PER_SEGMENT;
+
+ snprintf(src, sizeof(src), "%s/%s", src_dir, src_file);
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+
+ src_buf = pg_malloc(SEGMENT_SIZE);
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) == -1)
+ pg_fatal("could not open file \"%s\": %s", src, strerror(errno));
+ if ((src_len = read(src_fd, src_buf, SEGMENT_SIZE)) == -1)
+ pg_fatal("could not read file \"%s\": %s", src, strerror(errno));
+
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+
+ /*
+ * Read the destination page at dst_pageno into the buffer. The page may contain
+ * data from the previous source segment. Initialize the page if the page is new.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+ if (read(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not read file \"%s\": %s", dst, strerror(errno));
+ if (PageIsNew(page))
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Rewind the file position, so the first write will overwrite the page.
+ */
+ if (lseek(dst_fd, (dst_pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ, SEEK_SET) == -1)
+ pg_fatal("could not seek in file \"%s\": %s", dst, strerror(errno));
+
+ src_buf_offset = 0;
+ while (src_buf_offset < src_len)
+ {
+ len_to_copy = Min(src_len - src_buf_offset, SizeOfPageContents - dst_offset);
+ memcpy(PageGetContents(page) + dst_offset, src_buf + src_buf_offset, len_to_copy);
+ src_buf_offset += len_to_copy;
+
+ if (new_cluster.controldata.data_checksum_version > 0)
+ ((PageHeader) page)->pd_checksum = pg_checksum_page(page, dst_pageno);
+ if (write(dst_fd, page, BLCKSZ) == -1)
+ pg_fatal("could not write file \"%s\": %s", dst, strerror(errno));
+
+ dst_pageno++;
+ dst_offset = 0;
+ PageInitSLRU(page, BLCKSZ, 0);
+
+ /*
+ * Switch segments if we reached the end of the current segment.
+ */
+ if (dst_pageno % SLRU_PAGES_PER_SEGMENT == 0)
+ {
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ dst_segno++;
+ snprintf(dst, sizeof(dst), "%s/%0*X", dst_dir, seg_name_len, dst_segno);
+ if ((dst_fd = open(dst, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
+ pg_fatal("could not open file \"%s\": %s", dst, strerror(errno));
+ if (ftruncate(dst_fd, SEGMENT_SIZE) == -1)
+ pg_fatal("could not truncate file \"%s\": %s", dst, strerror(errno));
+ }
+ }
+
+ if (fsync(dst_fd) == -1)
+ pg_fatal("could not fsync file \"%s\": %s", dst, strerror(errno));
+ if (close(dst_fd) == -1)
+ pg_fatal("could not close file \"%s\": %s", dst, strerror(errno));
+
+ pg_free(src_buf);
+ close(src_fd);
+}
+
+/*
+ * Upgrade the clog files to add a page header to each SLRU page.
+ */
+void
+upgrade_xact_cache(const char *src_subdir, const char *dst_subdir)
+{
+ char src_dir[MAXPGPATH];
+ char dst_dir[MAXPGPATH];
+
+ DIR *src_dirp;
+ struct dirent *src_dirent;
+
+ snprintf(src_dir, sizeof(src_dir), "%s/%s", old_cluster.pgdata, src_subdir);
+ snprintf(dst_dir, sizeof(dst_dir), "%s/%s", new_cluster.pgdata, dst_subdir);
+
+ if ((src_dirp = opendir(src_dir)) == NULL)
+ pg_fatal("could not open directory \"%s\": %s", src_dir, strerror(errno));
+
+ while (errno = 0, (src_dirent = readdir(src_dirp)) != NULL)
+ {
+ if (segment_file_filter(src_dirent))
+ upgrade_file(src_dir, src_dirent->d_name, dst_dir);
+ }
+
+ if (closedir(src_dirp) != 0)
+ pg_fatal("could not close directory \"%s\": %s", src_dir, strerror(errno));
+}
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 536e49d2616..bad7613e291 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -749,14 +749,28 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir)
static void
copy_xact_xlog_xid(void)
{
+ bool slru_header_changed = false;
+
/*
* Copy old commit logs to new data dir. pg_clog has been renamed to
* pg_xact in post-10 clusters.
*/
- copy_subdir_files(GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact",
- GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ?
- "pg_clog" : "pg_xact");
+ char *xact_old_dir = GET_MAJOR_VERSION(old_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+ char *xact_new_dir = GET_MAJOR_VERSION(new_cluster.major_version) <= 906 ? "pg_clog" : "pg_xact";
+
+ /*
+ * In post-17 clusters, a page header is added to each SLRU page.
+ * Perform a one-time conversion of the clog files if the old
+ * cluster and the new cluster use different SLRU formats.
+ */
+ if (new_cluster.controldata.cat_ver >= SLRU_PAGE_HEADER_CAT_VER &&
+ old_cluster.controldata.cat_ver < SLRU_PAGE_HEADER_CAT_VER)
+ slru_header_changed = true;
+
+ if (slru_header_changed)
+ upgrade_xact_cache(xact_old_dir, xact_new_dir);
+ else
+ copy_subdir_files(xact_old_dir, xact_new_dir);
prep_status("Setting oldest XID for new cluster");
exec_prog(UTILITY_LOG_FILE, NULL, true, true,
@@ -791,7 +805,8 @@ copy_xact_xlog_xid(void)
* server doesn't attempt to read multis older than the cutoff value.
*/
if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
- new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
+ !slru_header_changed)
{
copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
copy_subdir_files("pg_multixact/members", "pg_multixact/members");
@@ -811,7 +826,8 @@ copy_xact_xlog_xid(void)
new_cluster.pgdata);
check_ok();
}
- else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+ else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER ||
+ slru_header_changed)
{
/*
* Remove offsets/0000 file created by initdb that no longer matches
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 69c965bb7d0..34b03df2f08 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -114,6 +114,11 @@ extern char *output_files[];
*/
#define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
+/*
+ * A page header was added to each SLRU page in 18.0.
+ */
+#define SLRU_PAGE_HEADER_CAT_VER 202506261
+
/*
* large object chunk size added to pg_controldata,
* commit 5f93c37805e7485488480916b4585e098d3cc883
@@ -425,6 +430,7 @@ void rewriteVisibilityMap(const char *fromfile, const char *tofile,
void check_file_clone(void);
void check_copy_file_range(void);
void check_hard_link(transferMode transfer_mode);
+void upgrade_xact_cache(const char *src_subdir, const char *dst_subdir);
/* fopen_priv() is no longer different from fopen() */
#define fopen_priv(path, mode) fopen(path, mode)
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index d63db42ed7b..c1e086be1ea 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202506251
+#define CATALOG_VERSION_NO 202506261
#endif
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index aeb67c498c5..4c9ab9302bb 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -205,6 +205,7 @@ typedef PageHeaderData *PageHeader;
* handling pages.
*/
#define PG_PAGE_LAYOUT_VERSION 4
+#define PG_SLRU_PAGE_LAYOUT_VERSION 1
#define PG_DATA_CHECKSUM_VERSION 1
/* ----------------------------------------------------------------
@@ -261,6 +262,11 @@ PageGetContents(Page page)
return (char *) page + MAXALIGN(SizeOfPageHeaderData);
}
+/*
+ * Space available for storing page contents.
+ */
+#define SizeOfPageContents (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
/* ----------------
* functions to access page size info
* ----------------
@@ -486,6 +492,7 @@ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
"BLCKSZ has to be a multiple of sizeof(size_t)");
extern void PageInit(Page page, Size pageSize, Size specialSize);
+extern void PageInitSLRU(Page page, Size pageSize, Size specialSize);
extern bool PageIsVerified(PageData *page, BlockNumber blkno, int flags,
bool *checksum_failure_p);
extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
diff --git a/src/test/modules/test_slru/test_slru.c b/src/test/modules/test_slru/test_slru.c
index 32750930e43..6b24b19e266 100644
--- a/src/test/modules/test_slru/test_slru.c
+++ b/src/test/modules/test_slru/test_slru.c
@@ -17,6 +17,7 @@
#include "access/slru.h"
#include "access/transam.h"
#include "miscadmin.h"
+#include "storage/bufpage.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
@@ -72,8 +73,8 @@ test_slru_page_write(PG_FUNCTION_ARGS)
TestSlruCtl->shared->page_status[slotno] = SLRU_PAGE_VALID;
/* write given data to the page, up to the limit of the page */
- strncpy(TestSlruCtl->shared->page_buffer[slotno], data,
- BLCKSZ - 1);
+ strncpy(PageGetContents(TestSlruCtl->shared->page_buffer[slotno]), data,
+ SizeOfPageContents - 1);
SimpleLruWritePage(TestSlruCtl, slotno);
LWLockRelease(lock);
@@ -101,7 +102,7 @@ test_slru_page_read(PG_FUNCTION_ARGS)
LWLockAcquire(lock, LW_EXCLUSIVE);
slotno = SimpleLruReadPage(TestSlruCtl, pageno,
write_ok, InvalidTransactionId);
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(lock);
PG_RETURN_TEXT_P(cstring_to_text(data));
@@ -120,7 +121,7 @@ test_slru_page_readonly(PG_FUNCTION_ARGS)
pageno,
InvalidTransactionId);
Assert(LWLockHeldByMe(lock));
- data = (char *) TestSlruCtl->shared->page_buffer[slotno];
+ data = (char *) PageGetContents(TestSlruCtl->shared->page_buffer[slotno]);
LWLockRelease(lock);
PG_RETURN_TEXT_P(cstring_to_text(data));
--
2.47.1