From e58543b92bcbfc0061964341cecf93fa813dc77e Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Tue, 13 Aug 2024 20:26:40 +0300
Subject: [PATCH v5 2/5] Use CSN snapshots during Hot Standby

Replace the known-assigned-XIDs mechanism with a CSN log. The CSN log
(pg_csn) tracks the commit LSN of each transaction, when replaying the
WAL on a standby. It's only used on the standby, and is initialized
from scratch at server startup like pg_subtrans.

Based on 0001-CSN-base-snapshot.patch from
https://www.postgresql.org/message-id/2020081009525213277261%40highgo.ca.
This patch has a long lineage, various CSN patches have been posted
with parts from Stas Kelvich, Movead Li, Ants Aasma, Heikki
Linnakangas, and Alexander Kuzmenkov.
---
 contrib/pg_visibility/pg_visibility.c         |    1 +
 src/backend/access/rmgrdesc/xactdesc.c        |   26 -
 src/backend/access/transam/Makefile           |    1 +
 src/backend/access/transam/csn_log.c          |  474 ++++++
 src/backend/access/transam/meson.build        |    1 +
 src/backend/access/transam/transam.c          |    3 +
 src/backend/access/transam/twophase.c         |   34 +-
 src/backend/access/transam/varsup.c           |    1 +
 src/backend/access/transam/xact.c             |  138 +-
 src/backend/access/transam/xlog.c             |  118 +-
 src/backend/access/transam/xlogrecovery.c     |   13 +-
 src/backend/access/transam/xlogutils.c        |    2 +-
 src/backend/postmaster/startup.c              |    2 +-
 src/backend/replication/logical/decode.c      |    8 -
 src/backend/replication/logical/snapbuild.c   |    2 +-
 src/backend/storage/ipc/ipci.c                |    3 +
 src/backend/storage/ipc/procarray.c           | 1512 ++---------------
 src/backend/storage/ipc/standby.c             |  102 +-
 src/backend/storage/lmgr/lwlock.c             |    2 +
 .../utils/activity/wait_event_names.txt       |    1 +
 src/backend/utils/probes.d                    |    2 +
 src/backend/utils/time/snapmgr.c              |   37 +-
 src/bin/initdb/initdb.c                       |    3 +-
 src/include/access/csn_log.h                  |   30 +
 src/include/access/transam.h                  |    3 +
 src/include/access/twophase.h                 |    3 +-
 src/include/access/xact.h                     |   12 +-
 src/include/access/xlogutils.h                |   33 +-
 src/include/storage/lwlock.h                  |    2 +
 src/include/storage/procarray.h               |   13 +-
 src/include/utils/snapshot.h                  |    7 +
 31 files changed, 821 insertions(+), 1768 deletions(-)
 create mode 100644 src/backend/access/transam/csn_log.c
 create mode 100644 src/include/access/csn_log.h

diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c
index 5d0deaba61..7905a91412 100644
--- a/contrib/pg_visibility/pg_visibility.c
+++ b/contrib/pg_visibility/pg_visibility.c
@@ -581,6 +581,7 @@ collect_visibility_data(Oid relid, bool include_pd)
  *    now perform minimal checking on a standby by always using nextXid, this
  *    approach is better than nothing and will at least catch extremely broken
  *    cases where a xid is in the future.
+ *    XXX KnownAssignedXids is gone.
  * 3. Ignore walsender xmin, because it could go backward if some replication
  *    connections don't use replication slots.
  *
diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c
index 889cb955c1..128486e751 100644
--- a/src/backend/access/rmgrdesc/xactdesc.c
+++ b/src/backend/access/rmgrdesc/xactdesc.c
@@ -424,17 +424,6 @@ xact_desc_prepare(StringInfo buf, uint8 info, xl_xact_prepare *xlrec, RepOriginI
 						 timestamptz_to_str(parsed.origin_timestamp));
 }
 
-static void
-xact_desc_assignment(StringInfo buf, xl_xact_assignment *xlrec)
-{
-	int			i;
-
-	appendStringInfoString(buf, "subxacts:");
-
-	for (i = 0; i < xlrec->nsubxacts; i++)
-		appendStringInfo(buf, " %u", xlrec->xsub[i]);
-}
-
 void
 xact_desc(StringInfo buf, XLogReaderState *record)
 {
@@ -462,18 +451,6 @@ xact_desc(StringInfo buf, XLogReaderState *record)
 		xact_desc_prepare(buf, XLogRecGetInfo(record), xlrec,
 						  XLogRecGetOrigin(record));
 	}
-	else if (info == XLOG_XACT_ASSIGNMENT)
-	{
-		xl_xact_assignment *xlrec = (xl_xact_assignment *) rec;
-
-		/*
-		 * Note that we ignore the WAL record's xid, since we're more
-		 * interested in the top-level xid that issued the record and which
-		 * xids are being reported here.
-		 */
-		appendStringInfo(buf, "xtop %u: ", xlrec->xtop);
-		xact_desc_assignment(buf, xlrec);
-	}
 	else if (info == XLOG_XACT_INVALIDATIONS)
 	{
 		xl_xact_invals *xlrec = (xl_xact_invals *) rec;
@@ -505,9 +482,6 @@ xact_identify(uint8 info)
 		case XLOG_XACT_ABORT_PREPARED:
 			id = "ABORT_PREPARED";
 			break;
-		case XLOG_XACT_ASSIGNMENT:
-			id = "ASSIGNMENT";
-			break;
 		case XLOG_XACT_INVALIDATIONS:
 			id = "INVALIDATION";
 			break;
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
index 661c55a9db..2520d77c7c 100644
--- a/src/backend/access/transam/Makefile
+++ b/src/backend/access/transam/Makefile
@@ -15,6 +15,7 @@ include $(top_builddir)/src/Makefile.global
 OBJS = \
 	clog.o \
 	commit_ts.o \
+	csn_log.o \
 	generic_xlog.o \
 	multixact.o \
 	parallel.o \
diff --git a/src/backend/access/transam/csn_log.c b/src/backend/access/transam/csn_log.c
new file mode 100644
index 0000000000..1188a78c4a
--- /dev/null
+++ b/src/backend/access/transam/csn_log.c
@@ -0,0 +1,474 @@
+/*-----------------------------------------------------------------------------
+ *
+ * csn_log.c
+ *		Track commit record LSNs of finished transactions
+ *
+ * This module provides an SLRU to store the LSN of the commit record of each
+ * transaction. CSN stands for Commit Sequence Number, and in principle we
+ * could use a separate counter that is incremented at every commit. For
+ * simplicity, though, we use the commit record's LSN as the sequence number.
+ *
+ * Like pg_subtrans, this mapping needs to be kept only for xid's greater than
+ * oldestXmin, and doesn't need to be preserved over crashes.  Also, this is
+ * only needed in hot standby mode, and immediately after exiting hot standby
+ * mode, until all old snapshots taken during standby mode are gone.
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/csn_log.c
+ *
+ *-----------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/clog.h"
+#include "access/csn_log.h"
+#include "access/slru.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "utils/snapmgr.h"
+
+/*
+ * Defines for CSNLog page sizes.  A page is the same BLCKSZ as is used
+ * everywhere else in Postgres.
+ *
+ * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
+ * CSNLog page numbering also wraps around at
+ * 0xFFFFFFFF/CSN_LOG_XACTS_PER_PAGE, and CSNLog segment numbering at
+ * 0xFFFFFFFF/CSN_LOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need take no
+ * explicit notice of that fact in this module, except when comparing segment
+ * and page numbers in TruncateCSNLog (see CSNLogPagePrecedes).
+ */
+
+/* We store the commit CSN for each xid */
+#define CSN_LOG_XACTS_PER_PAGE (BLCKSZ / sizeof(XLogRecPtr))
+
+#define TransactionIdToPage(xid)	((xid) / (TransactionId) CSN_LOG_XACTS_PER_PAGE)
+#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CSN_LOG_XACTS_PER_PAGE)
+
+/* Map a (page number, index within page) pair back to its xid. */
+#define PgIndexToTransactionId(pageno, idx) (CSN_LOG_XACTS_PER_PAGE * (pageno) + (idx))
+
+
+/*
+ * Link to shared-memory data structures for CSNLog control
+ */
+static SlruCtlData CSNLogCtlData;
+#define CsnlogCtl (&CSNLogCtlData)
+
+static int	ZeroCSNLogPage(int pageno);
+static bool CSNLogPagePrecedes(int64 page1, int64 page2);
+static void CSNLogSetPageStatus(TransactionId xid, int nsubxids,
+								TransactionId *subxids,
+								XLogRecPtr csn, int pageno);
+static void CSNLogSetCSNInSlot(TransactionId xid, XLogRecPtr csn,
+							   int slotno);
+
+
+/*
+ * Record commit LSN of a transaction and its subtransaction tree.
+ *
+ * xid is a single xid to set status for. This will typically be the top level
+ * transaction ID for a top level commit.
+ *
+ * subxids is an array of xids of length nsubxids, in logical XID order,
+ * representing subtransactions in the tree of XIDs. In various cases nsubxids
+ * may be zero.
+ *
+ * commitLsn is the LSN of the commit record.  This is currently never called
+ * for aborted transactions.
+ */
+void
+CSNLogSetCSN(TransactionId xid, int nsubxids, TransactionId *subxids,
+			 XLogRecPtr commitLsn)
+{
+	int			curpage;
+	int			next = 0;		/* first subxid not yet handed to a page */
+
+	Assert(TransactionIdIsValid(xid));
+
+	/* Start on the parent's page; later rounds follow the subxids. */
+	curpage = TransactionIdToPage(xid);
+	for (;;)
+	{
+		int			first = next;
+
+		/* Collect the run of consecutive subxids living on curpage. */
+		while (next < nsubxids && TransactionIdToPage(subxids[next]) == curpage)
+			next++;
+
+		CSNLogSetPageStatus(xid, next - first, subxids + first,
+							commitLsn, curpage);
+		if (next >= nsubxids)
+			break;
+
+		/*
+		 * Move on to the page of the next pending subxid.  The parent was
+		 * handled in the first round, so stop passing it down.
+		 */
+		curpage = TransactionIdToPage(subxids[next]);
+		xid = InvalidTransactionId;
+	}
+}
+
+/*
+ * Record the final state of transaction entries in the CSN log for all
+ * entries on a single page.  Atomic only on this page.
+ *
+ * Otherwise API is same as CSNLogSetCSN()
+ */
+static void
+CSNLogSetPageStatus(TransactionId xid, int nsubxids, TransactionId *subxids,
+					XLogRecPtr commitLsn, int pageno)
+{
+	LWLock	   *banklock = SimpleLruGetBankLock(CsnlogCtl, pageno);
+	int			slot;
+	int			n = 0;
+
+	/* Hold the page's bank lock exclusively while it is being modified. */
+	LWLockAcquire(banklock, LW_EXCLUSIVE);
+	slot = SimpleLruReadPage(CsnlogCtl, pageno, true, xid);
+
+	/* Stamp the subtransactions before the parent. */
+	while (n < nsubxids)
+	{
+		TransactionId subxid = subxids[n++];
+
+		Assert(CsnlogCtl->shared->page_number[slot] == TransactionIdToPage(subxid));
+		CSNLogSetCSNInSlot(subxid, commitLsn, slot);
+	}
+
+	/* The parent is only stamped when the caller passed a valid xid. */
+	if (TransactionIdIsValid(xid))
+		CSNLogSetCSNInSlot(xid, commitLsn, slot);
+
+	CsnlogCtl->shared->page_dirty[slot] = true;
+	LWLockRelease(banklock);
+}
+
+/*
+ * Sets the commit status of a single transaction.
+ */
+static void
+CSNLogSetCSNInSlot(TransactionId xid, XLogRecPtr csn, int slotno)
+{
+	/* View the page buffer as an array of LSNs, one entry per xid. */
+	XLogRecPtr *entries = (XLogRecPtr *) CsnlogCtl->shared->page_buffer[slotno];
+	int			idx = TransactionIdToPgIndex(xid);
+
+	/* This function neither takes a lock nor marks the page dirty. */
+	entries[idx] = csn;
+}
+
+/*
+ * Interrogate the state of a transaction in the log.
+ *
+ * NB: this is a low-level routine and is NOT the preferred entry point
+ * for most uses; TransactionIdGetXidCSN() in csn_snapshot.c is the
+ * intended caller.
+ */
+XLogRecPtr
+CSNLogGetCSNByXid(TransactionId xid)
+{
+	XLogRecPtr *entries;
+	XLogRecPtr	result;
+	int			page;
+	int			idx;
+	int			slot;
+
+	Assert(TransactionIdIsValid(xid));
+
+	/* Entries older than TransactionXmin might not be around anymore. */
+	Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin));
+
+	page = TransactionIdToPage(xid);
+	idx = TransactionIdToPgIndex(xid);
+
+	/* SimpleLruReadPage_ReadOnly acquires the page's bank lock for us. */
+	slot = SimpleLruReadPage_ReadOnly(CsnlogCtl, page, xid);
+	entries = (XLogRecPtr *) CsnlogCtl->shared->page_buffer[slot];
+	result = entries[idx];
+	LWLockRelease(SimpleLruGetBankLock(CsnlogCtl, page));
+	return result;
+}
+
+/*
+ * Number of shared CSNLog buffers: NBuffers / 512, clamped to 16..32.
+ */
+static Size
+CSNLogShmemBuffers(void)
+{
+	return Max(16, Min(32, NBuffers / 512));
+}
+
+/*
+ * Amount of shared memory to reserve for CsnlogCtl.
+ */
+Size
+CSNLogShmemSize(void)
+{
+	/* FIXME: skip if not InHotStandby? */
+	return SimpleLruShmemSize(CSNLogShmemBuffers(), 0);
+}
+
+/*
+ * Initialization of shared memory for CSNLog.
+ */
+void
+CSNLogShmemInit(void)
+{
+	CsnlogCtl->PagePrecedes = CSNLogPagePrecedes;
+	SimpleLruInit(CsnlogCtl, "CSNLog Ctl", CSNLogShmemBuffers(), 0, "pg_csn",
+				  LWTRANCHE_CSN_LOG_BUFFER, LWTRANCHE_CSN_LOG_SLRU,
+				  SYNC_HANDLER_NONE, false);
+	/* XXX: SlruPagePrecedesUnitTests(CsnlogCtl, ...) is not run yet */
+}
+
+/*
+ * This func must be called ONCE on system install.  It creates the initial
+ * CSNLog segment.  The pg_csn directory is assumed to have been
+ * created by initdb, and CSNLogShmemInit must have been called already.
+ *
+ * Note: it's not really necessary to create the initial segment now,
+ * since slru.c would create it on first write anyway.  But we may as well
+ * do it to be sure the directory is set up correctly.
+ */
+void
+BootStrapCSNLog(void)
+{
+	LWLock	   *banklock = SimpleLruGetBankLock(CsnlogCtl, 0);
+	int			slot;
+
+	/* Only page 0 is touched here, so its bank lock is all we need. */
+	LWLockAcquire(banklock, LW_EXCLUSIVE);
+
+	/* Set up the first page of the CSN log, zero-filled, in memory ... */
+	slot = ZeroCSNLogPage(0);
+
+	/* ... and write it out right away, which must leave it clean. */
+	SimpleLruWritePage(CsnlogCtl, slot);
+	Assert(!CsnlogCtl->shared->page_dirty[slot]);
+
+	LWLockRelease(banklock);
+}
+
+/*
+ * Set up a zero-filled CSNLog page in the SLRU buffers; nothing is written
+ * to disk.  Returns the buffer slot number that now holds the page.
+ *
+ * The caller must hold the page's bank lock, and still holds it on return.
+ */
+static int
+ZeroCSNLogPage(int pageno)
+{
+	int			slot = SimpleLruZeroPage(CsnlogCtl, pageno);
+
+	return slot;
+}
+
+/*
+ * Initialize a page of CSNLog based on pg_xact.
+ *
+ * Completed (committed/aborted) xids on it, from *xid up to nextXid, get 'csn'.
+ */
+static void
+InitCSNLogPage(int pageno, TransactionId *xid, TransactionId nextXid, XLogRecPtr csn)
+{
+	XLogRecPtr	dummy;
+	int			slotno = ZeroCSNLogPage(pageno);
+
+	/* Compare xids circularly: the range may span an xid wraparound. */
+	while (TransactionIdPrecedes(*xid, nextXid) &&
+		   TransactionIdToPage(*xid) == pageno)
+	{
+		XidStatus	status = TransactionIdGetStatus(*xid, &dummy);
+
+		if (status == TRANSACTION_STATUS_COMMITTED ||
+			status == TRANSACTION_STATUS_ABORTED)
+			CSNLogSetCSNInSlot(*xid, csn, slotno);
+
+		TransactionIdAdvance(*xid);
+	}
+	/* Don't zero the page again: that would wipe the CSNs stamped above. */
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend startup,
+ * after StartupXLOG has initialized TransamVariables->nextXid, and after
+ * initializing the CLOG.
+ *
+ * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid
+ * if there are none.
+ *
+ * All transactions that have already completed are marked with 'csn', which
+ * is supposed to be older than anything we'll ever need to compare with.
+ */
+void
+StartupCSNLog(TransactionId oldestActiveXID, XLogRecPtr csn)
+{
+	FullTransactionId nextFullXid = TransamVariables->nextXid;
+	TransactionId nextXid = XidFromFullTransactionId(nextFullXid);
+	TransactionId xid = oldestActiveXID;
+	int			lastPage = TransactionIdToPage(nextXid);
+	int			page = TransactionIdToPage(oldestActiveXID);
+	LWLock	   *heldlock = NULL;
+
+	Assert(TransactionIdIsValid(oldestActiveXID));
+	Assert(FullTransactionIdIsValid(nextFullXid));
+
+	/*
+	 * pg_csn is not expected to be valid across crashes, so rebuild every
+	 * page from the one holding oldestActiveXID through the one holding
+	 * nextXid.  InitCSNLogPage() starts each page out zeroed and consults
+	 * pg_xact for the xids on it.  Pages beyond that are zeroed by
+	 * ExtendCSNLog() when the first xid on them is reached, without
+	 * regard to whatever was previously on disk.
+	 */
+	for (;;)
+	{
+		LWLock	   *banklock = SimpleLruGetBankLock(CsnlogCtl, page);
+
+		/* Swap locks only when crossing into a different bank. */
+		if (banklock != heldlock)
+		{
+			if (heldlock != NULL)
+				LWLockRelease(heldlock);
+			LWLockAcquire(banklock, LW_EXCLUSIVE);
+			heldlock = banklock;
+		}
+
+		InitCSNLogPage(page, &xid, nextXid, csn);
+		if (page == lastPage)
+			break;
+
+		/* Step to the next page, wrapping to zero after the last one. */
+		page++;
+		if (page > TransactionIdToPage(MaxTransactionId))
+			page = 0;
+	}
+
+	/* Drop the bank lock still held from the final iteration. */
+	LWLockRelease(heldlock);
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend shutdown
+ */
+void
+ShutdownCSNLog(void)
+{
+	/*
+	 * Write out whatever CSNLog pages are still dirty.
+	 *
+	 * Correctness does not depend on this; having the final state on disk
+	 * is merely handy when debugging.
+	 */
+	TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_START(false);
+	SimpleLruWriteAll(CsnlogCtl, false);
+	TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_DONE(false);
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ */
+void
+CheckPointCSNLog(void)
+{
+	/*
+	 * Write out the dirty CSNLog pages.
+	 *
+	 * Nothing depends on this for correctness.  The point is only to make it
+	 * more likely that the checkpoint process, rather than a backend, ends
+	 * up doing the writes.
+	 */
+	TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_START(true);
+	SimpleLruWriteAll(CsnlogCtl, true);
+	TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_DONE(true);
+}
+
+/*
+ * Make sure that CSNLog has room for a newly-allocated XID.
+ *
+ * NB: this is called while holding XidGenLock.  We want it to be very fast
+ * most of the time; even when it's not so fast, no actual I/O need happen
+ * unless we're forced to write out a dirty clog or xlog page to make room
+ * in shared memory.
+ */
+void
+ExtendCSNLog(TransactionId newestXact)
+{
+	bool		first_on_page = (TransactionIdToPgIndex(newestXact) == 0);
+	bool		first_after_wrap = TransactionIdEquals(newestXact,
+													   FirstNormalTransactionId);
+	int64		page;
+	LWLock	   *banklock;
+
+	/*
+	 * A new page is needed only when the xid is the first one on its page.
+	 * Right after wraparound, FirstNormalTransactionId plays that role for
+	 * page zero.
+	 */
+	if (!first_on_page && !first_after_wrap)
+		return;
+
+	page = TransactionIdToPage(newestXact);
+	banklock = SimpleLruGetBankLock(CsnlogCtl, page);
+
+	/* Zero the page in shared memory only; no WAL record is written. */
+	LWLockAcquire(banklock, LW_EXCLUSIVE);
+	ZeroCSNLogPage(page);
+	LWLockRelease(banklock);
+}
+
+/*
+ * Remove all CSNLog segments before the one holding the passed
+ * transaction ID.
+ *
+ * This is normally called during checkpoint, with oldestXact being the
+ * oldest TransactionXmin of any running transaction.
+ */
+void
+TruncateCSNLog(TransactionId oldestXact)
+{
+	int			page;
+
+	/*
+	 * SimpleLruTruncate() is handed the *page* that holds the cutoff xid;
+	 * the actual cutoff is the start of the segment containing that page.
+	 *
+	 * Back up by one xid first.  In the rare case that oldestXact is the
+	 * first xid on a page and also equals the next XID, that page has not
+	 * been created yet, and passing it would trigger SimpleLruTruncate's
+	 * wraparound detection.
+	 */
+	TransactionIdRetreat(oldestXact);
+	page = TransactionIdToPage(oldestXact);
+	SimpleLruTruncate(CsnlogCtl, page);
+}
+
+/*
+ * Decide whether a CSNLog page number is "older" for truncation purposes.
+ *
+ * We compare TransactionIds in order to do the right thing with wraparound
+ * XID arithmetic.  page1 precedes page2 only if it precedes both the first
+ * and the last xid of page2.  TransactionIdPrecedes gets weird about
+ * permanent xact IDs, so offset such that xid1, xid2 and
+ * xid2 + CSN_LOG_XACTS_PER_PAGE - 1 are all normal XIDs.
+ */
+static bool
+CSNLogPagePrecedes(int64 page1, int64 page2)
+{
+	TransactionId xid1;
+	TransactionId xid2;
+
+	xid1 = ((TransactionId) page1) * CSN_LOG_XACTS_PER_PAGE;
+	xid1 += FirstNormalTransactionId + 1;
+	xid2 = ((TransactionId) page2) * CSN_LOG_XACTS_PER_PAGE;
+	xid2 += FirstNormalTransactionId + 1;
+
+	return (TransactionIdPrecedes(xid1, xid2) &&
+			TransactionIdPrecedes(xid1, xid2 + CSN_LOG_XACTS_PER_PAGE - 1));
+}
diff --git a/src/backend/access/transam/meson.build b/src/backend/access/transam/meson.build
index 8a3522557c..cf41df2971 100644
--- a/src/backend/access/transam/meson.build
+++ b/src/backend/access/transam/meson.build
@@ -2,6 +2,7 @@
 
 backend_sources += files(
   'clog.c',
+  'csn_log.c',
   'commit_ts.c',
   'generic_xlog.c',
   'multixact.c',
diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c
index 75b5325df8..93c4d495e4 100644
--- a/src/backend/access/transam/transam.c
+++ b/src/backend/access/transam/transam.c
@@ -377,6 +377,9 @@ TransactionIdLatest(TransactionId mainxid,
  * Also, because we group transactions on the same clog page to conserve
  * storage, we might return the LSN of a later transaction that falls into
  * the same group.
+ *
+ * XXX: Now that we have the CSN-log, should we use that during recovery? Or
+ * rename this function to reduce confusion.
  */
 XLogRecPtr
 TransactionIdGetCommitLSN(TransactionId xid)
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 49be1df91c..8729ce2054 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -77,6 +77,7 @@
 #include <unistd.h>
 
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/htup_details.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
@@ -1959,20 +1960,13 @@ restoreTwoPhaseData(void)
  * Our other responsibility is to determine and return the oldest valid XID
  * among the prepared xacts (if none, return TransamVariables->nextXid).
  * This is needed to synchronize pg_subtrans startup properly.
- *
- * If xids_p and nxids_p are not NULL, pointer to a palloc'd array of all
- * top-level xids is stored in *xids_p. The number of entries in the array
- * is returned in *nxids_p.
  */
 TransactionId
-PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
+PrescanPreparedTransactions(void)
 {
 	FullTransactionId nextXid = TransamVariables->nextXid;
 	TransactionId origNextXid = XidFromFullTransactionId(nextXid);
 	TransactionId result = origNextXid;
-	TransactionId *xids = NULL;
-	int			nxids = 0;
-	int			allocsize = 0;
 	int			i;
 
 	LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);
@@ -2000,34 +1994,10 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
 		if (TransactionIdPrecedes(xid, result))
 			result = xid;
 
-		if (xids_p)
-		{
-			if (nxids == allocsize)
-			{
-				if (nxids == 0)
-				{
-					allocsize = 10;
-					xids = palloc(allocsize * sizeof(TransactionId));
-				}
-				else
-				{
-					allocsize = allocsize * 2;
-					xids = repalloc(xids, allocsize * sizeof(TransactionId));
-				}
-			}
-			xids[nxids++] = xid;
-		}
-
 		pfree(buf);
 	}
 	LWLockRelease(TwoPhaseStateLock);
 
-	if (xids_p)
-	{
-		*xids_p = xids;
-		*nxids_p = nxids;
-	}
-
 	return result;
 }
 
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index cfe8c6cf8d..b074423654 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -15,6 +15,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 1eccb78ddc..cab9edc48b 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -21,6 +21,7 @@
 #include <unistd.h>
 
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/multixact.h"
 #include "access/parallel.h"
 #include "access/subtrans.h"
@@ -209,7 +210,6 @@ typedef struct TransactionStateData
 	int			prevSecContext; /* previous SecurityRestrictionContext */
 	bool		prevXactReadOnly;	/* entry-time xact r/o state */
 	bool		startedInRecovery;	/* did we start in recovery? */
-	bool		didLogXid;		/* has xid been included in WAL record? */
 	int			parallelModeLevel;	/* Enter/ExitParallelMode counter */
 	bool		parallelChildXact;	/* is any parent transaction parallel? */
 	bool		chain;			/* start a new block after this one */
@@ -249,13 +249,6 @@ static TransactionStateData TopTransactionStateData = {
 	.topXidLogged = false,
 };
 
-/*
- * unreportedXids holds XIDs of all subtransactions that have not yet been
- * reported in an XLOG_XACT_ASSIGNMENT record.
- */
-static int	nUnreportedXids;
-static TransactionId unreportedXids[PGPROC_MAX_CACHED_SUBXIDS];
-
 static TransactionState CurrentTransactionState = &TopTransactionStateData;
 
 /*
@@ -531,18 +524,6 @@ GetCurrentFullTransactionIdIfAny(void)
 	return CurrentTransactionState->fullTransactionId;
 }
 
-/*
- *	MarkCurrentTransactionIdLoggedIfAny
- *
- * Remember that the current xid - if it is assigned - now has been wal logged.
- */
-void
-MarkCurrentTransactionIdLoggedIfAny(void)
-{
-	if (FullTransactionIdIsValid(CurrentTransactionState->fullTransactionId))
-		CurrentTransactionState->didLogXid = true;
-}
-
 /*
  * IsSubxactTopXidLogPending
  *
@@ -635,7 +616,6 @@ AssignTransactionId(TransactionState s)
 {
 	bool		isSubXact = (s->parent != NULL);
 	ResourceOwner currentOwner;
-	bool		log_unknown_top = false;
 
 	/* Assert that caller didn't screw up */
 	Assert(!FullTransactionIdIsValid(s->fullTransactionId));
@@ -679,20 +659,6 @@ AssignTransactionId(TransactionState s)
 		pfree(parents);
 	}
 
-	/*
-	 * When wal_level=logical, guarantee that a subtransaction's xid can only
-	 * be seen in the WAL stream if its toplevel xid has been logged before.
-	 * If necessary we log an xact_assignment record with fewer than
-	 * PGPROC_MAX_CACHED_SUBXIDS. Note that it is fine if didLogXid isn't set
-	 * for a transaction even though it appears in a WAL record, we just might
-	 * superfluously log something. That can happen when an xid is included
-	 * somewhere inside a wal record, but not in XLogRecord->xl_xid, like in
-	 * xl_standby_locks.
-	 */
-	if (isSubXact && XLogLogicalInfoActive() &&
-		!TopTransactionStateData.didLogXid)
-		log_unknown_top = true;
-
 	/*
 	 * Generate a new FullTransactionId and record its xid in PGPROC and
 	 * pg_subtrans.
@@ -728,59 +694,6 @@ AssignTransactionId(TransactionState s)
 	XactLockTableInsert(XidFromFullTransactionId(s->fullTransactionId));
 
 	CurrentResourceOwner = currentOwner;
-
-	/*
-	 * Every PGPROC_MAX_CACHED_SUBXIDS assigned transaction ids within each
-	 * top-level transaction we issue a WAL record for the assignment. We
-	 * include the top-level xid and all the subxids that have not yet been
-	 * reported using XLOG_XACT_ASSIGNMENT records.
-	 *
-	 * This is required to limit the amount of shared memory required in a hot
-	 * standby server to keep track of in-progress XIDs. See notes for
-	 * RecordKnownAssignedTransactionIds().
-	 *
-	 * We don't keep track of the immediate parent of each subxid, only the
-	 * top-level transaction that each subxact belongs to. This is correct in
-	 * recovery only because aborted subtransactions are separately WAL
-	 * logged.
-	 *
-	 * This is correct even for the case where several levels above us didn't
-	 * have an xid assigned as we recursed up to them beforehand.
-	 */
-	if (isSubXact && XLogStandbyInfoActive())
-	{
-		unreportedXids[nUnreportedXids] = XidFromFullTransactionId(s->fullTransactionId);
-		nUnreportedXids++;
-
-		/*
-		 * ensure this test matches similar one in
-		 * RecoverPreparedTransactions()
-		 */
-		if (nUnreportedXids >= PGPROC_MAX_CACHED_SUBXIDS ||
-			log_unknown_top)
-		{
-			xl_xact_assignment xlrec;
-
-			/*
-			 * xtop is always set by now because we recurse up transaction
-			 * stack to the highest unassigned xid and then come back down
-			 */
-			xlrec.xtop = GetTopTransactionId();
-			Assert(TransactionIdIsValid(xlrec.xtop));
-			xlrec.nsubxacts = nUnreportedXids;
-
-			XLogBeginInsert();
-			XLogRegisterData((char *) &xlrec, MinSizeOfXactAssignment);
-			XLogRegisterData((char *) unreportedXids,
-							 nUnreportedXids * sizeof(TransactionId));
-
-			(void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT);
-
-			nUnreportedXids = 0;
-			/* mark top, not current xact as having been logged */
-			TopTransactionStateData.didLogXid = true;
-		}
-	}
 }
 
 /*
@@ -1480,11 +1393,11 @@ RecordTransactionCommit(void)
 	 * temp tables will be lost anyway, unlogged tables will be truncated and
 	 * HOT pruning will be done again later. (Given the foregoing, you might
 	 * think that it would be unnecessary to emit the XLOG record at all in
-	 * this case, but we don't currently try to do that.  It would certainly
-	 * cause problems at least in Hot Standby mode, where the
-	 * KnownAssignedXids machinery requires tracking every XID assignment.  It
-	 * might be OK to skip it only when wal_level < replica, but for now we
-	 * don't.)
+	 * this case, but we don't currently try to do that.  It might cause
+	 * inefficiencies in Hot Standby mode, if nothing else, where the
+	 * commit/abort records allow advancing the xmin horizon for new
+	 * snapshots. It might be OK to skip it only when wal_level < replica, but
+	 * for now we don't.)
 	 *
 	 * However, if we're doing cleanup of any non-temp rels or committing any
 	 * command that wanted to force sync commit, then we must flush XLOG
@@ -1952,13 +1865,6 @@ AtSubAbort_childXids(void)
 	s->childXids = NULL;
 	s->nChildXids = 0;
 	s->maxChildXids = 0;
-
-	/*
-	 * We could prune the unreportedXids array here. But we don't bother. That
-	 * would potentially reduce number of XLOG_XACT_ASSIGNMENT records but it
-	 * would likely introduce more CPU time into the more common paths, so we
-	 * choose not to do that.
-	 */
 }
 
 /* ----------------------------------------------------------------
@@ -2141,12 +2047,6 @@ StartTransaction(void)
 	currentCommandId = FirstCommandId;
 	currentCommandIdUsed = false;
 
-	/*
-	 * initialize reported xid accounting
-	 */
-	nUnreportedXids = 0;
-	s->didLogXid = false;
-
 	/*
 	 * must initialize resource-management stuff first
 	 */
@@ -6142,7 +6042,7 @@ xact_redo_commit(xl_xact_parsed_commit *parsed,
 	TransactionTreeSetCommitTsData(xid, parsed->nsubxacts, parsed->subxacts,
 								   commit_time, origin_id);
 
-	if (standbyState == STANDBY_DISABLED)
+	if (!InHotStandby)
 	{
 		/*
 		 * Mark the transaction committed in pg_xact.
@@ -6162,6 +6062,12 @@ xact_redo_commit(xl_xact_parsed_commit *parsed,
 		 */
 		RecordKnownAssignedTransactionIds(max_xid);
 
+		/*
+		 * Mark the CSNLOG first.  The transaction won't become visible to new
+		 * snapshots until the call to ProcArrayRecoveryEndTransaction().
+		 */
+		CSNLogSetCSN(xid, parsed->nsubxacts, parsed->subxacts, lsn);
+
 		/*
 		 * Mark the transaction committed in pg_xact. We use async commit
 		 * protocol during recovery to provide information on database
@@ -6174,9 +6080,9 @@ xact_redo_commit(xl_xact_parsed_commit *parsed,
 		TransactionIdAsyncCommitTree(xid, parsed->nsubxacts, parsed->subxacts, lsn);
 
 		/*
-		 * We must mark clog before we update the ProcArray.
+		 * Make the commit visible to new snapshots in the ProcArray.
 		 */
-		ExpireTreeKnownAssignedTransactionIds(xid, parsed->nsubxacts, parsed->subxacts, max_xid);
+		ProcArrayRecoveryEndTransaction(max_xid, lsn);
 
 		/*
 		 * Send any cache invalidations attached to the commit. We must
@@ -6282,7 +6188,7 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid,
 								  parsed->subxacts);
 	AdvanceNextFullTransactionIdPastXid(max_xid);
 
-	if (standbyState == STANDBY_DISABLED)
+	if (!InHotStandby)
 	{
 		/* Mark the transaction aborted in pg_xact, no need for async stuff */
 		TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts);
@@ -6300,13 +6206,15 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid,
 		 */
 		RecordKnownAssignedTransactionIds(max_xid);
 
+		/* Note: we don't need to update the CSN log on abort. */
+
 		/* Mark the transaction aborted in pg_xact, no need for async stuff */
 		TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts);
 
 		/*
 		 * We must update the ProcArray after we have marked clog.
 		 */
-		ExpireTreeKnownAssignedTransactionIds(xid, parsed->nsubxacts, parsed->subxacts, max_xid);
+		ProcArrayRecoveryEndTransaction(max_xid, lsn);
 
 		/*
 		 * There are no invalidation messages to send or undo.
@@ -6414,14 +6322,6 @@ xact_redo(XLogReaderState *record)
 					   XLogRecGetOrigin(record));
 		LWLockRelease(TwoPhaseStateLock);
 	}
-	else if (info == XLOG_XACT_ASSIGNMENT)
-	{
-		xl_xact_assignment *xlrec = (xl_xact_assignment *) XLogRecGetData(record);
-
-		if (standbyState >= STANDBY_INITIALIZED)
-			ProcArrayApplyXidAssignment(xlrec->xtop,
-										xlrec->nsubxacts, xlrec->xsub);
-	}
 	else if (info == XLOG_XACT_INVALIDATIONS)
 	{
 		/*
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 6f58412bca..a3ba04fbc8 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -48,6 +48,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/heaptoast.h"
 #include "access/multixact.h"
 #include "access/rewriteheap.h"
@@ -951,8 +952,6 @@ XLogInsertRecord(XLogRecData *rdata,
 
 	END_CRIT_SECTION();
 
-	MarkCurrentTransactionIdLoggedIfAny();
-
 	/*
 	 * Mark top transaction id is logged (if needed) so that we should not try
 	 * to log it again with the next WAL record in the current subtransaction.
@@ -5182,6 +5181,7 @@ BootStrapXLOG(uint32 data_checksum_version)
 
 	/* Bootstrap the commit log, too */
 	BootStrapCLOG();
+	BootStrapCSNLog();
 	BootStrapCommitTs();
 	BootStrapSUBTRANS();
 	BootStrapMultiXact();
@@ -5783,16 +5783,16 @@ StartupXLOG(void)
 		 */
 		if (ArchiveRecoveryRequested && EnableHotStandby)
 		{
-			TransactionId *xids;
-			int			nxids;
+			FullTransactionId latestCompletedXid;
 
 			ereport(DEBUG1,
 					(errmsg_internal("initializing for hot standby")));
+			InHotStandby = true;
 
 			InitRecoveryTransactionEnvironment();
 
 			if (wasShutdown)
-				oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
+				oldestActiveXID = PrescanPreparedTransactions();
 			else
 				oldestActiveXID = checkPoint.oldestActiveXid;
 			Assert(TransactionIdIsValid(oldestActiveXID));
@@ -5807,39 +5807,17 @@ StartupXLOG(void)
 			 */
 			StartupSUBTRANS(oldestActiveXID);
 
-			/*
-			 * If we're beginning at a shutdown checkpoint, we know that
-			 * nothing was running on the primary at this point. So fake-up an
-			 * empty running-xacts record and use that here and now. Recover
-			 * additional standby state for prepared transactions.
-			 */
-			if (wasShutdown)
-			{
-				RunningTransactionsData running;
-				TransactionId latestCompletedXid;
+			latestCompletedXid = checkPoint.nextXid;
+			FullTransactionIdRetreat(&latestCompletedXid);
+			TransamVariables->latestCompletedXid = latestCompletedXid;
 
-				/* Update pg_subtrans entries for any prepared transactions */
-				StandbyRecoverPreparedTransactions();
+			StartupCSNLog(oldestActiveXID, RedoRecPtr);
 
-				/*
-				 * Construct a RunningTransactions snapshot representing a
-				 * shut down server, with only prepared transactions still
-				 * alive. We're never overflowed at this point because all
-				 * subxids are listed with their parent prepared transactions.
-				 */
-				running.xcnt = nxids;
-				running.subxcnt = 0;
-				running.subxid_status = SUBXIDS_IN_SUBTRANS;
-				running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
-				running.oldestRunningXid = oldestActiveXID;
-				latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
-				TransactionIdRetreat(latestCompletedXid);
-				Assert(TransactionIdIsNormal(latestCompletedXid));
-				running.latestCompletedXid = latestCompletedXid;
-				running.xids = xids;
-
-				ProcArrayApplyRecoveryInfo(&running);
-			}
+			ProcArrayUpdateOldestRunningXid(oldestActiveXID);
+
+			/* Update pg_subtrans entries for any prepared transactions */
+			if (wasShutdown)
+				StandbyRecoverPreparedTransactions();
 		}
 
 		/*
@@ -5923,7 +5901,7 @@ StartupXLOG(void)
 	 * This information is not quite needed yet, but it is positioned here so
 	 * as potential problems are detected before any on-disk change is done.
 	 */
-	oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
+	oldestActiveXID = PrescanPreparedTransactions();
 
 	/*
 	 * Allow ordinary WAL segment creation before possibly switching to a new
@@ -6089,9 +6067,18 @@ StartupXLOG(void)
 	 * Start up subtrans, if not already done for hot standby.  (commit
 	 * timestamps are started below, if necessary.)
 	 */
-	if (standbyState == STANDBY_DISABLED)
+	if (!InHotStandby)
+	{
 		StartupSUBTRANS(oldestActiveXID);
 
+		/*
+		 * TODO: we don't need to update CSN log from now on, but it's still
+		 * required by snapshots that were taken before recovery ended.  We
+		 * just let it be, but it would be nice to truncate it to 0 after all
+		 * the snapshots are gone.
+		 */
+	}
+
 	/*
 	 * Perform end of recovery actions for any SLRUs that need it.
 	 */
@@ -6177,12 +6164,12 @@ StartupXLOG(void)
 	 * Shutdown the recovery environment.  This must occur after
 	 * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
 	 * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as
-	 * any session building a snapshot will not rely on KnownAssignedXids as
+	 * any session building a snapshot will not rely on the CSN log as
 	 * RecoveryInProgress() would return false at this stage.  This is
 	 * particularly critical for prepared 2PC transactions, that would still
 	 * need to be included in snapshots once recovery has ended.
 	 */
-	if (standbyState != STANDBY_DISABLED)
+	if (InHotStandby)
 		ShutdownRecoveryTransactionEnvironment();
 
 	/*
@@ -6954,7 +6941,7 @@ CreateCheckPoint(int flags)
 	 * starting snapshot of locks and transactions.
 	 */
 	if (!shutdown && XLogStandbyInfoActive())
-		checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
+		checkPoint.oldestActiveXid = GetOldestActiveTransactionId(true);
 	else
 		checkPoint.oldestActiveXid = InvalidTransactionId;
 
@@ -7346,7 +7333,10 @@ CreateCheckPoint(int flags)
 	 * StartupSUBTRANS hasn't been called yet.
 	 */
 	if (!RecoveryInProgress())
+	{
 		TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
+		TruncateCSNLog(GetOldestTransactionIdConsideredRunning());
+	}
 
 	/* Real work is done; log and update stats. */
 	LogCheckpointEnd(false);
@@ -7519,6 +7509,7 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
 	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
 	CheckPointCLOG();
+	CheckPointCSNLog();
 	CheckPointCommitTs();
 	CheckPointSUBTRANS();
 	CheckPointMultiXact();
@@ -7815,7 +7806,10 @@ CreateRestartPoint(int flags)
 	 * this because StartupSUBTRANS hasn't been called yet.
 	 */
 	if (EnableHotStandby)
+	{
 		TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
+		TruncateCSNLog(GetOldestTransactionIdConsideredRunning());
+	}
 
 	/* Real work is done; log and update stats. */
 	LogCheckpointEnd(true);
@@ -8300,41 +8294,17 @@ xlog_redo(XLogReaderState *record)
 
 		/*
 		 * If we see a shutdown checkpoint, we know that nothing was running
-		 * on the primary at this point. So fake-up an empty running-xacts
-		 * record and use that here and now. Recover additional standby state
-		 * for prepared transactions.
+		 * on the primary at this point, except for prepared transactions.
 		 */
-		if (standbyState >= STANDBY_INITIALIZED)
+		if (InHotStandby)
 		{
-			TransactionId *xids;
-			int			nxids;
 			TransactionId oldestActiveXID;
-			TransactionId latestCompletedXid;
-			RunningTransactionsData running;
 
-			oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
+			oldestActiveXID = PrescanPreparedTransactions();
+			ProcArrayUpdateOldestRunningXid(oldestActiveXID);
 
 			/* Update pg_subtrans entries for any prepared transactions */
 			StandbyRecoverPreparedTransactions();
-
-			/*
-			 * Construct a RunningTransactions snapshot representing a shut
-			 * down server, with only prepared transactions still alive. We're
-			 * never overflowed at this point because all subxids are listed
-			 * with their parent prepared transactions.
-			 */
-			running.xcnt = nxids;
-			running.subxcnt = 0;
-			running.subxid_status = SUBXIDS_IN_SUBTRANS;
-			running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
-			running.oldestRunningXid = oldestActiveXID;
-			latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
-			TransactionIdRetreat(latestCompletedXid);
-			Assert(TransactionIdIsNormal(latestCompletedXid));
-			running.latestCompletedXid = latestCompletedXid;
-			running.xids = xids;
-
-			ProcArrayApplyRecoveryInfo(&running);
 		}
 
 		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
@@ -8398,6 +8368,16 @@ xlog_redo(XLogReaderState *record)
 								  checkPoint.oldestXid))
 			SetTransactionIdLimit(checkPoint.oldestXid,
 								  checkPoint.oldestXidDB);
+
+		/*
+		 * Remember the oldest XID that was running at the time.  Normally,
+		 * all transaction aborts and commits are WAL-logged, so our
+		 * oldestRunningXid value should be up-to-date, but if not, this
+		 * allows us to resynchronize.
+		 */
+		if (InHotStandby)
+			ProcArrayUpdateOldestRunningXid(checkPoint.oldestActiveXid);
+
 		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
 		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index c6994b7828..7b2475e4e2 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -1975,10 +1975,9 @@ ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *repl
 	SpinLockRelease(&XLogRecoveryCtl->info_lck);
 
 	/*
-	 * If we are attempting to enter Hot Standby mode, process XIDs we see
+	 * In Hot Standby mode, process XIDs we see
 	 */
-	if (standbyState >= STANDBY_INITIALIZED &&
-		TransactionIdIsValid(record->xl_xid))
+	if (InHotStandby && TransactionIdIsValid(record->xl_xid))
 		RecordKnownAssignedTransactionIds(record->xl_xid);
 
 	/*
@@ -2255,7 +2254,7 @@ CheckRecoveryConsistency(void)
 	 * run? If so, we can tell postmaster that the database is consistent now,
 	 * enabling connections.
 	 */
-	if (standbyState == STANDBY_SNAPSHOT_READY &&
+	if (InHotStandby &&
 		!LocalHotStandbyActive &&
 		reachedConsistency &&
 		IsUnderPostmaster)
@@ -3700,9 +3699,6 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 						elog(LOG, "waiting for WAL to become available at %X/%X",
 							 LSN_FORMAT_ARGS(RecPtr));
 
-						/* Do background tasks that might benefit us later. */
-						KnownAssignedTransactionIdsIdleMaintenance();
-
 						(void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
 										 WL_LATCH_SET | WL_TIMEOUT |
 										 WL_EXIT_ON_PM_DEATH,
@@ -3968,9 +3964,6 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 						streaming_reply_sent = true;
 					}
 
-					/* Do any background tasks that might benefit us later. */
-					KnownAssignedTransactionIdsIdleMaintenance();
-
 					/* Update pg_stat_recovery_prefetch before sleeping. */
 					XLogPrefetcherComputeStats(xlogprefetcher);
 
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 5295b85fe0..bf08c60e93 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -50,7 +50,7 @@ bool		ignore_invalid_pages = false;
 bool		InRecovery = false;
 
 /* Are we in Hot Standby mode? Only valid in startup process, see xlogutils.h */
-HotStandbyState standbyState = STANDBY_DISABLED;
+bool		InHotStandby = false;
 
 /*
  * During XLOG replay, we may see XLOG records for incremental updates of
diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c
index ef6f98ebcd..a975865fdd 100644
--- a/src/backend/postmaster/startup.c
+++ b/src/backend/postmaster/startup.c
@@ -203,7 +203,7 @@ static void
 StartupProcExit(int code, Datum arg)
 {
 	/* Shutdown the recovery environment */
-	if (standbyState != STANDBY_DISABLED)
+	if (InHotStandby)
 		ShutdownRecoveryTransactionEnvironment();
 }
 
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index e73576ad12..c4f9feed64 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -270,14 +270,6 @@ xact_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 				DecodeAbort(ctx, buf, &parsed, xid, two_phase);
 				break;
 			}
-		case XLOG_XACT_ASSIGNMENT:
-
-			/*
-			 * We assign subxact to the toplevel xact while processing each
-			 * record if required.  So, we don't need to do anything here. See
-			 * LogicalDecodingProcessRecord.
-			 */
-			break;
 		case XLOG_XACT_INVALIDATIONS:
 			{
 				TransactionId xid;
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index a6a4da3266..734865ce62 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -27,7 +27,7 @@
  * removed. This is achieved by using the replication slot mechanism.
  *
  * As the percentage of transactions modifying the catalog normally is fairly
- * small in comparisons to ones only manipulating user data, we keep track of
+ * small in comparison to ones only manipulating user data, we keep track of
  * the committed catalog modifying ones inside [xmin, xmax) instead of keeping
  * track of all running transactions like it's done in a normal snapshot. Note
  * that we're generally only looking at transactions that have acquired an
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 7783ba854f..49c2ced2d4 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -16,6 +16,7 @@
 
 #include "access/clog.h"
 #include "access/commit_ts.h"
+#include "access/csn_log.h"
 #include "access/multixact.h"
 #include "access/nbtree.h"
 #include "access/subtrans.h"
@@ -121,6 +122,7 @@ CalculateShmemSize(int *num_semaphores)
 	size = add_size(size, XLOGShmemSize());
 	size = add_size(size, XLogRecoveryShmemSize());
 	size = add_size(size, CLOGShmemSize());
+	size = add_size(size, CSNLogShmemSize());
 	size = add_size(size, CommitTsShmemSize());
 	size = add_size(size, SUBTRANSShmemSize());
 	size = add_size(size, TwoPhaseShmemSize());
@@ -285,6 +287,7 @@ CreateOrAttachShmemStructs(void)
 	XLogPrefetchShmemInit();
 	XLogRecoveryShmemInit();
 	CLOGShmemInit();
+	CSNLogShmemInit();
 	CommitTsShmemInit();
 	SUBTRANSShmemInit();
 	MultiXactShmemInit();
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 36610a1c7e..c82e8d8c43 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -19,20 +19,10 @@
  * myProcLocks lists.  They can be distinguished from regular backend PGPROCs
  * at need by checking for pid == 0.
  *
- * During hot standby, we also keep a list of XIDs representing transactions
- * that are known to be running on the primary (or more precisely, were running
- * as of the current point in the WAL stream).  This list is kept in the
- * KnownAssignedXids array, and is updated by watching the sequence of
- * arriving XIDs.  This is necessary because if we leave those XIDs out of
- * snapshots taken for standby queries, then they will appear to be already
- * complete, leading to MVCC failures.  Note that in hot standby, the PGPROC
- * array represents standby processes, which by definition are not running
- * transactions that have XIDs.
- *
- * It is perhaps possible for a backend on the primary to terminate without
- * writing an abort record for its transaction.  While that shouldn't really
- * happen, it would tie up KnownAssignedXids indefinitely, so we protect
- * ourselves by pruning the array when a valid list of running XIDs arrives.
+ * During hot standby, we don't have PGPROC entries representing transactions
+ * running in the primary.  Instead, each snapshot taken during recovery
+ * contains a Commit-Sequence Number (CSN), which is used to determine which
+ * XIDs are still considered as running by the snapshot.
  *
  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -47,6 +37,7 @@
 
 #include <signal.h>
 
+#include "access/csn_log.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/twophase.h"
@@ -73,22 +64,8 @@ typedef struct ProcArrayStruct
 	int			numProcs;		/* number of valid procs entries */
 	int			maxProcs;		/* allocated size of procs array */
 
-	/*
-	 * Known assigned XIDs handling
-	 */
-	int			maxKnownAssignedXids;	/* allocated size of array */
-	int			numKnownAssignedXids;	/* current # of valid entries */
-	int			tailKnownAssignedXids;	/* index of oldest valid element */
-	int			headKnownAssignedXids;	/* index of newest element, + 1 */
-
-	/*
-	 * Highest subxid that has been removed from KnownAssignedXids array to
-	 * prevent overflow; or InvalidTransactionId if none.  We track this for
-	 * similar reasons to tracking overflowing cached subxids in PGPROC
-	 * entries.  Must hold exclusive ProcArrayLock to change this, and shared
-	 * lock to read it.
-	 */
-	TransactionId lastOverflowedXid;
+	/* In recovery, oldest XID that could still be running in the primary */
+	TransactionId oldest_running_primary_xid;
 
 	/* oldest xmin of any replication slot */
 	TransactionId replication_slot_xmin;
@@ -99,6 +76,21 @@ typedef struct ProcArrayStruct
 	int			pgprocnos[FLEXIBLE_ARRAY_MEMBER];
 } ProcArrayStruct;
 
+#define PROCARRAY_MAXPROCS	(MaxBackends + max_prepared_xacts)
+
+/*
+ * TOTAL_MAX_CACHED_SUBXIDS is the total number of XIDs that fits in the proc
+ * array, as top XIDs and in the subxids caches.
+ *
+ * Local data structures are also created in various backends during
+ * GetSnapshotData(), TransactionIdIsInProgress() and
+ * GetRunningTransactionData(). All of the main structures created in those
+ * functions must be identically sized, since we may at times copy the whole
+ * of the data structures around.
+ */
+#define TOTAL_MAX_CACHED_SUBXIDS \
+	((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)
+
 /*
  * State for the GlobalVisTest* family of functions. Those functions can
  * e.g. be used to decide if a deleted row can be removed without violating
@@ -254,17 +246,6 @@ typedef enum GlobalVisHorizonKind
 	VISHORIZON_TEMP,
 } GlobalVisHorizonKind;
 
-/*
- * Reason codes for KnownAssignedXidsCompress().
- */
-typedef enum KAXCompressReason
-{
-	KAX_NO_SPACE,				/* need to free up space at array end */
-	KAX_PRUNE,					/* we just pruned old entries */
-	KAX_TRANSACTION_END,		/* we just committed/removed some XIDs */
-	KAX_STARTUP_PROCESS_IDLE,	/* startup process is about to sleep */
-} KAXCompressReason;
-
 
 static ProcArrayStruct *procArray;
 
@@ -278,17 +259,8 @@ static TransactionId cachedXidIsNotInProgress = InvalidTransactionId;
 /*
  * Bookkeeping for tracking emulated transactions in recovery
  */
-static TransactionId *KnownAssignedXids;
-static bool *KnownAssignedXidsValid;
 static TransactionId latestObservedXid = InvalidTransactionId;
 
-/*
- * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is
- * the highest xid that might still be running that we don't have in
- * KnownAssignedXids.
- */
-static TransactionId standbySnapshotPendingXmin;
-
 /*
  * State for visibility checks on different types of relations. See struct
  * GlobalVisState for details. As shared, catalog, normal and temporary
@@ -315,7 +287,7 @@ static long xc_by_my_xact = 0;
 static long xc_by_latest_xid = 0;
 static long xc_by_main_xid = 0;
 static long xc_by_child_xid = 0;
-static long xc_by_known_assigned = 0;
+static long xc_during_recovery = 0;
 static long xc_no_overflow = 0;
 static long xc_slow_answer = 0;
 
@@ -325,7 +297,7 @@ static long xc_slow_answer = 0;
 #define xc_by_latest_xid_inc()		(xc_by_latest_xid++)
 #define xc_by_main_xid_inc()		(xc_by_main_xid++)
 #define xc_by_child_xid_inc()		(xc_by_child_xid++)
-#define xc_by_known_assigned_inc()	(xc_by_known_assigned++)
+#define xc_during_recovery_inc()	(xc_during_recovery++)
 #define xc_no_overflow_inc()		(xc_no_overflow++)
 #define xc_slow_answer_inc()		(xc_slow_answer++)
 
@@ -338,28 +310,12 @@ static void DisplayXidCache(void);
 #define xc_by_latest_xid_inc()		((void) 0)
 #define xc_by_main_xid_inc()		((void) 0)
 #define xc_by_child_xid_inc()		((void) 0)
-#define xc_by_known_assigned_inc()	((void) 0)
+#define xc_during_recovery_inc()	((void) 0)
 #define xc_no_overflow_inc()		((void) 0)
 #define xc_slow_answer_inc()		((void) 0)
 #endif							/* XIDCACHE_DEBUG */
 
-/* Primitives for KnownAssignedXids array handling for standby */
-static void KnownAssignedXidsCompress(KAXCompressReason reason, bool haveLock);
-static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
-								 bool exclusive_lock);
-static bool KnownAssignedXidsSearch(TransactionId xid, bool remove);
-static bool KnownAssignedXidExists(TransactionId xid);
-static void KnownAssignedXidsRemove(TransactionId xid);
-static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
-										TransactionId *subxids);
-static void KnownAssignedXidsRemovePreceding(TransactionId removeXid);
-static int	KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax);
-static int	KnownAssignedXidsGetAndSetXmin(TransactionId *xarray,
-										   TransactionId *xmin,
-										   TransactionId xmax);
-static TransactionId KnownAssignedXidsGetOldestXmin(void);
-static void KnownAssignedXidsDisplay(int trace_level);
-static void KnownAssignedXidsReset(void);
+
 static inline void ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid);
 static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid);
 static void MaintainLatestCompletedXid(TransactionId latestXid);
@@ -383,31 +339,6 @@ ProcArrayShmemSize(void)
 	size = offsetof(ProcArrayStruct, pgprocnos);
 	size = add_size(size, mul_size(sizeof(int), PROCARRAY_MAXPROCS));
 
-	/*
-	 * During Hot Standby processing we have a data structure called
-	 * KnownAssignedXids, created in shared memory. Local data structures are
-	 * also created in various backends during GetSnapshotData(),
-	 * TransactionIdIsInProgress() and GetRunningTransactionData(). All of the
-	 * main structures created in those functions must be identically sized,
-	 * since we may at times copy the whole of the data structures around. We
-	 * refer to this size as TOTAL_MAX_CACHED_SUBXIDS.
-	 *
-	 * Ideally we'd only create this structure if we were actually doing hot
-	 * standby in the current run, but we don't know that yet at the time
-	 * shared memory is being set up.
-	 */
-#define TOTAL_MAX_CACHED_SUBXIDS \
-	((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)
-
-	if (EnableHotStandby)
-	{
-		size = add_size(size,
-						mul_size(sizeof(TransactionId),
-								 TOTAL_MAX_CACHED_SUBXIDS));
-		size = add_size(size,
-						mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS));
-	}
-
 	return size;
 }
 
@@ -434,31 +365,12 @@ ProcArrayShmemInit(void)
 		 */
 		procArray->numProcs = 0;
 		procArray->maxProcs = PROCARRAY_MAXPROCS;
-		procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS;
-		procArray->numKnownAssignedXids = 0;
-		procArray->tailKnownAssignedXids = 0;
-		procArray->headKnownAssignedXids = 0;
-		procArray->lastOverflowedXid = InvalidTransactionId;
 		procArray->replication_slot_xmin = InvalidTransactionId;
 		procArray->replication_slot_catalog_xmin = InvalidTransactionId;
 		TransamVariables->xactCompletionCount = 1;
 	}
 
 	allProcs = ProcGlobal->allProcs;
-
-	/* Create or attach to the KnownAssignedXids arrays too, if needed */
-	if (EnableHotStandby)
-	{
-		KnownAssignedXids = (TransactionId *)
-			ShmemInitStruct("KnownAssignedXids",
-							mul_size(sizeof(TransactionId),
-									 TOTAL_MAX_CACHED_SUBXIDS),
-							&found);
-		KnownAssignedXidsValid = (bool *)
-			ShmemInitStruct("KnownAssignedXidsValid",
-							mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS),
-							&found);
-	}
 }
 
 /*
@@ -1022,355 +934,35 @@ MaintainLatestCompletedXidRecovery(TransactionId latestXid)
 void
 ProcArrayInitRecovery(TransactionId initializedUptoXID)
 {
-	Assert(standbyState == STANDBY_INITIALIZED);
+	Assert(InHotStandby);
 	Assert(TransactionIdIsNormal(initializedUptoXID));
 
 	/*
-	 * we set latestObservedXid to the xid SUBTRANS has been initialized up
-	 * to, so we can extend it from that point onwards in
-	 * RecordKnownAssignedTransactionIds, and when we get consistent in
-	 * ProcArrayApplyRecoveryInfo().
+	 * we set latestObservedXid to the xid SUBTRANS and CSN log have been
+	 * initialized up to, so we can extend it from that point onwards whenever
+	 * we observe new XIDs.
 	 */
 	latestObservedXid = initializedUptoXID;
 	TransactionIdRetreat(latestObservedXid);
 }
 
 /*
- * ProcArrayApplyRecoveryInfo -- apply recovery info about xids
- *
- * Takes us through 3 states: Initialized, Pending and Ready.
- * Normal case is to go all the way to Ready straight away, though there
- * are atypical cases where we need to take it in steps.
- *
- * Use the data about running transactions on the primary to create the initial
- * state of KnownAssignedXids. We also use these records to regularly prune
- * KnownAssignedXids because we know it is possible that some transactions
- * with FATAL errors fail to write abort records, which could cause eventual
- * overflow.
- *
- * See comments for LogStandbySnapshot().
+ * Update oldest running XID from a checkpoint record. This allows truncating
+ * SUBTRANS and the CSN log.
  */
 void
-ProcArrayApplyRecoveryInfo(RunningTransactions running)
+ProcArrayUpdateOldestRunningXid(TransactionId oldestRunningXID)
 {
-	TransactionId *xids;
-	TransactionId advanceNextXid;
-	int			nxids;
-	int			i;
-
-	Assert(standbyState >= STANDBY_INITIALIZED);
-	Assert(TransactionIdIsValid(running->nextXid));
-	Assert(TransactionIdIsValid(running->oldestRunningXid));
-	Assert(TransactionIdIsNormal(running->latestCompletedXid));
-
-	/*
-	 * Remove stale transactions, if any.
-	 */
-	ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid);
-
-	/*
-	 * Adjust TransamVariables->nextXid before StandbyReleaseOldLocks(),
-	 * because we will need it up to date for accessing two-phase transactions
-	 * in StandbyReleaseOldLocks().
-	 */
-	advanceNextXid = running->nextXid;
-	TransactionIdRetreat(advanceNextXid);
-	AdvanceNextFullTransactionIdPastXid(advanceNextXid);
-	Assert(FullTransactionIdIsValid(TransamVariables->nextXid));
-
 	/*
 	 * Remove stale locks, if any.
 	 */
-	StandbyReleaseOldLocks(running->oldestRunningXid);
-
-	/*
-	 * If our snapshot is already valid, nothing else to do...
-	 */
-	if (standbyState == STANDBY_SNAPSHOT_READY)
-		return;
-
-	/*
-	 * If our initial RunningTransactionsData had an overflowed snapshot then
-	 * we knew we were missing some subxids from our snapshot. If we continue
-	 * to see overflowed snapshots then we might never be able to start up, so
-	 * we make another test to see if our snapshot is now valid. We know that
-	 * the missing subxids are equal to or earlier than nextXid. After we
-	 * initialise we continue to apply changes during recovery, so once the
-	 * oldestRunningXid is later than the nextXid from the initial snapshot we
-	 * know that we no longer have missing information and can mark the
-	 * snapshot as valid.
-	 */
-	if (standbyState == STANDBY_SNAPSHOT_PENDING)
-	{
-		/*
-		 * If the snapshot isn't overflowed or if its empty we can reset our
-		 * pending state and use this snapshot instead.
-		 */
-		if (running->subxid_status != SUBXIDS_MISSING || running->xcnt == 0)
-		{
-			/*
-			 * If we have already collected known assigned xids, we need to
-			 * throw them away before we apply the recovery snapshot.
-			 */
-			KnownAssignedXidsReset();
-			standbyState = STANDBY_INITIALIZED;
-		}
-		else
-		{
-			if (TransactionIdPrecedes(standbySnapshotPendingXmin,
-									  running->oldestRunningXid))
-			{
-				standbyState = STANDBY_SNAPSHOT_READY;
-				elog(DEBUG1,
-					 "recovery snapshots are now enabled");
-			}
-			else
-				elog(DEBUG1,
-					 "recovery snapshot waiting for non-overflowed snapshot or "
-					 "until oldest active xid on standby is at least %u (now %u)",
-					 standbySnapshotPendingXmin,
-					 running->oldestRunningXid);
-			return;
-		}
-	}
-
-	Assert(standbyState == STANDBY_INITIALIZED);
-
-	/*
-	 * NB: this can be reached at least twice, so make sure new code can deal
-	 * with that.
-	 */
+	StandbyReleaseOldLocks(oldestRunningXID);
 
-	/*
-	 * Nobody else is running yet, but take locks anyhow
-	 */
 	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-
-	/*
-	 * KnownAssignedXids is sorted so we cannot just add the xids, we have to
-	 * sort them first.
-	 *
-	 * Some of the new xids are top-level xids and some are subtransactions.
-	 * We don't call SubTransSetParent because it doesn't matter yet. If we
-	 * aren't overflowed then all xids will fit in snapshot and so we don't
-	 * need subtrans. If we later overflow, an xid assignment record will add
-	 * xids to subtrans. If RunningTransactionsData is overflowed then we
-	 * don't have enough information to correctly update subtrans anyway.
-	 */
-
-	/*
-	 * Allocate a temporary array to avoid modifying the array passed as
-	 * argument.
-	 */
-	xids = palloc(sizeof(TransactionId) * (running->xcnt + running->subxcnt));
-
-	/*
-	 * Add to the temp array any xids which have not already completed.
-	 */
-	nxids = 0;
-	for (i = 0; i < running->xcnt + running->subxcnt; i++)
-	{
-		TransactionId xid = running->xids[i];
-
-		/*
-		 * The running-xacts snapshot can contain xids that were still visible
-		 * in the procarray when the snapshot was taken, but were already
-		 * WAL-logged as completed. They're not running anymore, so ignore
-		 * them.
-		 */
-		if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
-			continue;
-
-		xids[nxids++] = xid;
-	}
-
-	if (nxids > 0)
-	{
-		if (procArray->numKnownAssignedXids != 0)
-		{
-			LWLockRelease(ProcArrayLock);
-			elog(ERROR, "KnownAssignedXids is not empty");
-		}
-
-		/*
-		 * Sort the array so that we can add them safely into
-		 * KnownAssignedXids.
-		 *
-		 * We have to sort them logically, because in KnownAssignedXidsAdd we
-		 * call TransactionIdFollowsOrEquals and so on. But we know these XIDs
-		 * come from RUNNING_XACTS, which means there are only normal XIDs
-		 * from the same epoch, so this is safe.
-		 */
-		qsort(xids, nxids, sizeof(TransactionId), xidLogicalComparator);
-
-		/*
-		 * Add the sorted snapshot into KnownAssignedXids.  The running-xacts
-		 * snapshot may include duplicated xids because of prepared
-		 * transactions, so ignore them.
-		 */
-		for (i = 0; i < nxids; i++)
-		{
-			if (i > 0 && TransactionIdEquals(xids[i - 1], xids[i]))
-			{
-				elog(DEBUG1,
-					 "found duplicated transaction %u for KnownAssignedXids insertion",
-					 xids[i]);
-				continue;
-			}
-			KnownAssignedXidsAdd(xids[i], xids[i], true);
-		}
-
-		KnownAssignedXidsDisplay(DEBUG3);
-	}
-
-	pfree(xids);
-
-	/*
-	 * latestObservedXid is at least set to the point where SUBTRANS was
-	 * started up to (cf. ProcArrayInitRecovery()) or to the biggest xid
-	 * RecordKnownAssignedTransactionIds() was called for.  Initialize
-	 * subtrans from thereon, up to nextXid - 1.
-	 *
-	 * We need to duplicate parts of RecordKnownAssignedTransactionId() here,
-	 * because we've just added xids to the known assigned xids machinery that
-	 * haven't gone through RecordKnownAssignedTransactionId().
-	 */
-	Assert(TransactionIdIsNormal(latestObservedXid));
-	TransactionIdAdvance(latestObservedXid);
-	while (TransactionIdPrecedes(latestObservedXid, running->nextXid))
-	{
-		ExtendSUBTRANS(latestObservedXid);
-		TransactionIdAdvance(latestObservedXid);
-	}
-	TransactionIdRetreat(latestObservedXid);	/* = running->nextXid - 1 */
-
-	/* ----------
-	 * Now we've got the running xids we need to set the global values that
-	 * are used to track snapshots as they evolve further.
-	 *
-	 * - latestCompletedXid which will be the xmax for snapshots
-	 * - lastOverflowedXid which shows whether snapshots overflow
-	 * - nextXid
-	 *
-	 * If the snapshot overflowed, then we still initialise with what we know,
-	 * but the recovery snapshot isn't fully valid yet because we know there
-	 * are some subxids missing. We don't know the specific subxids that are
-	 * missing, so conservatively assume the last one is latestObservedXid.
-	 * ----------
-	 */
-	if (running->subxid_status == SUBXIDS_MISSING)
-	{
-		standbyState = STANDBY_SNAPSHOT_PENDING;
-
-		standbySnapshotPendingXmin = latestObservedXid;
-		procArray->lastOverflowedXid = latestObservedXid;
-	}
-	else
-	{
-		standbyState = STANDBY_SNAPSHOT_READY;
-
-		standbySnapshotPendingXmin = InvalidTransactionId;
-
-		/*
-		 * If the 'xids' array didn't include all subtransactions, we have to
-		 * mark any snapshots taken as overflowed.
-		 */
-		if (running->subxid_status == SUBXIDS_IN_SUBTRANS)
-			procArray->lastOverflowedXid = latestObservedXid;
-		else
-		{
-			Assert(running->subxid_status == SUBXIDS_IN_ARRAY);
-			procArray->lastOverflowedXid = InvalidTransactionId;
-		}
-	}
-
-	/*
-	 * If a transaction wrote a commit record in the gap between taking and
-	 * logging the snapshot then latestCompletedXid may already be higher than
-	 * the value from the snapshot, so check before we use the incoming value.
-	 * It also might not yet be set at all.
-	 */
-	MaintainLatestCompletedXidRecovery(running->latestCompletedXid);
-
-	/*
-	 * NB: No need to increment TransamVariables->xactCompletionCount here,
-	 * nobody can see it yet.
-	 */
-
+	procArray->oldest_running_primary_xid = oldestRunningXID;
 	LWLockRelease(ProcArrayLock);
-
-	KnownAssignedXidsDisplay(DEBUG3);
-	if (standbyState == STANDBY_SNAPSHOT_READY)
-		elog(DEBUG1, "recovery snapshots are now enabled");
-	else
-		elog(DEBUG1,
-			 "recovery snapshot waiting for non-overflowed snapshot or "
-			 "until oldest active xid on standby is at least %u (now %u)",
-			 standbySnapshotPendingXmin,
-			 running->oldestRunningXid);
 }
 
-/*
- * ProcArrayApplyXidAssignment
- *		Process an XLOG_XACT_ASSIGNMENT WAL record
- */
-void
-ProcArrayApplyXidAssignment(TransactionId topxid,
-							int nsubxids, TransactionId *subxids)
-{
-	TransactionId max_xid;
-	int			i;
-
-	Assert(standbyState >= STANDBY_INITIALIZED);
-
-	max_xid = TransactionIdLatest(topxid, nsubxids, subxids);
-
-	/*
-	 * Mark all the subtransactions as observed.
-	 *
-	 * NOTE: This will fail if the subxid contains too many previously
-	 * unobserved xids to fit into known-assigned-xids. That shouldn't happen
-	 * as the code stands, because xid-assignment records should never contain
-	 * more than PGPROC_MAX_CACHED_SUBXIDS entries.
-	 */
-	RecordKnownAssignedTransactionIds(max_xid);
-
-	/*
-	 * Notice that we update pg_subtrans with the top-level xid, rather than
-	 * the parent xid. This is a difference between normal processing and
-	 * recovery, yet is still correct in all cases. The reason is that
-	 * subtransaction commit is not marked in clog until commit processing, so
-	 * all aborted subtransactions have already been clearly marked in clog.
-	 * As a result we are able to refer directly to the top-level
-	 * transaction's state rather than skipping through all the intermediate
-	 * states in the subtransaction tree. This should be the first time we
-	 * have attempted to SubTransSetParent().
-	 */
-	for (i = 0; i < nsubxids; i++)
-		SubTransSetParent(subxids[i], topxid);
-
-	/* KnownAssignedXids isn't maintained yet, so we're done for now */
-	if (standbyState == STANDBY_INITIALIZED)
-		return;
-
-	/*
-	 * Uses same locking as transaction commit
-	 */
-	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-
-	/*
-	 * Remove subxids from known-assigned-xacts.
-	 */
-	KnownAssignedXidsRemoveTree(InvalidTransactionId, nsubxids, subxids);
-
-	/*
-	 * Advance lastOverflowedXid to be at least the last of these subxids.
-	 */
-	if (TransactionIdPrecedes(procArray->lastOverflowedXid, max_xid))
-		procArray->lastOverflowedXid = max_xid;
-
-	LWLockRelease(ProcArrayLock);
-}
 
 /*
  * TransactionIdIsInProgress -- is given transaction running in some backend
@@ -1378,23 +970,24 @@ ProcArrayApplyXidAssignment(TransactionId topxid,
  * Aside from some shortcuts such as checking RecentXmin and our own Xid,
  * there are four possibilities for finding a running transaction:
  *
- * 1. The given Xid is a main transaction Id.  We will find this out cheaply
+ * 1. In Hot Standby mode, there are no transactions with XIDs active in the
+ * standby. Check pg_xact to see if the transaction is known to have committed
+ * or aborted; otherwise it is considered to be running.
+ *
+ * 2. The given Xid is a main transaction Id.  We will find this out cheaply
  * by looking at ProcGlobal->xids.
  *
- * 2. The given Xid is one of the cached subxact Xids in the PGPROC array.
+ * 3. The given Xid is one of the cached subxact Xids in the PGPROC array.
  * We can find this out cheaply too.
  *
- * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see
- * if the Xid is running on the primary.
- *
  * 4. Search the SubTrans tree to find the Xid's topmost parent, and then see
- * if that is running according to ProcGlobal->xids[] or KnownAssignedXids.
+ * if that is running according to ProcGlobal->xids[].
  * This is the slowest way, but sadly it has to be done always if the others
  * failed, unless we see that the cached subxact sets are complete (none have
  * overflowed).
  *
- * ProcArrayLock has to be held while we do 1, 2, 3.  If we save the top Xids
- * while doing 1 and 3, we can release the ProcArrayLock while we do 4.
+ * ProcArrayLock has to be held while we do 2 and 3.  If we save the top Xids
+ * while doing 2 and 3, we can release the ProcArrayLock while we do 4.
  * This buys back some concurrency (and we can't retrieve the main Xids from
  * ProcGlobal->xids[] again anyway; see GetNewTransactionId).
  */
@@ -1435,6 +1028,28 @@ TransactionIdIsInProgress(TransactionId xid)
 		return false;
 	}
 
+	/*
+	 * In hot standby mode, check pg_xact.
+	 *
+	 * With normal non-CSN snapshots, you must be careful to check
+	 * TransactionIdIsInProgress() before checking pg_xact, because a
+	 * transaction is marked as committed before it's removed from PGPROC. But
+	 * during recovery, we now use CSN snapshots so I think that's OK. See the
+	 * "NOTE" at the top of heapam_visibility.c.
+	 *
+	 * During recovery, the XID cannot be our own transaction, and the CSN
+	 * check handles subtransactions too, so we can skip the rest of the
+	 * function.
+	 */
+	if (RecoveryInProgress())
+	{
+		xc_during_recovery_inc();
+		if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
+			return false;
+		else
+			return true;
+	}
+
 	/*
 	 * Also, we can handle our own transaction (and subtransactions) without
 	 * any access to shared memory.
@@ -1451,12 +1066,7 @@ TransactionIdIsInProgress(TransactionId xid)
 	 */
 	if (xids == NULL)
 	{
-		/*
-		 * In hot standby mode, reserve enough space to hold all xids in the
-		 * known-assigned list. If we later finish recovery, we no longer need
-		 * the bigger array, but we don't bother to shrink it.
-		 */
-		int			maxxids = RecoveryInProgress() ? TOTAL_MAX_CACHED_SUBXIDS : arrayP->maxProcs;
+		int			maxxids = arrayP->maxProcs;
 
 		xids = (TransactionId *) malloc(maxxids * sizeof(TransactionId));
 		if (xids == NULL)
@@ -1551,33 +1161,6 @@ TransactionIdIsInProgress(TransactionId xid)
 			xids[nxids++] = pxid;
 	}
 
-	/*
-	 * Step 3: in hot standby mode, check the known-assigned-xids list.  XIDs
-	 * in the list must be treated as running.
-	 */
-	if (RecoveryInProgress())
-	{
-		/* none of the PGPROC entries should have XIDs in hot standby mode */
-		Assert(nxids == 0);
-
-		if (KnownAssignedXidExists(xid))
-		{
-			LWLockRelease(ProcArrayLock);
-			xc_by_known_assigned_inc();
-			return true;
-		}
-
-		/*
-		 * If the KnownAssignedXids overflowed, we have to check pg_subtrans
-		 * too.  Fetch all xids from KnownAssignedXids that are lower than
-		 * xid, since if xid is a subtransaction its parent will always have a
-		 * lower value.  Note we will collect both main and subXIDs here, but
-		 * there's no help for it.
-		 */
-		if (TransactionIdPrecedesOrEquals(xid, procArray->lastOverflowedXid))
-			nxids = KnownAssignedXidsGet(xids, xid);
-	}
-
 	LWLockRelease(ProcArrayLock);
 
 	/*
@@ -1851,8 +1434,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 		 * can't be tied to a specific database.)
 		 *
 		 * Also, while in recovery we cannot compute an accurate per-database
-		 * horizon, as all xids are managed via the KnownAssignedXids
-		 * machinery.
+		 * horizon, as all xids are managed via the CSN log machinery.
 		 */
 		if (proc->databaseId == MyDatabaseId ||
 			MyDatabaseId == InvalidOid ||
@@ -1865,11 +1447,14 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 	}
 
 	/*
-	 * If in recovery fetch oldest xid in KnownAssignedXids, will be applied
-	 * after lock is released.
+	 * If in recovery fetch oldest xid from last checkpoint.
+	 *
+	 * XXX: that can be much older than what we had previously with the
+	 * known-assigned-xids machinery. I think that's OK, given what this
+	 * function is used for during recovery?
 	 */
 	if (in_recovery)
-		kaxmin = KnownAssignedXidsGetOldestXmin();
+		kaxmin = procArray->oldest_running_primary_xid;
 
 	/*
 	 * No other information from shared state is needed, release the lock
@@ -2188,7 +1773,7 @@ GetSnapshotData(Snapshot snapshot)
 	int			mypgxactoff;
 	TransactionId myxid;
 	uint64		curXactCompletionCount;
-
+	XLogRecPtr	csn = InvalidXLogRecPtr;
 	TransactionId replication_slot_xmin = InvalidTransactionId;
 	TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
 
@@ -2368,27 +1953,8 @@ GetSnapshotData(Snapshot snapshot)
 	else
 	{
 		/*
-		 * We're in hot standby, so get XIDs from KnownAssignedXids.
-		 *
-		 * We store all xids directly into subxip[]. Here's why:
-		 *
-		 * In recovery we don't know which xids are top-level and which are
-		 * subxacts, a design choice that greatly simplifies xid processing.
-		 *
-		 * It seems like we would want to try to put xids into xip[] only, but
-		 * that is fairly small. We would either need to make that bigger or
-		 * to increase the rate at which we WAL-log xid assignment; neither is
-		 * an appealing choice.
-		 *
-		 * We could try to store xids into xip[] first and then into subxip[]
-		 * if there are too many xids. That only works if the snapshot doesn't
-		 * overflow because we do not search subxip[] in that case. A simpler
-		 * way is to just store all xids in the subxip array because this is
-		 * by far the bigger array. We just leave the xip array empty.
-		 *
-		 * Either way we need to change the way XidInMVCCSnapshot() works
-		 * depending upon when the snapshot was taken, or change normal
-		 * snapshot processing so it matches.
+		 * We're in hot standby, so get the current CSN. That's used to
+		 * determine which transactions committed before this snapshot.
 		 *
 		 * Note: It is possible for recovery to end before we finish taking
 		 * the snapshot, and for newly assigned transaction ids to be added to
@@ -2396,14 +1962,17 @@ GetSnapshotData(Snapshot snapshot)
 		 * those newly added transaction ids would be filtered away, so we
 		 * need not be concerned about them.
 		 */
-		subcount = KnownAssignedXidsGetAndSetXmin(snapshot->subxip, &xmin,
-												  xmax);
+		xmin = procArray->oldest_running_primary_xid;
 
-		if (TransactionIdPrecedesOrEquals(xmin, procArray->lastOverflowedXid))
-			suboverflowed = true;
+		/*
+		 * Take CSN under ProcArrayLock so the snapshot stays synchronized.
+		 * (XXX: not sure that's strictly required.)
+		 * This is what determines which transactions we consider finished and
+		 * which are still in progress.
+		 */
+		csn = TransamVariables->latestCommitLSN;
 	}
 
-
 	/*
 	 * Fetch into local variable while ProcArrayLock is held - the
 	 * LWLockRelease below is a barrier, ensuring this happens inside the
@@ -2519,6 +2088,8 @@ GetSnapshotData(Snapshot snapshot)
 	snapshot->lsn = InvalidXLogRecPtr;
 	snapshot->whenTaken = 0;
 
+	snapshot->snapshotCsn = csn;
+
 	return snapshot;
 }
 
@@ -2674,9 +2245,6 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc)
  * The returned data structure is statically allocated; caller should not
  * modify it, and must not assume it is valid past the next call.
  *
- * This is never executed during recovery so there is no need to look at
- * KnownAssignedXids.
- *
  * Dummy PGPROCs from prepared transaction are included, meaning that this
  * may return entries with duplicated TransactionId values coming from
  * transaction finishing to prepare.  Nothing is done about duplicated
@@ -2707,6 +2275,7 @@ GetRunningTransactionData(void)
 	int			subcount;
 	bool		suboverflowed;
 
+	/* This is never executed during recovery */
 	Assert(!RecoveryInProgress());
 
 	/*
@@ -2873,15 +2442,16 @@ GetRunningTransactionData(void)
  * We look at all databases, though there is no need to include WALSender
  * since this has no effect on hot standby conflicts.
  *
- * This is never executed during recovery so there is no need to look at
- * KnownAssignedXids.
+ * If allDbs is false, skip processes attached to other databases.
+ *
+ * This is never executed during recovery.
  *
  * We don't worry about updating other counters, we want to keep this as
  * simple as possible and leave GetSnapshotData() as the primary code for
  * that bookkeeping.
  */
 TransactionId
-GetOldestActiveTransactionId(void)
+GetOldestActiveTransactionId(bool allDbs)
 {
 	ProcArrayStruct *arrayP = procArray;
 	TransactionId *other_xids = ProcGlobal->xids;
@@ -2902,11 +2472,13 @@ GetOldestActiveTransactionId(void)
 	LWLockRelease(XidGenLock);
 
 	/*
-	 * Spin over procArray collecting all xids and subxids.
+	 * Spin over procArray checking each xid.
 	 */
 	LWLockAcquire(ProcArrayLock, LW_SHARED);
 	for (index = 0; index < arrayP->numProcs; index++)
 	{
+		int			pgprocno = arrayP->pgprocnos[index];
+		PGPROC	   *proc = &allProcs[pgprocno];
 		TransactionId xid;
 
 		/* Fetch xid just once - see GetNewTransactionId */
@@ -2915,6 +2487,9 @@ GetOldestActiveTransactionId(void)
 		if (!TransactionIdIsNormal(xid))
 			continue;
 
+		if (!allDbs && proc->databaseId != MyDatabaseId)
+			continue;
+
 		if (TransactionIdPrecedes(xid, oldestRunningXid))
 			oldestRunningXid = xid;
 
@@ -2993,8 +2568,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
 	 *
 	 * In recovery we can't lower the safe value besides what we've computed
 	 * above, so we'll have to wait a bit longer there. We unfortunately can
-	 * *not* use KnownAssignedXidsGetOldestXmin() since the KnownAssignedXids
-	 * machinery can miss values and return an older value than is safe.
+	 * *not* use oldest_running_primary_xid since the XID tracking machinery
+	 * can miss values and return an older value than is safe.
 	 */
 	if (!recovery_in_progress)
 	{
@@ -3412,6 +2987,9 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0,
  * but that would not be true in the case of FATAL errors lagging in array,
  * but we already know those are bogus anyway, so we skip that test.
  *
+ * XXX: KnownAssignedXids is gone so the above comment needs updating. Is
+ * the code still correct? I think so but need to double-check.
+ *
  * If dbOid is valid we skip backends attached to other databases.
  *
  * Be careful to *not* pfree the result from this function. We reuse
@@ -4083,14 +3661,14 @@ static void
 DisplayXidCache(void)
 {
 	fprintf(stderr,
-			"XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, nooflo: %ld, slow: %ld\n",
+			"XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, during_recovery: %ld, nooflo: %ld, slow: %ld\n",
 			xc_by_recent_xmin,
 			xc_by_known_xact,
 			xc_by_my_xact,
 			xc_by_latest_xid,
 			xc_by_main_xid,
 			xc_by_child_xid,
-			xc_by_known_assigned,
+			xc_during_recovery,
 			xc_no_overflow,
 			xc_slow_answer);
 }
@@ -4337,61 +3915,6 @@ FullXidRelativeTo(FullTransactionId rel, TransactionId xid)
 }
 
 
-/* ----------------------------------------------
- *		KnownAssignedTransactionIds sub-module
- * ----------------------------------------------
- */
-
-/*
- * In Hot Standby mode, we maintain a list of transactions that are (or were)
- * running on the primary at the current point in WAL.  These XIDs must be
- * treated as running by standby transactions, even though they are not in
- * the standby server's PGPROC array.
- *
- * We record all XIDs that we know have been assigned.  That includes all the
- * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have
- * been assigned.  We can deduce the existence of unobserved XIDs because we
- * know XIDs are assigned in sequence, with no gaps.  The KnownAssignedXids
- * list expands as new XIDs are observed or inferred, and contracts when
- * transaction completion records arrive.
- *
- * During hot standby we do not fret too much about the distinction between
- * top-level XIDs and subtransaction XIDs. We store both together in the
- * KnownAssignedXids list.  In backends, this is copied into snapshots in
- * GetSnapshotData(), taking advantage of the fact that XidInMVCCSnapshot()
- * doesn't care about the distinction either.  Subtransaction XIDs are
- * effectively treated as top-level XIDs and in the typical case pg_subtrans
- * links are *not* maintained (which does not affect visibility).
- *
- * We have room in KnownAssignedXids and in snapshots to hold maxProcs *
- * (1 + PGPROC_MAX_CACHED_SUBXIDS) XIDs, so every primary transaction must
- * report its subtransaction XIDs in a WAL XLOG_XACT_ASSIGNMENT record at
- * least every PGPROC_MAX_CACHED_SUBXIDS.  When we receive one of these
- * records, we mark the subXIDs as children of the top XID in pg_subtrans,
- * and then remove them from KnownAssignedXids.  This prevents overflow of
- * KnownAssignedXids and snapshots, at the cost that status checks for these
- * subXIDs will take a slower path through TransactionIdIsInProgress().
- * This means that KnownAssignedXids is not necessarily complete for subXIDs,
- * though it should be complete for top-level XIDs; this is the same situation
- * that holds with respect to the PGPROC entries in normal running.
- *
- * When we throw away subXIDs from KnownAssignedXids, we need to keep track of
- * that, similarly to tracking overflow of a PGPROC's subxids array.  We do
- * that by remembering the lastOverflowedXid, ie the last thrown-away subXID.
- * As long as that is within the range of interesting XIDs, we have to assume
- * that subXIDs are missing from snapshots.  (Note that subXID overflow occurs
- * on primary when 65th subXID arrives, whereas on standby it occurs when 64th
- * subXID arrives - that is not an error.)
- *
- * Should a backend on primary somehow disappear before it can write an abort
- * record, then we just leave those XIDs in KnownAssignedXids. They actually
- * aborted but we think they were running; the distinction is irrelevant
- * because either way any changes done by the transaction are not visible to
- * backends in the standby.  We prune KnownAssignedXids when
- * XLOG_RUNNING_XACTS arrives, to forestall possible overflow of the
- * array due to such dead XIDs.
- */
-
 /*
  * RecordKnownAssignedTransactionIds
  *		Record the given XID in KnownAssignedXids, as well as any preceding
@@ -4406,7 +3929,7 @@ FullXidRelativeTo(FullTransactionId rel, TransactionId xid)
 void
 RecordKnownAssignedTransactionIds(TransactionId xid)
 {
-	Assert(standbyState >= STANDBY_INITIALIZED);
+	Assert(InHotStandby);
 	Assert(TransactionIdIsValid(xid));
 	Assert(TransactionIdIsValid(latestObservedXid));
 
@@ -4424,38 +3947,19 @@ RecordKnownAssignedTransactionIds(TransactionId xid)
 
 		/*
 		 * Extend subtrans like we do in GetNewTransactionId() during normal
-		 * operation using individual extend steps. Note that we do not need
-		 * to extend clog since its extensions are WAL logged.
-		 *
-		 * This part has to be done regardless of standbyState since we
-		 * immediately start assigning subtransactions to their toplevel
-		 * transactions.
+		 * operation using individual extend steps; the CSN log is extended
+		 * the same way. Note that we do not need to extend clog since its
+		 * extensions are WAL logged.
 		 */
 		next_expected_xid = latestObservedXid;
 		while (TransactionIdPrecedes(next_expected_xid, xid))
 		{
 			TransactionIdAdvance(next_expected_xid);
 			ExtendSUBTRANS(next_expected_xid);
+			ExtendCSNLog(next_expected_xid);
 		}
 		Assert(next_expected_xid == xid);
 
-		/*
-		 * If the KnownAssignedXids machinery isn't up yet, there's nothing
-		 * more to do since we don't track assigned xids yet.
-		 */
-		if (standbyState <= STANDBY_INITIALIZED)
-		{
-			latestObservedXid = xid;
-			return;
-		}
-
-		/*
-		 * Add (latestObservedXid, xid] onto the KnownAssignedXids array.
-		 */
-		next_expected_xid = latestObservedXid;
-		TransactionIdAdvance(next_expected_xid);
-		KnownAssignedXidsAdd(next_expected_xid, xid, false);
-
 		/*
 		 * Now we can advance latestObservedXid
 		 */
@@ -4467,781 +3971,61 @@ RecordKnownAssignedTransactionIds(TransactionId xid)
 }
 
 /*
- * ExpireTreeKnownAssignedTransactionIds
- *		Remove the given XIDs from KnownAssignedXids.
+ * ProcArrayRecoveryEndTransaction
+ *
+ * Called during recovery in analogy with and in place of
+ * ProcArrayEndTransaction(). The transaction becomes visible to any new
+ * snapshots taken after this. 'max_xid' is the highest (sub)XID of the
+ * committed transaction, and 'lsn' is the LSN of the commit record.
  *
- * Called during recovery in analogy with and in place of ProcArrayEndTransaction()
+ * The transaction and all its subtransactions have been already marked as
+ * committed in the CLOG and in the CSNLOG.
  */
 void
-ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids,
-									  TransactionId *subxids, TransactionId max_xid)
+ProcArrayRecoveryEndTransaction(TransactionId max_xid, XLogRecPtr lsn)
 {
-	Assert(standbyState >= STANDBY_INITIALIZED);
+	TransactionId oldest_running_primary_xid;
+
+	Assert(InHotStandby);
+
+	/*
+	 * If this was the oldest XID that was still running, advance it.
+	 * This is important for advancing the global xmin, which avoids
+	 * unnecessary recovery conflicts.
+	 *
+	 * No locking required because this runs in the startup process.
+	 *
+	 * XXX: the caller actually has a list of XIDs that just committed. We
+	 * could save some clog lookups by taking advantage of that list.
+	 */
+	oldest_running_primary_xid = procArray->oldest_running_primary_xid;
+	while (oldest_running_primary_xid < max_xid)
+	{
+		if (!TransactionIdDidCommit(oldest_running_primary_xid) &&
+			!TransactionIdDidAbort(oldest_running_primary_xid))
+		{
+			break;
+		}
+		TransactionIdAdvance(oldest_running_primary_xid);
+	}
+	if (max_xid == oldest_running_primary_xid)
+		TransactionIdAdvance(oldest_running_primary_xid);
 
 	/*
 	 * Uses same locking as transaction commit
 	 */
 	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
 
-	KnownAssignedXidsRemoveTree(xid, nsubxids, subxids);
-
 	/* As in ProcArrayEndTransaction, advance latestCompletedXid */
 	MaintainLatestCompletedXidRecovery(max_xid);
 
 	/* ... and xactCompletionCount */
 	TransamVariables->xactCompletionCount++;
 
-	LWLockRelease(ProcArrayLock);
-}
-
-/*
- * ExpireAllKnownAssignedTransactionIds
- *		Remove all entries in KnownAssignedXids and reset lastOverflowedXid.
- */
-void
-ExpireAllKnownAssignedTransactionIds(void)
-{
-	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-	KnownAssignedXidsRemovePreceding(InvalidTransactionId);
-
-	/*
-	 * Reset lastOverflowedXid.  Currently, lastOverflowedXid has no use after
-	 * the call of this function.  But do this for unification with what
-	 * ExpireOldKnownAssignedTransactionIds() do.
-	 */
-	procArray->lastOverflowedXid = InvalidTransactionId;
-	LWLockRelease(ProcArrayLock);
-}
-
-/*
- * ExpireOldKnownAssignedTransactionIds
- *		Remove KnownAssignedXids entries preceding the given XID and
- *		potentially reset lastOverflowedXid.
- */
-void
-ExpireOldKnownAssignedTransactionIds(TransactionId xid)
-{
-	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-
-	/*
-	 * Reset lastOverflowedXid if we know all transactions that have been
-	 * possibly running are being gone.  Not doing so could cause an incorrect
-	 * lastOverflowedXid value, which makes extra snapshots be marked as
-	 * suboverflowed.
-	 */
-	if (TransactionIdPrecedes(procArray->lastOverflowedXid, xid))
-		procArray->lastOverflowedXid = InvalidTransactionId;
-	KnownAssignedXidsRemovePreceding(xid);
-	LWLockRelease(ProcArrayLock);
-}
-
-/*
- * KnownAssignedTransactionIdsIdleMaintenance
- *		Opportunistically do maintenance work when the startup process
- *		is about to go idle.
- */
-void
-KnownAssignedTransactionIdsIdleMaintenance(void)
-{
-	KnownAssignedXidsCompress(KAX_STARTUP_PROCESS_IDLE, false);
-}
-
-
-/*
- * Private module functions to manipulate KnownAssignedXids
- *
- * There are 5 main uses of the KnownAssignedXids data structure:
- *
- *	* backends taking snapshots - all valid XIDs need to be copied out
- *	* backends seeking to determine presence of a specific XID
- *	* startup process adding new known-assigned XIDs
- *	* startup process removing specific XIDs as transactions end
- *	* startup process pruning array when special WAL records arrive
- *
- * This data structure is known to be a hot spot during Hot Standby, so we
- * go to some lengths to make these operations as efficient and as concurrent
- * as possible.
- *
- * The XIDs are stored in an array in sorted order --- TransactionIdPrecedes
- * order, to be exact --- to allow binary search for specific XIDs.  Note:
- * in general TransactionIdPrecedes would not provide a total order, but
- * we know that the entries present at any instant should not extend across
- * a large enough fraction of XID space to wrap around (the primary would
- * shut down for fear of XID wrap long before that happens).  So it's OK to
- * use TransactionIdPrecedes as a binary-search comparator.
- *
- * It's cheap to maintain the sortedness during insertions, since new known
- * XIDs are always reported in XID order; we just append them at the right.
- *
- * To keep individual deletions cheap, we need to allow gaps in the array.
- * This is implemented by marking array elements as valid or invalid using
- * the parallel boolean array KnownAssignedXidsValid[].  A deletion is done
- * by setting KnownAssignedXidsValid[i] to false, *without* clearing the
- * XID entry itself.  This preserves the property that the XID entries are
- * sorted, so we can do binary searches easily.  Periodically we compress
- * out the unused entries; that's much cheaper than having to compress the
- * array immediately on every deletion.
- *
- * The actually valid items in KnownAssignedXids[] and KnownAssignedXidsValid[]
- * are those with indexes tail <= i < head; items outside this subscript range
- * have unspecified contents.  When head reaches the end of the array, we
- * force compression of unused entries rather than wrapping around, since
- * allowing wraparound would greatly complicate the search logic.  We maintain
- * an explicit tail pointer so that pruning of old XIDs can be done without
- * immediately moving the array contents.  In most cases only a small fraction
- * of the array contains valid entries at any instant.
- *
- * Although only the startup process can ever change the KnownAssignedXids
- * data structure, we still need interlocking so that standby backends will
- * not observe invalid intermediate states.  The convention is that backends
- * must hold shared ProcArrayLock to examine the array.  To remove XIDs from
- * the array, the startup process must hold ProcArrayLock exclusively, for
- * the usual transactional reasons (compare commit/abort of a transaction
- * during normal running).  Compressing unused entries out of the array
- * likewise requires exclusive lock.  To add XIDs to the array, we just insert
- * them into slots to the right of the head pointer and then advance the head
- * pointer.  This doesn't require any lock at all, but on machines with weak
- * memory ordering, we need to be careful that other processors see the array
- * element changes before they see the head pointer change.  We handle this by
- * using memory barriers when reading or writing the head/tail pointers (unless
- * the caller holds ProcArrayLock exclusively).
- *
- * Algorithmic analysis:
- *
- * If we have a maximum of M slots, with N XIDs currently spread across
- * S elements then we have N <= S <= M always.
- *
- *	* Adding a new XID is O(1) and needs no lock (unless compression must
- *		happen)
- *	* Compressing the array is O(S) and requires exclusive lock
- *	* Removing an XID is O(logS) and requires exclusive lock
- *	* Taking a snapshot is O(S) and requires shared lock
- *	* Checking for an XID is O(logS) and requires shared lock
- *
- * In comparison, using a hash table for KnownAssignedXids would mean that
- * taking snapshots would be O(M). If we can maintain S << M then the
- * sorted array technique will deliver significantly faster snapshots.
- * If we try to keep S too small then we will spend too much time compressing,
- * so there is an optimal point for any workload mix. We use a heuristic to
- * decide when to compress the array, though trimming also helps reduce
- * frequency of compressing. The heuristic requires us to track the number of
- * currently valid XIDs in the array (N).  Except in special cases, we'll
- * compress when S >= 2N.  Bounding S at 2N in turn bounds the time for
- * taking a snapshot to be O(N), which it would have to be anyway.
- */
-
-
-/*
- * Compress KnownAssignedXids by shifting valid data down to the start of the
- * array, removing any gaps.
- *
- * A compression step is forced if "reason" is KAX_NO_SPACE, otherwise
- * we do it only if a heuristic indicates it's a good time to do it.
- *
- * Compression requires holding ProcArrayLock in exclusive mode.
- * Caller must pass haveLock = true if it already holds the lock.
- */
-static void
-KnownAssignedXidsCompress(KAXCompressReason reason, bool haveLock)
-{
-	ProcArrayStruct *pArray = procArray;
-	int			head,
-				tail,
-				nelements;
-	int			compress_index;
-	int			i;
-
-	/* Counters for compression heuristics */
-	static unsigned int transactionEndsCounter;
-	static TimestampTz lastCompressTs;
-
-	/* Tuning constants */
-#define KAX_COMPRESS_FREQUENCY 128	/* in transactions */
-#define KAX_COMPRESS_IDLE_INTERVAL 1000 /* in ms */
-
-	/*
-	 * Since only the startup process modifies the head/tail pointers, we
-	 * don't need a lock to read them here.
-	 */
-	head = pArray->headKnownAssignedXids;
-	tail = pArray->tailKnownAssignedXids;
-	nelements = head - tail;
-
-	/*
-	 * If we can choose whether to compress, use a heuristic to avoid
-	 * compressing too often or not often enough.  "Compress" here simply
-	 * means moving the values to the beginning of the array, so it is not as
-	 * complex or costly as typical data compression algorithms.
-	 */
-	if (nelements == pArray->numKnownAssignedXids)
-	{
-		/*
-		 * When there are no gaps between head and tail, don't bother to
-		 * compress, except in the KAX_NO_SPACE case where we must compress to
-		 * create some space after the head.
-		 */
-		if (reason != KAX_NO_SPACE)
-			return;
-	}
-	else if (reason == KAX_TRANSACTION_END)
-	{
-		/*
-		 * Consider compressing only once every so many commits.  Frequency
-		 * determined by benchmarks.
-		 */
-		if ((transactionEndsCounter++) % KAX_COMPRESS_FREQUENCY != 0)
-			return;
-
-		/*
-		 * Furthermore, compress only if the used part of the array is less
-		 * than 50% full (see comments above).
-		 */
-		if (nelements < 2 * pArray->numKnownAssignedXids)
-			return;
-	}
-	else if (reason == KAX_STARTUP_PROCESS_IDLE)
-	{
-		/*
-		 * We're about to go idle for lack of new WAL, so we might as well
-		 * compress.  But not too often, to avoid ProcArray lock contention
-		 * with readers.
-		 */
-		if (lastCompressTs != 0)
-		{
-			TimestampTz compress_after;
-
-			compress_after = TimestampTzPlusMilliseconds(lastCompressTs,
-														 KAX_COMPRESS_IDLE_INTERVAL);
-			if (GetCurrentTimestamp() < compress_after)
-				return;
-		}
-	}
-
-	/* Need to compress, so get the lock if we don't have it. */
-	if (!haveLock)
-		LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-
-	/*
-	 * We compress the array by reading the valid values from tail to head,
-	 * re-aligning data to 0th element.
-	 */
-	compress_index = 0;
-	for (i = tail; i < head; i++)
-	{
-		if (KnownAssignedXidsValid[i])
-		{
-			KnownAssignedXids[compress_index] = KnownAssignedXids[i];
-			KnownAssignedXidsValid[compress_index] = true;
-			compress_index++;
-		}
-	}
-	Assert(compress_index == pArray->numKnownAssignedXids);
-
-	pArray->tailKnownAssignedXids = 0;
-	pArray->headKnownAssignedXids = compress_index;
-
-	if (!haveLock)
-		LWLockRelease(ProcArrayLock);
-
-	/* Update timestamp for maintenance.  No need to hold lock for this. */
-	lastCompressTs = GetCurrentTimestamp();
-}
-
-/*
- * Add xids into KnownAssignedXids at the head of the array.
- *
- * xids from from_xid to to_xid, inclusive, are added to the array.
- *
- * If exclusive_lock is true then caller already holds ProcArrayLock in
- * exclusive mode, so we need no extra locking here.  Else caller holds no
- * lock, so we need to be sure we maintain sufficient interlocks against
- * concurrent readers.  (Only the startup process ever calls this, so no need
- * to worry about concurrent writers.)
- */
-static void
-KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
-					 bool exclusive_lock)
-{
-	ProcArrayStruct *pArray = procArray;
-	TransactionId next_xid;
-	int			head,
-				tail;
-	int			nxids;
-	int			i;
-
-	Assert(TransactionIdPrecedesOrEquals(from_xid, to_xid));
-
-	/*
-	 * Calculate how many array slots we'll need.  Normally this is cheap; in
-	 * the unusual case where the XIDs cross the wrap point, we do it the hard
-	 * way.
-	 */
-	if (to_xid >= from_xid)
-		nxids = to_xid - from_xid + 1;
-	else
-	{
-		nxids = 1;
-		next_xid = from_xid;
-		while (TransactionIdPrecedes(next_xid, to_xid))
-		{
-			nxids++;
-			TransactionIdAdvance(next_xid);
-		}
-	}
-
-	/*
-	 * Since only the startup process modifies the head/tail pointers, we
-	 * don't need a lock to read them here.
-	 */
-	head = pArray->headKnownAssignedXids;
-	tail = pArray->tailKnownAssignedXids;
-
-	Assert(head >= 0 && head <= pArray->maxKnownAssignedXids);
-	Assert(tail >= 0 && tail < pArray->maxKnownAssignedXids);
-
-	/*
-	 * Verify that insertions occur in TransactionId sequence.  Note that even
-	 * if the last existing element is marked invalid, it must still have a
-	 * correctly sequenced XID value.
-	 */
-	if (head > tail &&
-		TransactionIdFollowsOrEquals(KnownAssignedXids[head - 1], from_xid))
-	{
-		KnownAssignedXidsDisplay(LOG);
-		elog(ERROR, "out-of-order XID insertion in KnownAssignedXids");
-	}
-
-	/*
-	 * If our xids won't fit in the remaining space, compress out free space
-	 */
-	if (head + nxids > pArray->maxKnownAssignedXids)
-	{
-		KnownAssignedXidsCompress(KAX_NO_SPACE, exclusive_lock);
-
-		head = pArray->headKnownAssignedXids;
-		/* note: we no longer care about the tail pointer */
-
-		/*
-		 * If it still won't fit then we're out of memory
-		 */
-		if (head + nxids > pArray->maxKnownAssignedXids)
-			elog(ERROR, "too many KnownAssignedXids");
-	}
-
-	/* Now we can insert the xids into the space starting at head */
-	next_xid = from_xid;
-	for (i = 0; i < nxids; i++)
-	{
-		KnownAssignedXids[head] = next_xid;
-		KnownAssignedXidsValid[head] = true;
-		TransactionIdAdvance(next_xid);
-		head++;
-	}
-
-	/* Adjust count of number of valid entries */
-	pArray->numKnownAssignedXids += nxids;
-
-	/*
-	 * Now update the head pointer.  We use a write barrier to ensure that
-	 * other processors see the above array updates before they see the head
-	 * pointer change.  The barrier isn't required if we're holding
-	 * ProcArrayLock exclusively.
-	 */
-	if (!exclusive_lock)
-		pg_write_barrier();
-
-	pArray->headKnownAssignedXids = head;
-}
-
-/*
- * KnownAssignedXidsSearch
- *
- * Searches KnownAssignedXids for a specific xid and optionally removes it.
- * Returns true if it was found, false if not.
- *
- * Caller must hold ProcArrayLock in shared or exclusive mode.
- * Exclusive lock must be held for remove = true.
- */
-static bool
-KnownAssignedXidsSearch(TransactionId xid, bool remove)
-{
-	ProcArrayStruct *pArray = procArray;
-	int			first,
-				last;
-	int			head;
-	int			tail;
-	int			result_index = -1;
-
-	tail = pArray->tailKnownAssignedXids;
-	head = pArray->headKnownAssignedXids;
-
-	/*
-	 * Only the startup process removes entries, so we don't need the read
-	 * barrier in that case.
-	 */
-	if (!remove)
-		pg_read_barrier();		/* pairs with KnownAssignedXidsAdd */
-
-	/*
-	 * Standard binary search.  Note we can ignore the KnownAssignedXidsValid
-	 * array here, since even invalid entries will contain sorted XIDs.
-	 */
-	first = tail;
-	last = head - 1;
-	while (first <= last)
-	{
-		int			mid_index;
-		TransactionId mid_xid;
-
-		mid_index = (first + last) / 2;
-		mid_xid = KnownAssignedXids[mid_index];
-
-		if (xid == mid_xid)
-		{
-			result_index = mid_index;
-			break;
-		}
-		else if (TransactionIdPrecedes(xid, mid_xid))
-			last = mid_index - 1;
-		else
-			first = mid_index + 1;
-	}
-
-	if (result_index < 0)
-		return false;			/* not in array */
-
-	if (!KnownAssignedXidsValid[result_index])
-		return false;			/* in array, but invalid */
-
-	if (remove)
-	{
-		KnownAssignedXidsValid[result_index] = false;
-
-		pArray->numKnownAssignedXids--;
-		Assert(pArray->numKnownAssignedXids >= 0);
-
-		/*
-		 * If we're removing the tail element then advance tail pointer over
-		 * any invalid elements.  This will speed future searches.
-		 */
-		if (result_index == tail)
-		{
-			tail++;
-			while (tail < head && !KnownAssignedXidsValid[tail])
-				tail++;
-			if (tail >= head)
-			{
-				/* Array is empty, so we can reset both pointers */
-				pArray->headKnownAssignedXids = 0;
-				pArray->tailKnownAssignedXids = 0;
-			}
-			else
-			{
-				pArray->tailKnownAssignedXids = tail;
-			}
-		}
-	}
-
-	return true;
-}
-
-/*
- * Is the specified XID present in KnownAssignedXids[]?
- *
- * Caller must hold ProcArrayLock in shared or exclusive mode.
- */
-static bool
-KnownAssignedXidExists(TransactionId xid)
-{
-	Assert(TransactionIdIsValid(xid));
-
-	return KnownAssignedXidsSearch(xid, false);
-}
-
-/*
- * Remove the specified XID from KnownAssignedXids[].
- *
- * Caller must hold ProcArrayLock in exclusive mode.
- */
-static void
-KnownAssignedXidsRemove(TransactionId xid)
-{
-	Assert(TransactionIdIsValid(xid));
-
-	elog(DEBUG4, "remove KnownAssignedXid %u", xid);
-
-	/*
-	 * Note: we cannot consider it an error to remove an XID that's not
-	 * present.  We intentionally remove subxact IDs while processing
-	 * XLOG_XACT_ASSIGNMENT, to avoid array overflow.  Then those XIDs will be
-	 * removed again when the top-level xact commits or aborts.
-	 *
-	 * It might be possible to track such XIDs to distinguish this case from
-	 * actual errors, but it would be complicated and probably not worth it.
-	 * So, just ignore the search result.
-	 */
-	(void) KnownAssignedXidsSearch(xid, true);
-}
-
-/*
- * KnownAssignedXidsRemoveTree
- *		Remove xid (if it's not InvalidTransactionId) and all the subxids.
- *
- * Caller must hold ProcArrayLock in exclusive mode.
- */
-static void
-KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
-							TransactionId *subxids)
-{
-	int			i;
-
-	if (TransactionIdIsValid(xid))
-		KnownAssignedXidsRemove(xid);
-
-	for (i = 0; i < nsubxids; i++)
-		KnownAssignedXidsRemove(subxids[i]);
-
-	/* Opportunistically compress the array */
-	KnownAssignedXidsCompress(KAX_TRANSACTION_END, true);
-}
-
-/*
- * Prune KnownAssignedXids up to, but *not* including xid. If xid is invalid
- * then clear the whole table.
- *
- * Caller must hold ProcArrayLock in exclusive mode.
- */
-static void
-KnownAssignedXidsRemovePreceding(TransactionId removeXid)
-{
-	ProcArrayStruct *pArray = procArray;
-	int			count = 0;
-	int			head,
-				tail,
-				i;
-
-	if (!TransactionIdIsValid(removeXid))
-	{
-		elog(DEBUG4, "removing all KnownAssignedXids");
-		pArray->numKnownAssignedXids = 0;
-		pArray->headKnownAssignedXids = pArray->tailKnownAssignedXids = 0;
-		return;
-	}
-
-	elog(DEBUG4, "prune KnownAssignedXids to %u", removeXid);
-
-	/*
-	 * Mark entries invalid starting at the tail.  Since array is sorted, we
-	 * can stop as soon as we reach an entry >= removeXid.
-	 */
-	tail = pArray->tailKnownAssignedXids;
-	head = pArray->headKnownAssignedXids;
-
-	for (i = tail; i < head; i++)
-	{
-		if (KnownAssignedXidsValid[i])
-		{
-			TransactionId knownXid = KnownAssignedXids[i];
-
-			if (TransactionIdFollowsOrEquals(knownXid, removeXid))
-				break;
-
-			if (!StandbyTransactionIdIsPrepared(knownXid))
-			{
-				KnownAssignedXidsValid[i] = false;
-				count++;
-			}
-		}
-	}
-
-	pArray->numKnownAssignedXids -= count;
-	Assert(pArray->numKnownAssignedXids >= 0);
-
-	/*
-	 * Advance the tail pointer if we've marked the tail item invalid.
-	 */
-	for (i = tail; i < head; i++)
-	{
-		if (KnownAssignedXidsValid[i])
-			break;
-	}
-	if (i >= head)
-	{
-		/* Array is empty, so we can reset both pointers */
-		pArray->headKnownAssignedXids = 0;
-		pArray->tailKnownAssignedXids = 0;
-	}
-	else
-	{
-		pArray->tailKnownAssignedXids = i;
-	}
-
-	/* Opportunistically compress the array */
-	KnownAssignedXidsCompress(KAX_PRUNE, true);
-}
-
-/*
- * KnownAssignedXidsGet - Get an array of xids by scanning KnownAssignedXids.
- * We filter out anything >= xmax.
- *
- * Returns the number of XIDs stored into xarray[].  Caller is responsible
- * that array is large enough.
- *
- * Caller must hold ProcArrayLock in (at least) shared mode.
- */
-static int
-KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax)
-{
-	TransactionId xtmp = InvalidTransactionId;
-
-	return KnownAssignedXidsGetAndSetXmin(xarray, &xtmp, xmax);
-}
-
-/*
- * KnownAssignedXidsGetAndSetXmin - as KnownAssignedXidsGet, plus
- * we reduce *xmin to the lowest xid value seen if not already lower.
- *
- * Caller must hold ProcArrayLock in (at least) shared mode.
- */
-static int
-KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
-							   TransactionId xmax)
-{
-	int			count = 0;
-	int			head,
-				tail;
-	int			i;
-
-	/*
-	 * Fetch head just once, since it may change while we loop. We can stop
-	 * once we reach the initially seen head, since we are certain that an xid
-	 * cannot enter and then leave the array while we hold ProcArrayLock.  We
-	 * might miss newly-added xids, but they should be >= xmax so irrelevant
-	 * anyway.
-	 */
-	tail = procArray->tailKnownAssignedXids;
-	head = procArray->headKnownAssignedXids;
-
-	pg_read_barrier();			/* pairs with KnownAssignedXidsAdd */
-
-	for (i = tail; i < head; i++)
-	{
-		/* Skip any gaps in the array */
-		if (KnownAssignedXidsValid[i])
-		{
-			TransactionId knownXid = KnownAssignedXids[i];
-
-			/*
-			 * Update xmin if required.  Only the first XID need be checked,
-			 * since the array is sorted.
-			 */
-			if (count == 0 &&
-				TransactionIdPrecedes(knownXid, *xmin))
-				*xmin = knownXid;
-
-			/*
-			 * Filter out anything >= xmax, again relying on sorted property
-			 * of array.
-			 */
-			if (TransactionIdIsValid(xmax) &&
-				TransactionIdFollowsOrEquals(knownXid, xmax))
-				break;
-
-			/* Add knownXid into output array */
-			xarray[count++] = knownXid;
-		}
-	}
-
-	return count;
-}
-
-/*
- * Get oldest XID in the KnownAssignedXids array, or InvalidTransactionId
- * if nothing there.
- */
-static TransactionId
-KnownAssignedXidsGetOldestXmin(void)
-{
-	int			head,
-				tail;
-	int			i;
-
-	/*
-	 * Fetch head just once, since it may change while we loop.
-	 */
-	tail = procArray->tailKnownAssignedXids;
-	head = procArray->headKnownAssignedXids;
-
-	pg_read_barrier();			/* pairs with KnownAssignedXidsAdd */
-
-	for (i = tail; i < head; i++)
-	{
-		/* Skip any gaps in the array */
-		if (KnownAssignedXidsValid[i])
-			return KnownAssignedXids[i];
-	}
-
-	return InvalidTransactionId;
-}
-
-/*
- * Display KnownAssignedXids to provide debug trail
- *
- * Currently this is only called within startup process, so we need no
- * special locking.
- *
- * Note this is pretty expensive, and much of the expense will be incurred
- * even if the elog message will get discarded.  It's not currently called
- * in any performance-critical places, however, so no need to be tenser.
- */
-static void
-KnownAssignedXidsDisplay(int trace_level)
-{
-	ProcArrayStruct *pArray = procArray;
-	StringInfoData buf;
-	int			head,
-				tail,
-				i;
-	int			nxids = 0;
-
-	tail = pArray->tailKnownAssignedXids;
-	head = pArray->headKnownAssignedXids;
-
-	initStringInfo(&buf);
-
-	for (i = tail; i < head; i++)
-	{
-		if (KnownAssignedXidsValid[i])
-		{
-			nxids++;
-			appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]);
-		}
-	}
-
-	elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s",
-		 nxids,
-		 pArray->numKnownAssignedXids,
-		 pArray->tailKnownAssignedXids,
-		 pArray->headKnownAssignedXids,
-		 buf.data);
-
-	pfree(buf.data);
-}
-
-/*
- * KnownAssignedXidsReset
- *		Resets KnownAssignedXids to be empty
- */
-static void
-KnownAssignedXidsReset(void)
-{
-	ProcArrayStruct *pArray = procArray;
-
-	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+	Assert(lsn > TransamVariables->latestCommitLSN);
+	TransamVariables->latestCommitLSN = lsn;
 
-	pArray->numKnownAssignedXids = 0;
-	pArray->tailKnownAssignedXids = 0;
-	pArray->headKnownAssignedXids = 0;
+	procArray->oldest_running_primary_xid = oldest_running_primary_xid;
 
 	LWLockRelease(ProcArrayLock);
 }
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 25267f0f85..e02c9ab842 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -139,8 +139,6 @@ InitRecoveryTransactionEnvironment(void)
 	vxid.procNumber = MyProcNumber;
 	vxid.localTransactionId = GetNextLocalTransactionId();
 	VirtualXactLockTableInsert(vxid);
-
-	standbyState = STANDBY_INITIALIZED;
 }
 
 /*
@@ -168,9 +166,6 @@ ShutdownRecoveryTransactionEnvironment(void)
 	if (RecoveryLockHash == NULL)
 		return;
 
-	/* Mark all tracked in-progress transactions as finished. */
-	ExpireAllKnownAssignedTransactionIds();
-
 	/* Release all locks the tracked transactions were holding */
 	StandbyReleaseAllLocks();
 
@@ -1167,7 +1162,7 @@ standby_redo(XLogReaderState *record)
 	Assert(!XLogRecHasAnyBlockRefs(record));
 
 	/* Do nothing if we're not in hot standby mode */
-	if (standbyState == STANDBY_DISABLED)
+	if (!InHotStandby)
 		return;
 
 	if (info == XLOG_STANDBY_LOCK)
@@ -1182,18 +1177,21 @@ standby_redo(XLogReaderState *record)
 	}
 	else if (info == XLOG_RUNNING_XACTS)
 	{
+		/*
+		 * XXX: running xacts records were previously used to update
+		 * known-assigned xids, but now we only need them for the logical
+		 * replication snapbuilder stuff, and for the
+		 * pgstat_report_stat(true) call below.
+		 */
 		xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
-		RunningTransactionsData running;
 
-		running.xcnt = xlrec->xcnt;
-		running.subxcnt = xlrec->subxcnt;
-		running.subxid_status = xlrec->subxid_overflow ? SUBXIDS_MISSING : SUBXIDS_IN_ARRAY;
-		running.nextXid = xlrec->nextXid;
-		running.latestCompletedXid = xlrec->latestCompletedXid;
-		running.oldestRunningXid = xlrec->oldestRunningXid;
-		running.xids = xlrec->xids;
-
-		ProcArrayApplyRecoveryInfo(&running);
+		/*
+		 * Remember the oldest XID that was running at the time. Normally, all
+		 * transaction aborts and commits are WAL-logged, so our
+		 * oldestRunningXid value should be up-to-date, but if not, this
+		 * allows us to resynchronize.
+		 */
+		ProcArrayUpdateOldestRunningXid(xlrec->oldestRunningXid);
 
 		/*
 		 * The startup process currently has no convenient way to schedule
@@ -1224,50 +1222,46 @@ standby_redo(XLogReaderState *record)
  *
  * This is used for Hot Standby as follows:
  *
- * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
- * start from a shutdown checkpoint because we know nothing was running
- * at that time and our recovery snapshot is known empty. In the more
- * typical case of an online checkpoint we need to jump through a few
- * hoops to get a correct recovery snapshot and this requires a two or
- * sometimes a three stage process.
+ * We can enter hot standby mode and start accepting read-only queries
+ * immediately at startup if we start from a shutdown checkpoint, because we
+ * know nothing was running at that time and our recovery snapshot is known
+ * empty. In the more typical case of an online checkpoint, the checkpoint
+ * record doesn't contain all the necessary information about running
+ * transaction state, and we need to jump through a few hoops to get a correct
+ * recovery snapshot.
  *
- * The initial snapshot must contain all running xids and all current
- * AccessExclusiveLocks at a point in time on the standby. Assembling
- * that information while the server is running requires many and
- * various LWLocks, so we choose to derive that information piece by
- * piece and then re-assemble that info on the standby. When that
- * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
+ * The initial snapshot must contain all current AccessExclusiveLocks at a
+ * point in time on the standby. Assembling that information while the server
+ * is running requires many and various LWLocks, so we choose to derive that
+ * information piece by piece and then re-assemble that info on the standby.
  *
- * Since locking on the primary when we derive the information is not
- * strict, we note that there is a time window between the derivation and
- * writing to WAL of the derived information. That allows race conditions
- * that we must resolve, since xids and locks may enter or leave the
- * snapshot during that window. This creates the issue that an xid or
- * lock may start *after* the snapshot has been derived yet *before* the
- * snapshot is logged in the running xacts WAL record. We resolve this by
- * starting to accumulate changes at a point just prior to when we derive
- * the snapshot on the primary, then ignore duplicates when we later apply
- * the snapshot from the running xacts record. This is implemented during
- * CreateCheckPoint() where we use the logical checkpoint location as
- * our starting point and then write the running xacts record immediately
- * before writing the main checkpoint WAL record. Since we always start
- * up from a checkpoint and are immediately at our starting point, we
- * unconditionally move to STANDBY_INITIALIZED. After this point we
- * must do 4 things:
+ * Since locking on the primary when we derive the information is not strict,
+ * there is a time window between the derivation and writing to WAL of the
+ * derived information. That allows race conditions that we must resolve,
+ * since xids and locks may enter or leave the snapshot during that
+ * window. This creates the issue that an xid or lock may start *after* the
+ * snapshot has been derived yet *before* the snapshot is logged in the
+ * running xacts WAL record. We resolve this by starting to accumulate changes
+ * at a point just prior to when we collect the lock information on the
+ * primary, then ignore duplicates when we later apply the snapshot from the
+ * running xacts record. This is implemented during CreateCheckPoint() where
+ * we use the logical checkpoint location as our starting point and then write
+ * the running xacts record immediately before writing the main checkpoint WAL
+ * record. Since we always start up from a checkpoint's redo pointer, we will
+ * always see a running-xacts record before reaching the checkpoint
+ * record, and can immediately enter hot standby mode. After this point we
+ * must do 3 things:
  *	* move shared nextXid forwards as we see new xids
  *	* extend the clog and subtrans with each new xid
- *	* keep track of uncommitted known assigned xids
  *	* keep track of uncommitted AccessExclusiveLocks
  *
- * When we see a commit/abort we must remove known assigned xids and locks
- * from the completing transaction. Attempted removals that cannot locate
- * an entry are expected and must not cause an error when we are in state
- * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
- * KnownAssignedXidsRemove().
- *
- * Later, when we apply the running xact data we must be careful to ignore
- * transactions already committed, since those commits raced ahead when
- * making WAL entries.
+ * When we see a commit/abort we must advance oldest_running_primary_xid and
+ * remove locks from the completing transaction. Attempted removals that
+ * cannot locate an entry are expected and must not cause an error until we
+ * have seen the running-xacts record. (We don't throw an error even after
+ * that, because whatever the reason was, after the transaction has completed
+ * the issue has already been resolved anyway.) This is implemented in
+ * StandbyReleaseLocks().
  *
  * For logical decoding only the running xacts information is needed;
  * there's no need to look at the locking information, but it's logged anyway,
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index db6ed784ab..60f93a39a4 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -130,6 +130,7 @@ static const char *const BuiltinTrancheNames[] = {
 	[LWTRANCHE_XACT_BUFFER] = "XactBuffer",
 	[LWTRANCHE_COMMITTS_BUFFER] = "CommitTsBuffer",
 	[LWTRANCHE_SUBTRANS_BUFFER] = "SubtransBuffer",
+	[LWTRANCHE_CSN_LOG_BUFFER] = "CsnLogBuffer",
 	[LWTRANCHE_MULTIXACTOFFSET_BUFFER] = "MultiXactOffsetBuffer",
 	[LWTRANCHE_MULTIXACTMEMBER_BUFFER] = "MultiXactMemberBuffer",
 	[LWTRANCHE_NOTIFY_BUFFER] = "NotifyBuffer",
@@ -166,6 +167,7 @@ static const char *const BuiltinTrancheNames[] = {
 	[LWTRANCHE_SUBTRANS_SLRU] = "SubtransSLRU",
 	[LWTRANCHE_XACT_SLRU] = "XactSLRU",
 	[LWTRANCHE_PARALLEL_VACUUM_DSA] = "ParallelVacuumDSA",
+	[LWTRANCHE_CSN_LOG_SLRU] = "CsnLogSLRU",
 };
 
 StaticAssertDecl(lengthof(BuiltinTrancheNames) ==
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 16144c2b72..aaceab7771 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -357,6 +357,7 @@ SerialControl	"Waiting to read or update shared <filename>pg_serial</filename> s
 XactBuffer	"Waiting for I/O on a transaction status SLRU buffer."
 CommitTsBuffer	"Waiting for I/O on a commit timestamp SLRU buffer."
 SubtransBuffer	"Waiting for I/O on a sub-transaction SLRU buffer."
+CsnLogBuffer	"Waiting for I/O on a CSN log SLRU buffer."
 MultiXactOffsetBuffer	"Waiting for I/O on a multixact offset SLRU buffer."
 MultiXactMemberBuffer	"Waiting for I/O on a multixact member SLRU buffer."
 NotifyBuffer	"Waiting for I/O on a <command>NOTIFY</command> message SLRU buffer."
diff --git a/src/backend/utils/probes.d b/src/backend/utils/probes.d
index d772544377..ffbfae84b8 100644
--- a/src/backend/utils/probes.d
+++ b/src/backend/utils/probes.d
@@ -77,6 +77,8 @@ provider postgresql {
 	probe clog__checkpoint__done(bool);
 	probe subtrans__checkpoint__start(bool);
 	probe subtrans__checkpoint__done(bool);
+	probe csnlog__checkpoint__start(bool);
+	probe csnlog__checkpoint__done(bool);
 	probe multixact__checkpoint__start(bool);
 	probe multixact__checkpoint__done(bool);
 	probe twophase__checkpoint__start();
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 7d2b34d4f2..da82def846 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -48,6 +48,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include "access/csn_log.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
@@ -201,6 +202,7 @@ typedef struct SerializedSnapshotData
 	CommandId	curcid;
 	TimestampTz whenTaken;
 	XLogRecPtr	lsn;
+	XLogRecPtr	snapshotCsn;
 } SerializedSnapshotData;
 
 /*
@@ -1729,6 +1731,7 @@ SerializeSnapshot(Snapshot snapshot, char *start_address)
 	serialized_snapshot.curcid = snapshot->curcid;
 	serialized_snapshot.whenTaken = snapshot->whenTaken;
 	serialized_snapshot.lsn = snapshot->lsn;
+	serialized_snapshot.snapshotCsn = snapshot->snapshotCsn;
 
 	/*
 	 * Ignore the SubXID array if it has overflowed, unless the snapshot was
@@ -1803,6 +1806,7 @@ RestoreSnapshot(char *start_address)
 	snapshot->curcid = serialized_snapshot.curcid;
 	snapshot->whenTaken = serialized_snapshot.whenTaken;
 	snapshot->lsn = serialized_snapshot.lsn;
+	snapshot->snapshotCsn = serialized_snapshot.snapshotCsn;
 	snapshot->snapXactCompletionCount = 0;
 
 	/* Copy XIDs, if present. */
@@ -1913,36 +1917,11 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
 	}
 	else
 	{
-		/*
-		 * In recovery we store all xids in the subxip array because it is by
-		 * far the bigger array, and we mostly don't know which xids are
-		 * top-level and which are subxacts. The xip array is empty.
-		 *
-		 * We start by searching subtrans, if we overflowed.
-		 */
-		if (snapshot->suboverflowed)
-		{
-			/*
-			 * Snapshot overflowed, so convert xid to top-level.  This is safe
-			 * because we eliminated too-old XIDs above.
-			 */
-			xid = SubTransGetTopmostTransaction(xid);
+		XLogRecPtr	csn = CSNLogGetCSNByXid(xid);
 
-			/*
-			 * If xid was indeed a subxact, we might now have an xid < xmin,
-			 * so recheck to avoid an array scan.  No point in rechecking
-			 * xmax.
-			 */
-			if (TransactionIdPrecedes(xid, snapshot->xmin))
-				return false;
-		}
-
-		/*
-		 * We now have either a top-level xid higher than xmin or an
-		 * indeterminate xid. We don't know whether it's top level or subxact
-		 * but it doesn't matter. If it's present, the xid is visible.
-		 */
-		if (pg_lfind32(xid, snapshot->subxip, snapshot->subxcnt))
+		if (csn != InvalidXLogRecPtr && csn <= snapshot->snapshotCsn)
+			return false;
+		else
 			return true;
 	}
 
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 9a91830783..dfe80eaa0d 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -249,7 +249,8 @@ static const char *const subdirs[] = {
 	"pg_xact",
 	"pg_logical",
 	"pg_logical/snapshots",
-	"pg_logical/mappings"
+	"pg_logical/mappings",
+	"pg_csn"
 };
 
 
diff --git a/src/include/access/csn_log.h b/src/include/access/csn_log.h
new file mode 100644
index 0000000000..f8cdf573ae
--- /dev/null
+++ b/src/include/access/csn_log.h
@@ -0,0 +1,30 @@
+/*
+ * csn_log.h
+ *
+ * Mapping from XID to commit record's LSN (Commit Sequence Number).
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/csn_log.h
+ */
+#ifndef CSNLOG_H
+#define CSNLOG_H
+
+#include "access/xlog.h"
+#include "utils/snapshot.h"
+
+extern void CSNLogSetCSN(TransactionId xid, int nsubxids,
+						 TransactionId *subxids, XLogRecPtr csn);
+extern XLogRecPtr CSNLogGetCSNByXid(TransactionId xid);
+
+extern Size CSNLogShmemSize(void);
+extern void CSNLogShmemInit(void);
+extern void BootStrapCSNLog(void);
+extern void StartupCSNLog(TransactionId oldestActiveXID, XLogRecPtr csn);
+extern void ShutdownCSNLog(void);
+extern void CheckPointCSNLog(void);
+extern void ExtendCSNLog(TransactionId newestXact);
+extern void TruncateCSNLog(TransactionId oldestXact);
+
+#endif							/* CSNLOG_H */
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index 28a2d287fd..a7054fe11c 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -238,6 +238,9 @@ typedef struct TransamVariablesData
 	FullTransactionId latestCompletedXid;	/* newest full XID that has
 											 * committed or aborted */
 
+	/* During recovery, LSN of latest replayed commit record */
+	XLogRecPtr	latestCommitLSN;
+
 	/*
 	 * Number of top-level transactions with xids (i.e. which may have
 	 * modified the database) that completed in some form since the start of
diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h
index b85b65c604..58ed0fc038 100644
--- a/src/include/access/twophase.h
+++ b/src/include/access/twophase.h
@@ -47,8 +47,7 @@ extern void StartPrepare(GlobalTransaction gxact);
 extern void EndPrepare(GlobalTransaction gxact);
 extern bool StandbyTransactionIdIsPrepared(TransactionId xid);
 
-extern TransactionId PrescanPreparedTransactions(TransactionId **xids_p,
-												 int *nxids_p);
+extern TransactionId PrescanPreparedTransactions(void);
 extern void StandbyRecoverPreparedTransactions(void);
 extern void RecoverPreparedTransactions(void);
 
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index fb64d7413a..240cbfd417 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -171,7 +171,7 @@ typedef struct SavedTransactionCharacteristics
 #define XLOG_XACT_ABORT				0x20
 #define XLOG_XACT_COMMIT_PREPARED	0x30
 #define XLOG_XACT_ABORT_PREPARED	0x40
-#define XLOG_XACT_ASSIGNMENT		0x50
+/* 0x50 is unused, was XLOG_XACT_ASSIGNMENT */
 #define XLOG_XACT_INVALIDATIONS		0x60
 /* free opcode 0x70 */
 
@@ -215,15 +215,6 @@ typedef struct SavedTransactionCharacteristics
 #define XactCompletionForceSyncCommit(xinfo) \
 	((xinfo & XACT_COMPLETION_FORCE_SYNC_COMMIT) != 0)
 
-typedef struct xl_xact_assignment
-{
-	TransactionId xtop;			/* assigned XID's top-level XID */
-	int			nsubxacts;		/* number of subtransaction XIDs */
-	TransactionId xsub[FLEXIBLE_ARRAY_MEMBER];	/* assigned subxids */
-} xl_xact_assignment;
-
-#define MinSizeOfXactAssignment offsetof(xl_xact_assignment, xsub)
-
 /*
  * Commit and abort records can contain a lot of information. But a large
  * portion of the records won't need all possible pieces of information. So we
@@ -448,7 +439,6 @@ extern FullTransactionId GetTopFullTransactionId(void);
 extern FullTransactionId GetTopFullTransactionIdIfAny(void);
 extern FullTransactionId GetCurrentFullTransactionId(void);
 extern FullTransactionId GetCurrentFullTransactionIdIfAny(void);
-extern void MarkCurrentTransactionIdLoggedIfAny(void);
 extern bool SubTransactionIsActive(SubTransactionId subxid);
 extern CommandId GetCurrentCommandId(bool used);
 extern void SetParallelStartTimestamps(TimestampTz xact_ts, TimestampTz stmt_ts);
diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h
index 20950ce033..19cb5f33bd 100644
--- a/src/include/access/xlogutils.h
+++ b/src/include/access/xlogutils.h
@@ -27,37 +27,10 @@ extern PGDLLIMPORT bool ignore_invalid_pages;
 extern PGDLLIMPORT bool InRecovery;
 
 /*
- * Like InRecovery, standbyState is only valid in the startup process.
- * In all other processes it will have the value STANDBY_DISABLED (so
- * InHotStandby will read as false).
- *
- * In DISABLED state, we're performing crash recovery or hot standby was
- * disabled in postgresql.conf.
- *
- * In INITIALIZED state, we've run InitRecoveryTransactionEnvironment, but
- * we haven't yet processed a RUNNING_XACTS or shutdown-checkpoint WAL record
- * to initialize our primary-transaction tracking system.
- *
- * When the transaction tracking is initialized, we enter the SNAPSHOT_PENDING
- * state. The tracked information might still be incomplete, so we can't allow
- * connections yet, but redo functions must update the in-memory state when
- * appropriate.
- *
- * In SNAPSHOT_READY mode, we have full knowledge of transactions that are
- * (or were) running on the primary at the current WAL location. Snapshots
- * can be taken, and read-only queries can be run.
+ * Like InRecovery, InHotStandby is only valid in the startup process.
+ * In all other processes it will be false.
  */
-typedef enum
-{
-	STANDBY_DISABLED,
-	STANDBY_INITIALIZED,
-	STANDBY_SNAPSHOT_PENDING,
-	STANDBY_SNAPSHOT_READY,
-} HotStandbyState;
-
-extern PGDLLIMPORT HotStandbyState standbyState;
-
-#define InHotStandby (standbyState >= STANDBY_SNAPSHOT_PENDING)
+extern PGDLLIMPORT bool InHotStandby;
 
 
 extern bool XLogHaveInvalidPages(void);
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index d70e6d37e0..c2156aca12 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -179,6 +179,7 @@ typedef enum BuiltinTrancheIds
 	LWTRANCHE_XACT_BUFFER = NUM_INDIVIDUAL_LWLOCKS,
 	LWTRANCHE_COMMITTS_BUFFER,
 	LWTRANCHE_SUBTRANS_BUFFER,
+	LWTRANCHE_CSN_LOG_BUFFER,
 	LWTRANCHE_MULTIXACTOFFSET_BUFFER,
 	LWTRANCHE_MULTIXACTMEMBER_BUFFER,
 	LWTRANCHE_NOTIFY_BUFFER,
@@ -215,6 +216,7 @@ typedef enum BuiltinTrancheIds
 	LWTRANCHE_SUBTRANS_SLRU,
 	LWTRANCHE_XACT_SLRU,
 	LWTRANCHE_PARALLEL_VACUUM_DSA,
+	LWTRANCHE_CSN_LOG_SLRU,
 	LWTRANCHE_FIRST_USER_DEFINED,
 }			BuiltinTrancheIds;
 
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index 56af0b40b3..de74fce24e 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -28,18 +28,11 @@ extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid);
 extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid);
 extern void ProcArrayClearTransaction(PGPROC *proc);
 
+extern void ProcArrayUpdateOldestRunningXid(TransactionId oldestRunningXID);
 extern void ProcArrayInitRecovery(TransactionId initializedUptoXID);
-extern void ProcArrayApplyRecoveryInfo(RunningTransactions running);
-extern void ProcArrayApplyXidAssignment(TransactionId topxid,
-										int nsubxids, TransactionId *subxids);
 
 extern void RecordKnownAssignedTransactionIds(TransactionId xid);
-extern void ExpireTreeKnownAssignedTransactionIds(TransactionId xid,
-												  int nsubxids, TransactionId *subxids,
-												  TransactionId max_xid);
-extern void ExpireAllKnownAssignedTransactionIds(void);
-extern void ExpireOldKnownAssignedTransactionIds(TransactionId xid);
-extern void KnownAssignedTransactionIdsIdleMaintenance(void);
+extern void ProcArrayRecoveryEndTransaction(TransactionId max_xid, XLogRecPtr lsn);
 
 extern int	GetMaxSnapshotXidCount(void);
 extern int	GetMaxSnapshotSubxidCount(void);
@@ -56,7 +49,7 @@ extern bool TransactionIdIsInProgress(TransactionId xid);
 extern bool TransactionIdIsActive(TransactionId xid);
 extern TransactionId GetOldestNonRemovableTransactionId(Relation rel);
 extern TransactionId GetOldestTransactionIdConsideredRunning(void);
-extern TransactionId GetOldestActiveTransactionId(void);
+extern TransactionId GetOldestActiveTransactionId(bool allDbs);
 extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
 extern void GetReplicationHorizons(TransactionId *xmin, TransactionId *catalog_xmin);
 
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index 8d1e31e888..1fda5b06f6 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -181,6 +181,13 @@ typedef struct SnapshotData
 	int32		subxcnt;		/* # of xact ids in subxip[] */
 	bool		suboverflowed;	/* has the subxip array overflowed? */
 
+	/*
+	 * MVCC snapshots taken during recovery use this CSN instead of the xip
+	 * and subxip arrays. Any transactions that committed at or before this
+	 * LSN are considered as visible.
+	 */
+	XLogRecPtr	snapshotCsn;
+
 	bool		takenDuringRecovery;	/* recovery-shaped snapshot? */
 	bool		copied;			/* false if it's a static snapshot */
 
-- 
2.39.5

